rm(list=ls(all=TRUE))

# generating a corpus example #####################################################################
corpus <- c("b","a","m","n","i","b","e","u","p","b","a","s","a","t","b","e","w","q","n","b",
            "c","a","g","a","b","e","s","t","a","b","a","g","h","a","b","e","a","a","t","b",
            "a","h","a","a","b","e","a","x","a","t")
corpus.parts <- paste0("p", rep(1:5, c(9,10,10,10,11)))

# dispersions1 ####################################################################################
## defining dispersions1
dispersions1 <- function(
   element,                      # the word/element for which the dispersion measures are desired
   corpus,                       # the whole corpus, one word/element per vector element
   corpus.parts,                 # a vector as long as corpus, indicating for each element which part it belongs to
   corpus.size.in.parts=NA,      # the number of corpus parts, n
   corpus.size.in.units=NA,      # the length/size of the corpus, l
   corpus.part.sizes.in.perc=NA, # the sizes of the corpus parts in percent, s
   with.distance.measures=FALSE) # whether to compute the distance-based measures (they can take a long time)
{
   # setting up functions
   sd.pop <- function (values) {
      sd(values)*sqrt((length(values)-1)/length(values))
   }
   Kullback.Leibler.div <- function (post.true, prior.theory) { # for KL-divergence
      logs <- log2(post.true/prior.theory); logs[logs==-Inf] <- 0
      return(sum(post.true*logs)) # could be normalized as 1-2^(-KLD)
   }
   min.distance.finder <- function (positions) { # for Washtell's nearest-neighbor measure
      qwe <- abs(outer(positions, positions, "-"))
      apply(qwe, 1, function (x) min(x[x>0]))
   }
   DA.uneq <- function (frequencies, corpus.part.sizes) {
      numerator.part.1 <- (1/(length(frequencies)*(length(frequencies)-1)/2))
      numerator.part.2 <- sum(abs(apply(combn(frequencies/corpus.part.sizes, 2), 2, diff)))
      denominator <- 2*mean(frequencies/corpus.part.sizes)
      return(1-((numerator.part.1*numerator.part.2)/denominator))
   }

   # setting up necessary values
   if (is.na(corpus.size.in.parts)) { corpus.size.in.parts <- length(unique(corpus.parts)) } # n
   if (is.na(corpus.size.in.units)) { corpus.size.in.units <- length(corpus) }               # l
   frequencies.of.element.in.corpus.parts.v <- as.numeric(
      frequencies.of.element.in.corpus.parts.df <- rowsum(
         as.numeric(wheres <- corpus==element), corpus.parts))                               # v
   frequency.of.element.in.corpus <- sum(frequencies.of.element.in.corpus.parts.v)           # f
   if (frequency.of.element.in.corpus<1) { return(NA) } # element not attested in the corpus
   if (is.na(corpus.part.sizes.in.perc[1])) {
      corpus.part.sizes.in.perc <- table(corpus.parts)/corpus.size.in.units }                # s
   proportions.of.element.in.corpus.parts <- frequencies.of.element.in.corpus.parts.v /
      (corpus.part.sizes.in.perc * corpus.size.in.units)                                     # p

   # compute the traditional measures
   values <- list()
   values[["frequency of element in corpus"]] <- frequency.of.element.in.corpus
   values[["range"]] <- sum(frequencies.of.element.in.corpus.parts.v>0)
   values[["maxmin"]] <- max(frequencies.of.element.in.corpus.parts.v)-min(frequencies.of.element.in.corpus.parts.v)
   values[["standard deviation (population)"]] <- sd.pop(frequencies.of.element.in.corpus.parts.v)
   values[["variation coefficient (population)"]] <- values[["standard deviation (population)"]]/mean(frequencies.of.element.in.corpus.parts.v)
   values[["chi-squared"]] <- sum(((frequencies.of.element.in.corpus.parts.v-(corpus.part.sizes.in.perc*frequency.of.element.in.corpus))^2)/(corpus.part.sizes.in.perc*frequency.of.element.in.corpus))
   values[["Juilland et al.'s D (for equally large corpus parts)"]] <- 1-(values[["variation coefficient (population)"]]/sqrt(corpus.size.in.parts-1))
values[["Juilland et al.'s D (for unequally large corpus parts)"]] <- 1-((sd.pop(proportions.of.element.in.corpus.parts)/mean(proportions.of.element.in.corpus.parts))/sqrt(corpus.size.in.parts-1)) values[["Rosengren's S (for equally large corpus parts)"]] <- ((sum(sqrt(frequencies.of.element.in.corpus.parts.v))^2)/corpus.size.in.parts)/frequency.of.element.in.corpus values[["Rosengren's S (for unequally large corpus parts)"]] <- sum(sqrt(frequencies.of.element.in.corpus.parts.v*corpus.part.sizes.in.perc))^2/frequency.of.element.in.corpus values[["Distributional consistency DC"]] <- ((sum(sqrt(frequencies.of.element.in.corpus.parts.v))/corpus.size.in.parts)^2)/mean(frequencies.of.element.in.corpus.parts.v) values[["Carroll's D2"]] <- -sum((temp.d2 <- proportions.of.element.in.corpus.parts/sum(proportions.of.element.in.corpus.parts))[temp.d2>0] * log2(temp.d2[temp.d2>0])) / log2(corpus.size.in.parts) values[["Inverse document frequency IDF"]] <- log2(corpus.size.in.parts/sum(frequencies.of.element.in.corpus.parts.v>0)) values[["Lyne's D3 (!= corpus parts)"]] <- 1 - values[["chi-squared"]] / (4*frequency.of.element.in.corpus) # adjusted for different file sizes; thanks to Mikkel Ekeland Paulsen values[["Gries's Deviation of Proportions DP"]] <- sum(abs((frequencies.of.element.in.corpus.parts.v/frequency.of.element.in.corpus)-corpus.part.sizes.in.perc))/2 values[["Gries's Deviation of Proportions (normalized) DPnorm"]] <- values[["Gries's Deviation of Proportions DP"]]/(1-min(corpus.part.sizes.in.perc)) values[["Kullback-Leibler divergence"]] <- Kullback.Leibler.div(frequencies.of.element.in.corpus.parts.v/frequency.of.element.in.corpus, corpus.part.sizes.in.perc) values[["Juilland et al.'s U (for equally large corpus parts)"]] <- frequency.of.element.in.corpus*values[["Juilland et al.'s D (for equally large corpus parts)"]] values[["Juilland et al.'s U (for unequally large corpus parts)"]] <- frequency.of.element.in.corpus*values[["Juilland et al.'s D (for unequally large corpus parts)"]] values[["Rosengren's Adj. Frequency (for equally large corpus parts)"]] <- (sum(sqrt(frequencies.of.element.in.corpus.parts.v))^2)/corpus.size.in.parts values[["Rosengren's Adj. 
Frequency (for unequally large corpus parts)"]] <- sum(sqrt(frequencies.of.element.in.corpus.parts.v*corpus.part.sizes.in.perc))^2 values[["Carroll's Um"]] <- (frequency.of.element.in.corpus*values[["Carroll's D2"]])+(1-values[["Carroll's D2"]])*(frequency.of.element.in.corpus/corpus.size.in.parts) values[["Engvall's measure"]] <- frequency.of.element.in.corpus*(sum(frequencies.of.element.in.corpus.parts.v>0)/corpus.size.in.parts) values[["Kromer's Ur"]] <- sum(digamma(frequencies.of.element.in.corpus.parts.v+1)+0.577215665) # C=0.577215665 # distance-based measures if (with.distance.measures) { # only do this when the distance measures are requested # Savický and Hlavácová wheres.which <- which(wheres) distances <- diff(c(0, wheres.which+(length(corpus)-max(wheres.which)))) segments <- rep(corpus.size.in.units/frequency.of.element.in.corpus, frequency.of.element.in.corpus) values[["Savický & Hlavácová's ARF / f_ARF"]] <- sum(pmin(distances, segments))/(corpus.size.in.units/frequency.of.element.in.corpus) values[["Savický & Hlavácová's AWT"]] <- 0.5*(1+(1/corpus.size.in.units*sum(distances^2))) values[["Savický & Hlavácová's f_AWT"]] <- corpus.size.in.units/((2*values[["Savický & Hlavácová's AWT"]])-1) # corpus.size.in.units^2/sum(distances^2) values[["Savický & Hlavácová's ALD"]] <- sum(distances*log10(distances))/corpus.size.in.units values[["Savický & Hlavácová's f_ALD"]] <- corpus.size.in.units*10^-values[["Savický & Hlavácová's ALD"]] # exp(-sum(distances/corpus.size.in.units*log(distances/corpus.size.in.units))) # Washtell denominator <- (2*frequency.of.element.in.corpus)/corpus.size.in.units candidates <- rownames(frequencies.of.element.in.corpus.parts.df)[frequencies.of.element.in.corpus.parts.v>1] if (length(candidates)>0) { g <- sum(frequencies.of.element.in.corpus.parts.v[frequencies.of.element.in.corpus.parts.v>1]) reduced.corpus <- corpus[corpus.parts %in% candidates] reduced.parts <- corpus.parts[corpus.parts %in% candidates] within.part.distances <- tapply(reduced.corpus, reduced.parts, function (x) which(x==element)) min.dists <- unlist(sapply(within.part.distances, min.distance.finder)) numerator <- (1/g) * sum(1/min.dists) values[["Washtell's Self Dispersion"]] <- numerator/denominator } else { values[["Washtell's Self Dispersion"]] <- NA } } return(values) } # dispersions2 #################################################################################### ## defining dispersions2 dispersions2 <- function ( frequencies.of.element.in.corpus.parts.v, # v = frequencies.of.element.in.corpus.parts.v corpus.part.sizes.in.perc=rep( # s = the corpus part sizes in % 1/length(frequencies.of.element.in.corpus.parts.v), length(frequencies.of.element.in.corpus.parts.v))) { # setting up functions sd.pop <- function (values) { sd(values)*sqrt((length(values)-1)/length(values)) } Kullback.Leibler.div <- function (post.true, prior.theory) { logs <- log2(post.true/prior.theory); logs[logs==-Inf] <- 0 return(sum(post.true*logs)) } # setting up necessary values # v = frequencies.of.element.in.corpus.parts.v = vector with frequencies of word/element in each corpus part # s = corpus.part.sizes.in.perc = the relative sizes of the parts of the corpus (in %) corpus.size.in.parts <- length(frequencies.of.element.in.corpus.parts.v) # n frequency.of.element.in.corpus <- sum(frequencies.of.element.in.corpus.parts.v) # f vv <- frequencies.of.element.in.corpus.parts.v/sum(frequencies.of.element.in.corpus.parts.v) values <- list() values[["frequency of element in corpus"]] <- 
frequency.of.element.in.corpus values[["range"]] <- sum(frequencies.of.element.in.corpus.parts.v>0) values[["maxmin"]] <- max(frequencies.of.element.in.corpus.parts.v)-min(frequencies.of.element.in.corpus.parts.v) values[["standard deviation (population)"]] <- sd.pop(frequencies.of.element.in.corpus.parts.v) values[["variation coefficient (population)"]] <- sd.pop(frequencies.of.element.in.corpus.parts.v)/mean(frequencies.of.element.in.corpus.parts.v) values[["chi-squared"]] <- sum(((frequencies.of.element.in.corpus.parts.v-(frequency.of.element.in.corpus*corpus.part.sizes.in.perc/sum(corpus.part.sizes.in.perc)))^2)/(frequency.of.element.in.corpus*corpus.part.sizes.in.perc/sum(corpus.part.sizes.in.perc))) values[["Juilland et al.'s D (for equally large corpus parts)"]] <- 1-((sd.pop(frequencies.of.element.in.corpus.parts.v)/mean(frequencies.of.element.in.corpus.parts.v))/sqrt(corpus.size.in.parts-1)) values[["Juilland et al.'s D (for unequally large corpus parts)"]] <- 1-((sd.pop(frequencies.of.element.in.corpus.parts.v/corpus.part.sizes.in.perc)/mean(frequencies.of.element.in.corpus.parts.v/corpus.part.sizes.in.perc))/sqrt(length(frequencies.of.element.in.corpus.parts.v/corpus.part.sizes.in.perc)-1)) values[["Rosengren's S (for equally large corpus parts)"]] <- ((sum(sqrt(frequencies.of.element.in.corpus.parts.v))^2)/corpus.size.in.parts)/frequency.of.element.in.corpus values[["Rosengren's S (for unequally large corpus parts)"]] <- sum(sqrt(frequencies.of.element.in.corpus.parts.v*corpus.part.sizes.in.perc))^2/frequency.of.element.in.corpus values[["Distributional consistency DC"]] <- ((sum(sqrt(frequencies.of.element.in.corpus.parts.v))/corpus.size.in.parts)^2)/mean(frequencies.of.element.in.corpus.parts.v) values[["Carroll's D2"]] <- -sum((temp.d2 <-(frequencies.of.element.in.corpus.parts.v/corpus.part.sizes.in.perc)/sum((frequencies.of.element.in.corpus.parts.v/corpus.part.sizes.in.perc)))[temp.d2>0] * log2(temp.d2[temp.d2>0])) / log2(corpus.size.in.parts) values[["Inverse document frequency IDF"]] <- log2(corpus.size.in.parts/sum(frequencies.of.element.in.corpus.parts.v>0)) values[["Lyne's D3 (!= corpus parts)"]] <- 1 - values[["chi-squared"]] / (4*frequency.of.element.in.corpus) # adjusted for different file sizes; thanks to Mikkel Ekeland Paulsen values[["Gries's Deviation of Proportions DP"]] <- sum(abs((frequencies.of.element.in.corpus.parts.v/frequency.of.element.in.corpus)-corpus.part.sizes.in.perc))/2 values[["Gries's Deviation of Proportions (normalized) DPnorm"]] <- (sum(abs((frequencies.of.element.in.corpus.parts.v/frequency.of.element.in.corpus)-corpus.part.sizes.in.perc))/2)/(1-min(corpus.part.sizes.in.perc)) # corrected values[["Kullback-Leibler divergence"]] <- Kullback.Leibler.div(frequencies.of.element.in.corpus.parts.v/sum(frequencies.of.element.in.corpus.parts.v), corpus.part.sizes.in.perc) values[["Juilland et al.'s U (for equally large corpus parts)"]] <- frequency.of.element.in.corpus*values[["Juilland et al.'s D (for equally large corpus parts)"]] values[["Juilland et al.'s U (for unequally large corpus parts)"]] <- frequency.of.element.in.corpus*values[["Juilland et al.'s D (for unequally large corpus parts)"]] values[["Rosengren's Adj. Frequency (for equally large corpus parts)"]] <- (sum(sqrt(frequencies.of.element.in.corpus.parts.v))^2)/corpus.size.in.parts values[["Rosengren's Adj. 
   values[["Carroll's Um"]] <- (frequency.of.element.in.corpus * values[["Carroll's D2"]]) + (1-values[["Carroll's D2"]])*(frequency.of.element.in.corpus/corpus.size.in.parts)
   values[["Engvall's measure"]] <- frequency.of.element.in.corpus*(sum(frequencies.of.element.in.corpus.parts.v>0)/corpus.size.in.parts)
   values[["Kromer's Ur"]] <- sum(digamma(frequencies.of.element.in.corpus.parts.v+1)+0.577215665) # C=0.577215665
   return(values)
}

# applying dispersions1 to medium-sized corpora (less than, say, the BNC) can be done
# with the whole corpus in memory like this:
## simplest application
dispersions1(element="a", corpus=corpus, corpus.parts=corpus.parts)
## simple batch application
cpsip <- table(corpus.parts)/length(corpus.parts) # compute corpus part sizes in %
all.dispersions <- sapply(
   names(sort(table(corpus))),        # all word types in the corpus
   dispersions1,                      # the function dispersions1
   corpus=corpus, corpus.parts=corpus.parts,
   corpus.size.in.parts=5, corpus.size.in.units=50,
   corpus.part.sizes.in.perc=cpsip)
all.dispersions               # for all words
all.dispersions[,"a"]         # for the word a
all.dispersions[,c("b", "e")] # for the words b and e

# applying dispersions2 to medium-sized corpora (less than, say, the BNC) can be done
# using tables of words and corpus parts like this:
words.in.parts <- table(corpus, corpus.parts)     # cross-tabulate words and corpus parts
cpsip <- table(corpus.parts)/length(corpus.parts) # compute corpus part sizes in %
## simplest application
dispersions2(
   words.in.parts["a",], # frequency of "a" in each corpus part
   cpsip)                # observed corpus part sizes in %
## simple batch application
all.dispersions <- apply(           # apply to
   words.in.parts,                  # the table words.in.parts
   1,                               # row-wise
   dispersions2,                    # the function dispersions2
   corpus.part.sizes.in.perc=cpsip) # corpus part sizes in %
round(all.dispersions <- sapply(all.dispersions, unlist), 3) # make this a nice matrix

# for (much) larger data sets, you may want to explore
## batch application with parSapply (e.g. from the package parallel, or using the packages foreach and doParallel) and/or
## processing much larger tables with table.integer64 (using the package bit64);
## a minimal parSapply sketch is given at the end of this script

# Because the program/data are free of charge, there is no warranty for them, to the extent
# permitted by applicable law. Except when otherwise stated in writing the copyright holders
# and/or other parties provide the program/data 'as is' without warranty of any kind, either
# expressed or implied, including, but not limited to, the implied warranties of merchantability
# and fitness for a particular purpose. The entire risk as to the quality and performance of the
# program is with you. Should the program/data prove defective, you assume the cost of all
# necessary servicing, repair or correction. In no event unless required by applicable law or
# agreed to in writing will any copyright holder, or any other party who may modify and/or
# redistribute the program/data as permitted above, be liable to you for damages, including any
# general, special, incidental or consequential damages arising out of the use or inability to
# use the program/data (including but not limited to loss of program/data or data being rendered
# inaccurate or losses sustained by you or third parties or a failure of the program/data to
# operate with any other programs), even if such holder or other party has been advised of the
# possibility of such damages.
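
# a minimal sketch of one possible parallel batch application, here with parSapply() from the
# base package parallel; the cluster size of 2 workers below is only an assumption -- adapt it
# (and the input objects) to your own machine and corpus:
library(parallel)
cl <- makeCluster(2)               # assumption: start 2 worker processes
all.dispersions.par <- parSapply(  # parallel counterpart of the serial sapply() call above
   cl,                             # the cluster of workers
   names(sort(table(corpus))),     # all word types, as in the serial batch application
   dispersions1,                   # the function dispersions1
   corpus=corpus,                  # further arguments are passed on to dispersions1
   corpus.parts=corpus.parts)
stopCluster(cl)                    # shut the worker processes down again
# (for a toy corpus of 50 tokens this is of course slower than the serial call;
# the pattern only pays off for large corpora with many word types)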