rm(list=ls(all=TRUE))
library(effects)
# preparation of data #############################################################################
dfer <- function(some.matrix) { # change a 2x2 matrix into a case-by-variable data frame
DF <- data.frame(
factor(rep(dimnames(some.matrix)[[1]], rowSums(some.matrix))),
factor(rep(rep(dimnames(some.matrix)[[2]], 2), t(some.matrix)))
); names(DF) <- names(dimnames(some.matrix))
return(DF)
}
# generate a 2x2 co=occurrence matrix representing Table 2 of the paper
addmargins(cooc <- matrix( # generate a matrix cooc (and also show its totals)
c(80, 687-80, 19, 137958), # of these frequencies (columnwise)
nrow=2, # making up 2 rows
dimnames=list(VERB=c("regard","all.other"), # use these row names & labels
CONSTR=c("aspred","all.other")))) # use these column names & labels
## CONSTR
## VERB aspred all.other Sum
## regard 80 19 99
## all.other 607 137958 138565
## Sum 687 137977 138664
# make into data frame with the case-by-variable format
summary(cbv <- dfer(cooc))
## VERB CONSTR
## all.other:138565 all.other:137977
## regard : 99 aspred : 687
head(cbv)
## VERB CONSTR
## 1 regard aspred
## 2 regard aspred
## 3 regard aspred
## 4 regard aspred
## 5 regard aspred
## 6 regard aspred
tail(cbv)
## VERB CONSTR
## 138659 all.other all.other
## 138660 all.other all.other
## 138661 all.other all.other
## 138662 all.other all.other
## 138663 all.other all.other
## 138664 all.other all.other
# check conversion was correct by comparing the table from cbv to cooc
(zxc <- table(cbv$VERB, cbv$CONSTR))
##
## all.other aspred
## all.other 137958 607
## regard 19 80
attach(cbv)
# manual computation of elementary statistics
(80/19) / (607/137958) # effect size of odds ratio: 956.9618
## [1] 956.9618
log((80/19) / (607/137958)) # logged version of the odds ration: log odds: 6.863763
## [1] 6.863763
# association in one direction: predicting VERB from ASPRED #######################################
## association as regression from the case-by-variable format
summary(model_v.from.c.cbv <- glm(VERB ~ CONSTR, family=binomial, data=cbv))
##
## Call:
## glm(formula = VERB ~ CONSTR, family = binomial, data = cbv)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4976 -0.0166 -0.0166 -0.0166 4.2167
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.8903 0.2294 -38.75 <2e-16 ***
## CONSTRaspred 6.8638 0.2584 26.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1632.38 on 138663 degrees of freedom
## Residual deviance: 870.18 on 138662 degrees of freedom
## AIC: 874.18
##
## Number of Fisher Scoring iterations: 11
coef(model_v.from.c.cbv)[2] # = log odds / logit: how much CONSTR=="aspred" increases
## CONSTRaspred
## 6.863763
exp(coef(model_v.from.c.cbv)[2]) # = odds ratio the (log) odds of VERB=="regard"
## CONSTRaspred
## 956.9617
drop1(model_v.from.c.cbv, test="LR")$LRT[2] # = G^2 / LLR
## [1] 762.1959
model_v.from.c.cbv$null.deviance - model_v.from.c.cbv$deviance # = G^2 / LLR
## [1] 762.1959
### Delta P
plot(aspred <- effect("CONSTR", model_v.from.c.cbv), type="response",
ylab="Predicted probability of VERB being 'regard'", ylim=c(0,1), grid=TRUE)
as.data.frame(aspred) # the difference between the predicted probabilities
## CONSTR fit se lower upper
## 1 all.other 0.0001377041 3.158669e-05 0.00008784 0.0002158685
## 2 aspred 0.1164483261 1.223782e-02 0.09452235 0.1426590596
diff(as.data.frame(aspred)$fit) # in the column fit is Delta P cx->v : 0.1163106
## [1] 0.1163106
## association as regression from the 2x2 co-occurrence matrix: DV = matrix with the DV-levels in columns
summary(model_v.from.c <- glm(cbind(cooc[1,], cooc[2,]) ~ colnames(cooc), family=binomial)) # same as ...
##
## Call:
## glm(formula = cbind(cooc[1, ], cooc[2, ]) ~ colnames(cooc), family = binomial)
##
## Deviance Residuals:
## [1] 0 0
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.8903 0.2294 -38.75 <2e-16 ***
## colnames(cooc)aspred 6.8638 0.2584 26.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7.6220e+02 on 1 degrees of freedom
## Residual deviance: -2.4758e-13 on 0 degrees of freedom
## AIC: 14.889
##
## Number of Fisher Scoring iterations: 3
summary(model_v.from.c <- glm(t(cooc) ~ colnames(cooc), family=binomial)) # ... this one
##
## Call:
## glm(formula = t(cooc) ~ colnames(cooc), family = binomial)
##
## Deviance Residuals:
## [1] 0 0
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.8903 0.2294 -38.75 <2e-16 ***
## colnames(cooc)aspred 6.8638 0.2584 26.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7.6220e+02 on 1 degrees of freedom
## Residual deviance: -2.4758e-13 on 0 degrees of freedom
## AIC: 14.889
##
## Number of Fisher Scoring iterations: 3
coef(model_v.from.c)[2] # = log odds / logit: how much CONSTR=="aspred" increases
## colnames(cooc)aspred
## 6.863763
exp(coef(model_v.from.c)[2]) # = odds ratio the (log) odds of VERB=="regard"
## colnames(cooc)aspred
## 956.9618
drop1(model_v.from.c, test="Chisq")$LRT[2] # = G^2 / LLR
## [1] 762.1959
model_v.from.c$null.deviance # = G^2 / LLR
## [1] 762.1959
summary(null.model_v.from.c.cbv <- glm(VERB ~ 1, family=binomial, data=cbv))
##
## Call:
## glm(formula = VERB ~ 1, family = binomial, data = cbv)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.0378 -0.0378 -0.0378 -0.0378 3.8065
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.2440 0.1005 -72.05 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1632.4 on 138663 degrees of freedom
## Residual deviance: 1632.4 on 138663 degrees of freedom
## AIC: 1634.4
##
## Number of Fisher Scoring iterations: 10
coef(null.model_v.from.c.cbv) # logit of the relative frequency of regard in the corpus,
## (Intercept)
## -7.243975
# i.e. of sum(VERB=="regard") / sum(VERB!="")
# association in other direction: predicting ASPRED from VERB #####################################
## association as regression from the case-by-variable format
summary(model_c.from.v.cbv <- glm(CONSTR ~ VERB, family=binomial, data=cbv))
##
## Call:
## glm(formula = CONSTR ~ VERB, family = binomial, data = cbv)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8170 -0.0937 -0.0937 -0.0937 3.2956
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.42618 0.04068 -133.39 <2e-16 ***
## VERBregard 6.86376 0.25843 26.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8663.1 on 138663 degrees of freedom
## Residual deviance: 7900.9 on 138662 degrees of freedom
## AIC: 7904.9
##
## Number of Fisher Scoring iterations: 8
coef(model_c.from.v.cbv)[2] # = log odds / logit: how much VERB=="regard" increases
## VERBregard
## 6.863763
exp(coef(model_c.from.v.cbv)[2]) # = odds ratio the (log) odds of CONSTR=="aspred"
## VERBregard
## 956.9618
drop1(model_c.from.v.cbv, test="LR")$LRT[2] # = G^2 / LLR
## [1] 762.1959
model_c.from.v.cbv$null.deviance - model_c.from.v.cbv$deviance # = G^2 / LLR
## [1] 762.1959
### Delta P
plot(regard <- effect("VERB", model_c.from.v.cbv), type="response",
ylab="Predicted probability of CONSTRUCTION being 'as-predicative'", ylim=c(0,1), grid=TRUE)
as.data.frame(regard) # the difference between the predicted probabilities
## VERB fit se lower upper
## 1 all.other 0.004380616 0.0001774135 0.00404628 0.004742445
## 2 regard 0.808080808 0.0395793814 0.71857163 0.874108919
diff(as.data.frame(regard)$fit) # in the column fit is Delta P v->cx : 0.8037002
## [1] 0.8037002
## association as regression from the 2x2 co-occurrence matrix: DV = matrix with the DV-levels in columns
summary(model_c.from.v <- glm(cbind(cooc[,1], cooc[,2]) ~ rownames(cooc), family=binomial)) # same as ...
##
## Call:
## glm(formula = cbind(cooc[, 1], cooc[, 2]) ~ rownames(cooc), family = binomial)
##
## Deviance Residuals:
## [1] 0 0
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.42618 0.04068 -133.39 <2e-16 ***
## rownames(cooc)regard 6.86376 0.25843 26.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7.6220e+02 on 1 degrees of freedom
## Residual deviance: 1.3591e-12 on 0 degrees of freedom
## AIC: 16.821
##
## Number of Fisher Scoring iterations: 3
summary(model_c.from.v <- glm(cooc ~ rownames(cooc), family=binomial)) # ... this one
##
## Call:
## glm(formula = cooc ~ rownames(cooc), family = binomial)
##
## Deviance Residuals:
## [1] 0 0
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.42618 0.04068 -133.39 <2e-16 ***
## rownames(cooc)regard 6.86376 0.25843 26.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7.6220e+02 on 1 degrees of freedom
## Residual deviance: 1.3591e-12 on 0 degrees of freedom
## AIC: 16.821
##
## Number of Fisher Scoring iterations: 3
coef(model_c.from.v)[2] # = log odds / logit: how much VERB=="regard" increases
## rownames(cooc)regard
## 6.863763
exp(coef(model_c.from.v)[2]) # = odds ratio the (log) odds of CONSTR=="aspred"
## rownames(cooc)regard
## 956.9618
drop1(model_c.from.v, test="Chisq")$LRT[2] # = G^2 / LLR
## [1] 762.1959
model_c.from.v$null.deviance # = G^2 / LLR
## [1] 762.1959
summary(null.model_c.from.v.cbv <- glm(CONSTR ~ 1, family=binomial, data=cbv))
##
## Call:
## glm(formula = CONSTR ~ 1, family = binomial, data = cbv)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.0997 -0.0997 -0.0997 -0.0997 3.2581
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.30251 0.03825 -138.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8663.1 on 138663 degrees of freedom
## Residual deviance: 8663.1 on 138663 degrees of freedom
## AIC: 8665.1
##
## Number of Fisher Scoring iterations: 8
coef(null.model_c.from.v.cbv) # logit of the relative frequency of as-predicatives in the corpus,
## (Intercept)
## -5.302508
# i.e. of sum(CONSTR=="aspred") / sum(CONSTR!="")
# traditional and widely-used measures ############################################################
## obs.a and exp.a computed in the traditional way from the 2x2 co-occurrence matrix cooc
addmargins(cooc)
## CONSTR
## VERB aspred all.other Sum
## regard 80 19 99
## all.other 607 137958 138565
## Sum 687 137977 138664
(obs.a <- cooc["regard","aspred"]) # observed co-occurrence freq
## [1] 80
(exp.a <- chisq.test(cooc, correct=FALSE)$exp["regard","aspred"]) # expected co-occurrence freq
## Warning in chisq.test(cooc, correct = FALSE): Chi-squared approximation may
## be incorrect
## [1] 0.4904878
### from this we can compute
log2(obs.a/exp.a) # 7.349639, = pointwise MI
## [1] 7.349639
(obs.a - exp.a)/sqrt(obs.a) # 8.889434, = t
## [1] 8.889434
(obs.a - exp.a)/sqrt(exp.a) # 113.5285 , = z (same as chisq.test(cooc, correct=FALSE)$residuals["regard","aspred"])
## [1] 113.5285
## obs.a and exp.a computed from the data frame cbv and the regression models
### from model_v.from.c.cbv
(obs.a <- sum(CONSTR=="aspred") * # = how often CONSTR=="aspred" *
ilogit(sum(coef(model_v.from.c.cbv)))) # = predicted probability of VERB=="regard"
## [1] 80
### from model_c.from.v.cbv
(obs.a <- sum(VERB=="regard") * # = how often VERB=="regard" *
ilogit(sum(coef(model_c.from.v.cbv)))) # = predicted probability of CONSTR=="aspred"
## [1] 80
(exp.a <- ilogit(coef(null.model_c.from.v.cbv)) * ilogit(coef(null.model_v.from.c.cbv)) *
nrow(null.model_c.from.v.cbv$data))
## (Intercept)
## 0.4904878
### from this we can compute (as above)
log2(obs.a/exp.a) # 7.349639, = pointwise MI
## (Intercept)
## 7.349639
(obs.a - exp.a)/sqrt(obs.a) # 8.889434, = t
## (Intercept)
## 8.889434
(obs.a - exp.a)/sqrt(exp.a) # 113.5285 , = z
## (Intercept)
## 113.5285