Analyzing co-occurrence data

rm(list=ls(all=TRUE))
library(effects)
# preparation of data #############################################################################
dfer <- function(some.matrix) { # change a 2x2 matrix into a case-by-variable data frame
   DF <- data.frame(
      factor(rep(dimnames(some.matrix)[[1]], rowSums(some.matrix))),
      factor(rep(rep(dimnames(some.matrix)[[2]], 2), t(some.matrix)))
   ); names(DF) <- names(dimnames(some.matrix))
   return(DF)
}

# generate a 2x2 co=occurrence matrix representing Table 2 of the paper
addmargins(cooc <- matrix(    # generate a matrix cooc (and also show its totals)
   c(80, 687-80, 19, 137958), # of these frequencies (columnwise)
   nrow=2,                    # making up 2 rows
   dimnames=list(VERB=c("regard","all.other"),     # use these row names & labels
                 CONSTR=c("aspred","all.other")))) # use these column names & labels

##            CONSTR
## VERB        aspred all.other    Sum
##   regard        80        19     99
##   all.other    607    137958 138565
##   Sum          687    137977 138664

# make into data frame with the case-by-variable format
summary(cbv <- dfer(cooc))

##         VERB              CONSTR
##  all.other:138565   all.other:137977
##  regard   :    99   aspred   :   687

head(cbv)

##     VERB CONSTR
## 1 regard aspred
## 2 regard aspred
## 3 regard aspred
## 4 regard aspred
## 5 regard aspred
## 6 regard aspred

tail(cbv)

##             VERB    CONSTR
## 138659 all.other all.other
## 138660 all.other all.other
## 138661 all.other all.other
## 138662 all.other all.other
## 138663 all.other all.other
## 138664 all.other all.other

# check conversion was correct by comparing the table from cbv to cooc
(zxc <- table(cbv$VERB, cbv$CONSTR))

##
##             all.other aspred
##   all.other    137958    607
##   regard           19     80

attach(cbv)

# manual computation of elementary statistics
(80/19) / (607/137958)      # effect size of odds ratio: 956.9618

## [1] 956.9618

log((80/19) / (607/137958)) # logged version of the odds ration: log odds: 6.863763

## [1] 6.863763

# association in one direction: predicting VERB from ASPRED #######################################
## association as regression from the case-by-variable format
summary(model_v.from.c.cbv <- glm(VERB ~ CONSTR, family=binomial, data=cbv))

##
## Call:
## glm(formula = VERB ~ CONSTR, family = binomial, data = cbv)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -0.4976  -0.0166  -0.0166  -0.0166   4.2167
##
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)   -8.8903     0.2294  -38.75   <2e-16 ***
## CONSTRaspred   6.8638     0.2584   26.56   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 1632.38  on 138663  degrees of freedom
## Residual deviance:  870.18  on 138662  degrees of freedom
## AIC: 874.18
##
## Number of Fisher Scoring iterations: 11

   coef(model_v.from.c.cbv)[2]      # = log odds / logit: how much CONSTR=="aspred" increases

## CONSTRaspred
##     6.863763

   exp(coef(model_v.from.c.cbv)[2]) # = odds ratio        the (log) odds of VERB=="regard"

## CONSTRaspred
##     956.9617

   drop1(model_v.from.c.cbv, test="LR")$LRT[2]                    # = G^2 / LLR

## [1] 762.1959

   model_v.from.c.cbv$null.deviance - model_v.from.c.cbv$deviance # = G^2 / LLR

## [1] 762.1959

### Delta P
plot(aspred <- effect("CONSTR", model_v.from.c.cbv), type="response",
     ylab="Predicted probability of VERB being 'regard'", ylim=c(0,1), grid=TRUE)

   as.data.frame(aspred)           # the difference between the predicted probabilities

##      CONSTR          fit           se      lower        upper
## 1 all.other 0.0001377041 3.158669e-05 0.00008784 0.0002158685
## 2    aspred 0.1164483261 1.223782e-02 0.09452235 0.1426590596

   diff(as.data.frame(aspred)$fit) # in the column fit is Delta P cx->v : 0.1163106

## [1] 0.1163106

## association as regression from the 2x2 co-occurrence matrix: DV = matrix with the DV-levels in columns
summary(model_v.from.c <- glm(cbind(cooc[1,], cooc[2,]) ~ colnames(cooc), family=binomial)) # same as ...

##
## Call:
## glm(formula = cbind(cooc[1, ], cooc[2, ]) ~ colnames(cooc), family = binomial)
##
## Deviance Residuals:
## [1]  0  0
##
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)
## (Intercept)           -8.8903     0.2294  -38.75   <2e-16 ***
## colnames(cooc)aspred   6.8638     0.2584   26.56   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance:  7.6220e+02  on 1  degrees of freedom
## Residual deviance: -2.4758e-13  on 0  degrees of freedom
## AIC: 14.889
##
## Number of Fisher Scoring iterations: 3

summary(model_v.from.c <- glm(t(cooc) ~ colnames(cooc), family=binomial)) # ... this one

##
## Call:
## glm(formula = t(cooc) ~ colnames(cooc), family = binomial)
##
## Deviance Residuals:
## [1]  0  0
##
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)
## (Intercept)           -8.8903     0.2294  -38.75   <2e-16 ***
## colnames(cooc)aspred   6.8638     0.2584   26.56   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance:  7.6220e+02  on 1  degrees of freedom
## Residual deviance: -2.4758e-13  on 0  degrees of freedom
## AIC: 14.889
##
## Number of Fisher Scoring iterations: 3

   coef(model_v.from.c)[2]      # = log odds / logit: how much CONSTR=="aspred" increases

## colnames(cooc)aspred
##             6.863763

   exp(coef(model_v.from.c)[2]) # = odds ratio        the (log) odds of VERB=="regard"

## colnames(cooc)aspred
##             956.9618

   drop1(model_v.from.c, test="Chisq")$LRT[2] # = G^2 / LLR

## [1] 762.1959

   model_v.from.c$null.deviance               # = G^2 / LLR

## [1] 762.1959

summary(null.model_v.from.c.cbv <- glm(VERB ~ 1, family=binomial, data=cbv))

##
## Call:
## glm(formula = VERB ~ 1, family = binomial, data = cbv)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -0.0378  -0.0378  -0.0378  -0.0378   3.8065
##
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  -7.2440     0.1005  -72.05   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 1632.4  on 138663  degrees of freedom
## Residual deviance: 1632.4  on 138663  degrees of freedom
## AIC: 1634.4
##
## Number of Fisher Scoring iterations: 10

   coef(null.model_v.from.c.cbv) # logit of the relative frequency of regard in the corpus,

## (Intercept)
##   -7.243975

                                 # i.e. of sum(VERB=="regard") / sum(VERB!="")



# association in other direction: predicting ASPRED from VERB #####################################
## association as regression from the case-by-variable format
summary(model_c.from.v.cbv <- glm(CONSTR ~ VERB, family=binomial, data=cbv))

##
## Call:
## glm(formula = CONSTR ~ VERB, family = binomial, data = cbv)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -1.8170  -0.0937  -0.0937  -0.0937   3.2956
##
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.42618    0.04068 -133.39   <2e-16 ***
## VERBregard   6.86376    0.25843   26.56   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 8663.1  on 138663  degrees of freedom
## Residual deviance: 7900.9  on 138662  degrees of freedom
## AIC: 7904.9
##
## Number of Fisher Scoring iterations: 8

   coef(model_c.from.v.cbv)[2]      # = log odds / logit: how much VERB=="regard" increases

## VERBregard
##   6.863763

   exp(coef(model_c.from.v.cbv)[2]) # = odds ratio        the (log) odds of CONSTR=="aspred"

## VERBregard
##   956.9618

   drop1(model_c.from.v.cbv, test="LR")$LRT[2]                    # = G^2 / LLR

## [1] 762.1959

   model_c.from.v.cbv$null.deviance - model_c.from.v.cbv$deviance # = G^2 / LLR

## [1] 762.1959

### Delta P
plot(regard <- effect("VERB", model_c.from.v.cbv), type="response",
     ylab="Predicted probability of CONSTRUCTION being 'as-predicative'", ylim=c(0,1), grid=TRUE)

   as.data.frame(regard)           # the difference between the predicted probabilities

##        VERB         fit           se      lower       upper
## 1 all.other 0.004380616 0.0001774135 0.00404628 0.004742445
## 2    regard 0.808080808 0.0395793814 0.71857163 0.874108919

   diff(as.data.frame(regard)$fit) # in the column fit is Delta P v->cx : 0.8037002

## [1] 0.8037002

## association as regression from the 2x2 co-occurrence matrix: DV = matrix with the DV-levels in columns
summary(model_c.from.v <- glm(cbind(cooc[,1], cooc[,2]) ~ rownames(cooc), family=binomial)) # same as ...

##
## Call:
## glm(formula = cbind(cooc[, 1], cooc[, 2]) ~ rownames(cooc), family = binomial)
##
## Deviance Residuals:
## [1]  0  0
##
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)
## (Intercept)          -5.42618    0.04068 -133.39   <2e-16 ***
## rownames(cooc)regard  6.86376    0.25843   26.56   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 7.6220e+02  on 1  degrees of freedom
## Residual deviance: 1.3591e-12  on 0  degrees of freedom
## AIC: 16.821
##
## Number of Fisher Scoring iterations: 3

summary(model_c.from.v <- glm(cooc ~ rownames(cooc), family=binomial)) # ... this one

##
## Call:
## glm(formula = cooc ~ rownames(cooc), family = binomial)
##
## Deviance Residuals:
## [1]  0  0
##
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)
## (Intercept)          -5.42618    0.04068 -133.39   <2e-16 ***
## rownames(cooc)regard  6.86376    0.25843   26.56   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 7.6220e+02  on 1  degrees of freedom
## Residual deviance: 1.3591e-12  on 0  degrees of freedom
## AIC: 16.821
##
## Number of Fisher Scoring iterations: 3

   coef(model_c.from.v)[2]      # = log odds / logit: how much VERB=="regard" increases

## rownames(cooc)regard
##             6.863763

   exp(coef(model_c.from.v)[2]) # = odds ratio        the (log) odds of CONSTR=="aspred"

## rownames(cooc)regard
##             956.9618

   drop1(model_c.from.v, test="Chisq")$LRT[2] # = G^2 / LLR

## [1] 762.1959

   model_c.from.v$null.deviance               # = G^2 / LLR

## [1] 762.1959

summary(null.model_c.from.v.cbv <- glm(CONSTR ~ 1, family=binomial, data=cbv))

##
## Call:
## glm(formula = CONSTR ~ 1, family = binomial, data = cbv)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -0.0997  -0.0997  -0.0997  -0.0997   3.2581
##
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.30251    0.03825  -138.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 8663.1  on 138663  degrees of freedom
## Residual deviance: 8663.1  on 138663  degrees of freedom
## AIC: 8665.1
##
## Number of Fisher Scoring iterations: 8

   coef(null.model_c.from.v.cbv) # logit of the relative frequency of as-predicatives in the corpus,

## (Intercept)
##   -5.302508

                                 # i.e. of sum(CONSTR=="aspred") / sum(CONSTR!="")



# traditional and widely-used measures ############################################################
## obs.a and exp.a computed in the traditional way from the 2x2 co-occurrence matrix cooc
addmargins(cooc)

##            CONSTR
## VERB        aspred all.other    Sum
##   regard        80        19     99
##   all.other    607    137958 138565
##   Sum          687    137977 138664

(obs.a <- cooc["regard","aspred"])                                # observed co-occurrence freq

## [1] 80

(exp.a <- chisq.test(cooc, correct=FALSE)$exp["regard","aspred"]) # expected co-occurrence freq

## Warning in chisq.test(cooc, correct = FALSE): Chi-squared approximation may
## be incorrect

## [1] 0.4904878

### from this we can compute
log2(obs.a/exp.a)           #   7.349639, = pointwise MI

## [1] 7.349639

(obs.a - exp.a)/sqrt(obs.a) #   8.889434, = t

## [1] 8.889434

(obs.a - exp.a)/sqrt(exp.a) # 113.5285  , = z (same as chisq.test(cooc, correct=FALSE)$residuals["regard","aspred"])

## [1] 113.5285

## obs.a and exp.a computed from the data frame cbv and the regression models
### from model_v.from.c.cbv
(obs.a <- sum(CONSTR=="aspred") *         # = how often CONSTR=="aspred" *
   ilogit(sum(coef(model_v.from.c.cbv)))) # = predicted probability of VERB=="regard"

## [1] 80

### from model_c.from.v.cbv
(obs.a <- sum(VERB=="regard") *           # = how often VERB=="regard" *
   ilogit(sum(coef(model_c.from.v.cbv)))) # = predicted probability of CONSTR=="aspred"

## [1] 80

(exp.a <- ilogit(coef(null.model_c.from.v.cbv)) * ilogit(coef(null.model_v.from.c.cbv)) *
   nrow(null.model_c.from.v.cbv$data))

## (Intercept)
##   0.4904878

### from this we can compute (as above)
log2(obs.a/exp.a)           #   7.349639, = pointwise MI

## (Intercept)
##    7.349639

(obs.a - exp.a)/sqrt(obs.a) #   8.889434, = t

## (Intercept)
##    8.889434

(obs.a - exp.a)/sqrt(exp.a) # 113.5285  , = z

## (Intercept)
##    113.5285

Analyzing co-occurrence data

Stefan Th. Gries

Sun June 02 12:34:56 2019