1 Intro

Let’s load a data set for an example of a multinomial model:

rm(list=ls(all.names=TRUE))
library(car); library(effects); library(nnet)
source("helpers/202_R2.r"); source("helpers/202_summarize.multinom.r") # load small helper functions
summary(d <- read.delim(   # summarize d, the result of loading
   file="inputfiles/202_06_DATALT.csv", # this file
   stringsAsFactors=TRUE)) # change categorical variables into factors
##       CASE        TRANSITIVITY    REC_LEN          PAT_LEN      
##  Min.   :   1.0   d_a:810      Min.   : 1.000   Min.   : 1.000  
##  1st Qu.: 474.8   d_p:349      1st Qu.: 1.000   1st Qu.: 2.000  
##  Median : 935.5   p_a:476      Median : 2.000   Median : 3.000  
##  Mean   : 939.4   p_p:141      Mean   : 3.128   Mean   : 4.443  
##  3rd Qu.:1412.2                3rd Qu.: 3.000   3rd Qu.: 6.000  
##  Max.   :1871.0                Max.   :44.000   Max.   :33.000  
##       REC_ANIM           PAT_SEM     REC_ACCESSIB PAT_ACCESSIB
##  animate  :1106   abstract   :1265   given:993    given: 312  
##  inanimate: 670   concrete   : 335   new  :783    new  :1464  
##                   information: 176                            
##                                                               
##                                                               
## 

Here’s what the columns are/contain:

  • the column CASE is a simple case/row number just so that each case has a unique ID;
  • the column TRANSITIVITY is the response variable: it’s the construction a speaker chose in a naturalistic setting:
    • d_a means ‘ditransitive active’ as in He gave herrecipient a bookpatient;
    • d_p means ‘ditransitive passive’ as in Sherecipient was given a bookpatient by him;
    • p_a means ‘prepositional dative active’ as in He gave a bookpatient to herrecipient;
    • p_p means ‘prepositional dative passive’ as in A bookpatient was given to herrecipient by him;
  • the column REC_LEN is a numeric predictor: it’s the length of the recipient in characters (e.g., her has 3 characters);
  • the column PAT_LEN is a numeric predictor: it’s the length of the patient in characters (e.g., a book has 6 characters (incl. the space));
  • the column REC_ANIM is a binary predictor: it’s whether the recipient is animate (e.g., He gave the customer a dirty look) or inanimate (e.g., He gave the table a dirty look);
  • the column PAT_SEM is a ternary predictor: it’s whether the patient was abstract (e.g., He gave the idea some thought) or concrete (e.g., He gave her a book) or information (e.g., He told her a story);
  • the column REC_ACCESSIB is a binary predictor: it’s whether the recipient was given (i.e., inferrable from information in the previous sentence) or new (i.e., maybe newly introduced to the discourse);
  • the column PAT_ACCESSIB is a binary predictor: it’s patient the recipient was given (i.e., inferrable from information in the previous sentence) or new (i.e., maybe newly introduced to the discourse).

We want to know whether TRANSITIVITY is predictable from

  • REC_ANIM, REC_ACCESSIB, PAT_ACCESSIB;
  • a short-before-long effect regarding the recipient and the patient;
  • the interaction of the accessibility predictors;
  • the interaction of the REC_ANIM and a short-before-long effect.

2 Deviance & baseline(s)

Let’s already compute the baselines for what will be the response variable, TRANSITIVITY:

(baselines <- c(
   "baseline 1"=max(              # make baselines[1] the highest
      prop.table(                 # proportion in the
         table(d$TRANSITIVITY))), # frequency table of the response
   "baseline 2"=sum(                 # make baselines[2] the sum of the
      prop.table(                    # proportions in the
         table(d$TRANSITIVITY))^2))) # frequency table of the response squared
## baseline 1 baseline 2 
##  0.4560811  0.3247625

Let’s also compute the deviance of the null model:

deviance(m.00 <- multinom(TRANSITIVITY ~ 1, data=d, na.action=na.exclude))
## # weights:  8 (3 variable)
## initial  value 2462.058785 
## iter  10 value 2187.711475
## iter  10 value 2187.711469
## final  value 2187.711344 
## converged
## [1] 4375.423

3 Exploration & preparation

Some exploration of the relevant variables and we prepare our length-difference predictor:

# the predictor(s)/response on its/their own
hist(d$REC_LEN)

hist(d$PAT_LEN)

hist(d$LEN_RECmPAT_LOG <- log2(d$REC_LEN)-log2(d$PAT_LEN), main="")

table(d$REC_ANIM)
## 
##   animate inanimate 
##      1106       670
table(d$REC_ACCESSIB)
## 
## given   new 
##   993   783
table(d$PAT_ACCESSIB)
## 
## given   new 
##   312  1464
table(d$REC_ACCESSIB, d$PAT_ACCESSIB)
##        
##         given new
##   given   205 788
##   new     107 676
# the predictor(s) w/ the response
spineplot(d$TRANSITIVITY ~ d$LEN_RECmPAT_LOG)

table(d$REC_ANIM, d$TRANSITIVITY)
##            
##             d_a d_p p_a p_p
##   animate   588 242 197  79
##   inanimate 222 107 279  62
ftable(d$REC_ACCESSIB, d$PAT_ACCESSIB, d$TRANSITIVITY)
##              d_a d_p p_a p_p
##                             
## given given   98  48  39  20
##       new    469 165 117  37
## new   given   21  21  36  29
##       new    222 115 284  55

4 Modeling & numerical interpretation

Let’s fit our initial regression model:

summary(m.01 <- multinom(        # make/summarize the multinomial model m.01:
   TRANSITIVITY ~ 1 +            # TRANSITIVITY ~ an overall intercept (1)
   REC_ACCESSIB + PAT_ACCESSIB + # the accessibility predictors & ...
   REC_ACCESSIB:PAT_ACCESSIB +   # their interaction
   REC_ANIM + LEN_RECmPAT_LOG +  # the rec animacy & length diff preds & ...
   REC_ANIM:LEN_RECmPAT_LOG,     # their interaction
   data=d,                # those vars are in d
   model=TRUE,            # & save the model frame (optional)
   na.action=na.exclude)) # skip cases with NA/missing data
## # weights:  32 (21 variable)
## initial  value 2462.058785 
## iter  10 value 1814.322650
## iter  20 value 1767.454974
## final  value 1762.521921 
## converged
## Call:
## multinom(formula = TRANSITIVITY ~ 1 + REC_ACCESSIB + PAT_ACCESSIB + 
##     REC_ACCESSIB:PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG + 
##     REC_ANIM:LEN_RECmPAT_LOG, data = d, na.action = na.exclude, 
##     model = TRUE)
## 
## Coefficients:
##     (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p  -0.1210011       0.4906650      -0.2429275       -0.06653502
## p_a  -0.3860966       0.8360726      -0.4249039        0.88620903
## p_p  -0.8663412       1.3627552      -0.8513750        0.41282684
##     LEN_RECmPAT_LOG REC_ACCESSIBnew:PAT_ACCESSIBnew
## d_p       0.4666473                      -0.4347810
## p_a       1.1615820                      -0.1403867
## p_p       1.0006701                      -1.0594962
##     REC_ANIMinanimate:LEN_RECmPAT_LOG
## d_p                       -0.07724824
## p_a                       -0.24365931
## p_p                       -0.10326591
## 
## Std. Errors:
##     (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p   0.1989219       0.3658536       0.2039208         0.1787075
## p_a   0.2291365       0.3830872       0.2485331         0.1642522
## p_p   0.2751310       0.4128681       0.3179271         0.2245364
##     LEN_RECmPAT_LOG REC_ACCESSIBnew:PAT_ACCESSIBnew
## d_p      0.06357114                       0.3951689
## p_a      0.08302891                       0.4162775
## p_p      0.10013924                       0.4785921
##     REC_ANIMinanimate:LEN_RECmPAT_LOG
## d_p                         0.1063059
## p_a                         0.1164769
## p_p                         0.1458351
## 
## Residual Deviance: 3525.044 
## AIC: 3567.044

Let’s also compute the z-scores for these coefficient values (however uninterpretable those are):

(m.01.zscores <- summary(m.01)$coefficients / summary(m.01)$standard.errors)
##     (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p  -0.6082847        1.341151       -1.191284        -0.3723124
## p_a  -1.6850072        2.182460       -1.709647         5.3954176
## p_p  -3.1488316        3.300703       -2.677894         1.8385745
##     LEN_RECmPAT_LOG REC_ACCESSIBnew:PAT_ACCESSIBnew
## d_p        7.340553                       -1.100241
## p_a       13.990092                       -0.337243
## p_p        9.992787                       -2.213777
##     REC_ANIMinanimate:LEN_RECmPAT_LOG
## d_p                        -0.7266601
## p_a                        -2.0919113
## p_p                        -0.7081005

And from that we compute the p-values:

(m.01.pvalues <- 2*pnorm(abs(m.01.zscores), lower.tail=FALSE))
##     (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p 0.542998690    0.1798713698     0.233542271      7.096603e-01
## p_a 0.091987185    0.0290755883     0.087331199      6.836435e-08
## p_p 0.001639247    0.0009644279     0.007408676      6.597780e-02
##     LEN_RECmPAT_LOG REC_ACCESSIBnew:PAT_ACCESSIBnew
## d_p    2.127136e-13                       0.2712271
## p_a    1.791808e-44                       0.7359337
## p_p    1.639074e-23                       0.0268441
##     REC_ANIMinanimate:LEN_RECmPAT_LOG
## d_p                        0.46743422
## p_a                        0.03644645
## p_p                        0.47888286

So there are definitely significant results. Since we can’t use drop1, let’s do anova two times, once for each interaction:

summary(m.02a <- update(         # make m.02a an update of
   m.01, .~.                     # m.01, namely all of it (.~.), but then
   - REC_ACCESSIB:PAT_ACCESSIB)) # remove this interaction
## # weights:  28 (18 variable)
## initial  value 2462.058785 
## iter  10 value 1801.290105
## iter  20 value 1767.457695
## final  value 1765.590290 
## converged
## Call:
## multinom(formula = TRANSITIVITY ~ REC_ACCESSIB + PAT_ACCESSIB + 
##     REC_ANIM + LEN_RECmPAT_LOG + REC_ANIM:LEN_RECmPAT_LOG, data = d, 
##     na.action = na.exclude, model = TRUE)
## 
## Coefficients:
##     (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p -0.03493706       0.1158438      -0.3509796       -0.07198044
## p_a -0.40506728       0.7027753      -0.3967182        0.88305548
## p_p -0.55675057       0.6115019      -1.3339509        0.40044051
##     LEN_RECmPAT_LOG REC_ANIMinanimate:LEN_RECmPAT_LOG
## d_p       0.4661081                       -0.08120209
## p_a       1.1589835                       -0.24308442
## p_p       1.0015156                       -0.11875462
## 
## Std. Errors:
##     (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p   0.1803328       0.1431314       0.1737402         0.1786000
## p_a   0.1995348       0.1519139       0.1959591         0.1642183
## p_p   0.2231066       0.2124143       0.2304589         0.2240036
##     LEN_RECmPAT_LOG REC_ANIMinanimate:LEN_RECmPAT_LOG
## d_p      0.06351107                         0.1061884
## p_a      0.08292303                         0.1163086
## p_p      0.10006980                         0.1455638
## 
## Residual Deviance: 3531.181 
## AIC: 3567.181
anova(m.01, m.02a, # compare m.01 to m.02a
   test="Chisq")   # using a LRT
## Likelihood ratio tests of Multinomial Models
## 
## Response: TRANSITIVITY
##                                                                                                                 Model
## 1                                 REC_ACCESSIB + PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG + REC_ANIM:LEN_RECmPAT_LOG
## 2 1 + REC_ACCESSIB + PAT_ACCESSIB + REC_ACCESSIB:PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG + REC_ANIM:LEN_RECmPAT_LOG
##   Resid. df Resid. Dev   Test    Df LR stat.   Pr(Chi)
## 1      5310   3531.181                                
## 2      5307   3525.044 1 vs 2     3 6.136739 0.1051438
summary(m.02b <- update(        # make m.02b an update of
   m.01, .~.                    # m.01, namely all of it (.~.), but then
   - REC_ANIM:LEN_RECmPAT_LOG)) # remove this interaction
## # weights:  28 (18 variable)
## initial  value 2462.058785 
## iter  10 value 1790.360208
## iter  20 value 1764.915297
## final  value 1764.868000 
## converged
## Call:
## multinom(formula = TRANSITIVITY ~ REC_ACCESSIB + PAT_ACCESSIB + 
##     REC_ANIM + LEN_RECmPAT_LOG + REC_ACCESSIB:PAT_ACCESSIB, data = d, 
##     na.action = na.exclude, model = TRUE)
## 
## Coefficients:
##     (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p  -0.1614717       0.4998229      -0.2469363        0.04513013
## p_a  -0.4024288       0.8670756      -0.4384312        0.95737277
## p_p  -0.9283095       1.3712701      -0.8553391        0.56971191
##     LEN_RECmPAT_LOG REC_ACCESSIBnew:PAT_ACCESSIBnew
## d_p       0.4398474                      -0.4412384
## p_a       1.0442234                      -0.1707976
## p_p       0.9617214                      -1.0607971
## 
## Std. Errors:
##     (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p   0.1945742       0.3655852       0.2036241         0.1454223
## p_a   0.2249132       0.3805211       0.2468435         0.1492921
## p_p   0.2728389       0.4126677       0.3177536         0.2076899
##     LEN_RECmPAT_LOG REC_ACCESSIBnew:PAT_ACCESSIBnew
## d_p      0.05175417                       0.3949123
## p_a      0.05886288                       0.4139378
## p_p      0.07451965                       0.4781441
## 
## Residual Deviance: 3529.736 
## AIC: 3565.736
anova(m.01, m.02b, # compare m.01 to m.02b
   test="Chisq")   # using a LRT
## Likelihood ratio tests of Multinomial Models
## 
## Response: TRANSITIVITY
##                                                                                                                 Model
## 1                                REC_ACCESSIB + PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG + REC_ACCESSIB:PAT_ACCESSIB
## 2 1 + REC_ACCESSIB + PAT_ACCESSIB + REC_ACCESSIB:PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG + REC_ANIM:LEN_RECmPAT_LOG
##   Resid. df Resid. Dev   Test    Df LR stat.   Pr(Chi)
## 1      5310   3529.736                                
## 2      5307   3525.044 1 vs 2     3 4.692158 0.1957774

Both interactions are ns so we first delete the one with the higher p-value, i.e. we delete REC_ANIM:LEN_RECmPAT_LOG and go with m.02b. But can we also delete the other interaction?

summary(m.03 <- update(          # make m.03 an update of
   m.02b, .~.                    # m.02b, namely all of it (.~.), but then
   - REC_ACCESSIB:PAT_ACCESSIB)) # remove this interaction
## # weights:  24 (15 variable)
## initial  value 2462.058785 
## iter  10 value 1794.993382
## iter  20 value 1767.868111
## final  value 1767.866967 
## converged
## Call:
## multinom(formula = TRANSITIVITY ~ REC_ACCESSIB + PAT_ACCESSIB + 
##     REC_ANIM + LEN_RECmPAT_LOG, data = d, na.action = na.exclude, 
##     model = TRUE)
## 
## Coefficients:
##     (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p -0.07582023       0.1192941      -0.3564488        0.04390158
## p_a -0.41438351       0.7085056      -0.4224211        0.95956732
## p_p -0.61758006       0.6157147      -1.3373812        0.55492462
##     LEN_RECmPAT_LOG
## d_p       0.4380379
## p_a       1.0425965
## p_p       0.9557104
## 
## Std. Errors:
##     (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p   0.1760039       0.1430298       0.1732770         0.1454120
## p_a   0.1950505       0.1519703       0.1942310         0.1492648
## p_p   0.2199506       0.2123822       0.2295769         0.2072032
##     LEN_RECmPAT_LOG
## d_p      0.05167523
## p_a      0.05876898
## p_p      0.07436727
## 
## Residual Deviance: 3535.734 
## AIC: 3565.734
anova(m.02b, m.03, # compare  m.02b to m.03
   test="Chisq")   # using a LRT
## Likelihood ratio tests of Multinomial Models
## 
## Response: TRANSITIVITY
##                                                                                  Model
## 1                             REC_ACCESSIB + PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG
## 2 REC_ACCESSIB + PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG + REC_ACCESSIB:PAT_ACCESSIB
##   Resid. df Resid. Dev   Test    Df LR stat.   Pr(Chi)
## 1      5313   3535.734                                
## 2      5310   3529.736 1 vs 2     3 5.997935 0.1117107

Yes, we can so we delete REC_ACCESSIB:PAT_ACCESSIB and go with m.03. But can we also delete main effects?

summary(m.04a <- update( # make m.04a an update of
   m.03, .~.             # m.03, namely all of it (.~.), but then
   - REC_ACCESSIB))      # remove this predictor
## # weights:  20 (12 variable)
## initial  value 2462.058785 
## iter  10 value 1805.888093
## final  value 1780.292471 
## converged
## Call:
## multinom(formula = TRANSITIVITY ~ PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG, 
##     data = d, na.action = na.exclude, model = TRUE)
## 
## Coefficients:
##     (Intercept) PAT_ACCESSIBnew REC_ANIMinanimate LEN_RECmPAT_LOG
## d_p -0.03385164      -0.3446204        0.04927733       0.4487349
## p_a -0.13661249      -0.3158986        0.97063967       1.1111132
## p_p -0.38134000      -1.2462871        0.56502687       1.0158562
## 
## Std. Errors:
##     (Intercept) PAT_ACCESSIBnew REC_ANIMinanimate LEN_RECmPAT_LOG
## d_p   0.1694037       0.1723732         0.1453316      0.05021125
## p_a   0.1843444       0.1917693         0.1480772      0.05767235
## p_p   0.2014785       0.2262448         0.2066145      0.07242679
## 
## Residual Deviance: 3560.585 
## AIC: 3584.585
   anova(m.03, m.04a, test="Chisq") # compare models with a LRT
## Likelihood ratio tests of Multinomial Models
## 
## Response: TRANSITIVITY
##                                                      Model Resid. df Resid. Dev
## 1                PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG      5316   3560.585
## 2 REC_ACCESSIB + PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG      5313   3535.734
##     Test    Df LR stat.      Pr(Chi)
## 1                                   
## 2 1 vs 2     3 24.85101 1.658861e-05
summary(m.04b <- update( # make m.04b an update of
   m.03, .~.             # m.03, namely all of it (.~.), but then
   - PAT_ACCESSIB))      # remove this predictor
## # weights:  20 (12 variable)
## initial  value 2462.058785 
## iter  10 value 1798.965851
## final  value 1784.182896 
## converged
## Call:
## multinom(formula = TRANSITIVITY ~ REC_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG, 
##     data = d, na.action = na.exclude, model = TRUE)
## 
## Coefficients:
##     (Intercept) REC_ACCESSIBnew REC_ANIMinanimate LEN_RECmPAT_LOG
## d_p  -0.3514652      0.09136321        0.02985044       0.4429333
## p_a  -0.7382389      0.67129889        0.94339105       1.0495865
## p_p  -1.4885262      0.46162804        0.40302393       0.9614303
## 
## Std. Errors:
##     (Intercept) REC_ACCESSIBnew REC_ANIMinanimate LEN_RECmPAT_LOG
## d_p   0.1155751       0.1420761         0.1449772      0.05162030
## p_a   0.1308031       0.1505184         0.1481442      0.05877521
## p_p   0.1714852       0.2072247         0.2029757      0.07349044
## 
## Residual Deviance: 3568.366 
## AIC: 3592.366
   anova(m.03, m.04b, test="Chisq") # compare models with a LRT
## Likelihood ratio tests of Multinomial Models
## 
## Response: TRANSITIVITY
##                                                      Model Resid. df Resid. Dev
## 1                REC_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG      5316   3568.366
## 2 REC_ACCESSIB + PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG      5313   3535.734
##     Test    Df LR stat.      Pr(Chi)
## 1                                   
## 2 1 vs 2     3 32.63186 3.851131e-07
summary(m.04c <- update( # make m.04c an update of
   m.03, .~.             # m.03, namely all of it (.~.), but then
   - REC_ANIM))          # remove this predictor
## # weights:  20 (12 variable)
## initial  value 2462.058785 
## iter  10 value 1817.346357
## final  value 1791.948604 
## converged
## Call:
## multinom(formula = TRANSITIVITY ~ REC_ACCESSIB + PAT_ACCESSIB + 
##     LEN_RECmPAT_LOG, data = d, na.action = na.exclude, model = TRUE)
## 
## Coefficients:
##     (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew LEN_RECmPAT_LOG
## d_p -0.05752161       0.1223786      -0.3639212       0.4382956
## p_a -0.11697191       0.7248194      -0.2874666       1.0718270
## p_p -0.44904006       0.6259620      -1.2894656       0.9707461
## 
## Std. Errors:
##     (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew LEN_RECmPAT_LOG
## d_p   0.1722890       0.1429678       0.1728674      0.05139989
## p_a   0.1884496       0.1499452       0.1922632      0.05805851
## p_p   0.2116643       0.2121530       0.2267210      0.07405764
## 
## Residual Deviance: 3583.897 
## AIC: 3607.897
   anova(m.03, m.04c, test="Chisq") # compare models with a LRT
## Likelihood ratio tests of Multinomial Models
## 
## Response: TRANSITIVITY
##                                                      Model Resid. df Resid. Dev
## 1            REC_ACCESSIB + PAT_ACCESSIB + LEN_RECmPAT_LOG      5316   3583.897
## 2 REC_ACCESSIB + PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG      5313   3535.734
##     Test    Df LR stat.      Pr(Chi)
## 1                                   
## 2 1 vs 2     3 48.16327 1.965748e-10
summary(m.04d <- update( # make m.04d an update of
   m.03, .~.             # m.03, namely all of it (.~.), but then
   - LEN_RECmPAT_LOG))   # remove this predictor
## # weights:  20 (12 variable)
## initial  value 2462.058785 
## iter  10 value 2038.920162
## final  value 2021.374467 
## converged
## Call:
## multinom(formula = TRANSITIVITY ~ REC_ACCESSIB + PAT_ACCESSIB + 
##     REC_ANIM, data = d, na.action = na.exclude, model = TRUE)
## 
## Coefficients:
##     (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p  -0.6785844       0.4221001      -0.4274774         0.1569561
## p_a  -1.3836692       1.5125323      -0.4844716         1.2427424
## p_p  -1.5400404       1.3393672      -1.4252069         0.7773680
## 
## Std. Errors:
##     (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p   0.1578792       0.1351589       0.1690794         0.1416616
## p_a   0.1705027       0.1289408       0.1735308         0.1283868
## p_p   0.2002157       0.1943860       0.2135040         0.1943153
## 
## Residual Deviance: 4042.749 
## AIC: 4066.749
   anova(m.03, m.04d, test="Chisq") # compare models with a LRT
## Likelihood ratio tests of Multinomial Models
## 
## Response: TRANSITIVITY
##                                                      Model Resid. df Resid. Dev
## 1                   REC_ACCESSIB + PAT_ACCESSIB + REC_ANIM      5316   4042.749
## 2 REC_ACCESSIB + PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG      5313   3535.734
##     Test    Df LR stat. Pr(Chi)
## 1                              
## 2 1 vs 2     3  507.015       0

No, we can’t delete any more predictors, we’re done with model selection:

summary(m.final <- m.03); rm(m.04a, m.04b, m.04c, m.04d); invisible(gc())
## Call:
## multinom(formula = TRANSITIVITY ~ REC_ACCESSIB + PAT_ACCESSIB + 
##     REC_ANIM + LEN_RECmPAT_LOG, data = d, na.action = na.exclude, 
##     model = TRUE)
## 
## Coefficients:
##     (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p -0.07582023       0.1192941      -0.3564488        0.04390158
## p_a -0.41438351       0.7085056      -0.4224211        0.95956732
## p_p -0.61758006       0.6157147      -1.3373812        0.55492462
##     LEN_RECmPAT_LOG
## d_p       0.4380379
## p_a       1.0425965
## p_p       0.9557104
## 
## Std. Errors:
##     (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p   0.1760039       0.1430298       0.1732770         0.1454120
## p_a   0.1950505       0.1519703       0.1942310         0.1492648
## p_p   0.2199506       0.2123822       0.2295769         0.2072032
##     LEN_RECmPAT_LOG
## d_p      0.05167523
## p_a      0.05876898
## p_p      0.07436727
## 
## Residual Deviance: 3535.734 
## AIC: 3565.734
(m.final.zscores <- summary(m.final)$coefficients / summary(m.final)$standard.errors)
##     (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p  -0.4307872        0.834051       -2.057104         0.3019117
## p_a  -2.1244935        4.662131       -2.174839         6.4286223
## p_p  -2.8078127        2.899088       -5.825417         2.6781657
##     LEN_RECmPAT_LOG
## d_p        8.476748
## p_a       17.740594
## p_p       12.851222
(m.final.pvalues <- 2*pnorm(abs(m.final.zscores), lower.tail=FALSE))
##     (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p 0.666623116    4.042523e-01    3.967623e-02      7.627194e-01
## p_a 0.033628902    3.129516e-06    2.964221e-02      1.287656e-10
## p_p 0.004987923    3.742502e-03    5.697018e-09      7.402659e-03
##     LEN_RECmPAT_LOG
## d_p    2.315757e-17
## p_a    2.037701e-70
## p_p    8.466167e-38

Let’s also compute the overall p-value for this model:

anova(m.00, m.final, # compare the null model to m.final
   test="Chisq")     # using a LRT
## Likelihood ratio tests of Multinomial Models
## 
## Response: TRANSITIVITY
##                                                      Model Resid. df Resid. Dev
## 1                                                        1      5325   4375.423
## 2 REC_ACCESSIB + PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG      5313   3535.734
##     Test    Df LR stat. Pr(Chi)
## 1                              
## 2 1 vs 2    12 839.6888       0
pchisq(              # compute the area under the chi-squared curve
   q=839.6888,       # for this chi-squared value
   df=12,            # at 12 df
   lower.tail=FALSE) # only using the right/upper tail/side
## [1] 5.073936e-172

On to model quality: We make ‘predictions’ …

d <- cbind(d,       # add to d the columns resulting from
   predict(         # predicting
      m.final,      # from m.final
      type="prob")) # predicted probabilities
# give those columns better names
names(d)[10:13] <- paste0( # rename those columns to the result of pasting
   "PREDS.",               # "PREDS."
   names(d)[10:13])        # in front of the existing names
d <- cbind(d,         # add to d a column(d
   PREDS.CAT=predict( # called PREDS.CAT, the predictions
      m.final,        # from m.final, namely
      type="class"))  # categorical predictions
head(d) # check the result
##   CASE TRANSITIVITY REC_LEN PAT_LEN  REC_ANIM     PAT_SEM REC_ACCESSIB
## 1    1          d_a       5       8   animate    abstract          new
## 2    2          d_a       2       3   animate    abstract          new
## 3    3          d_a       2       2 inanimate information        given
## 4    4          p_a       5       3 inanimate    abstract          new
## 5    5          d_a       2       3   animate    abstract          new
## 6    6          d_a       1       4   animate    concrete        given
##   PAT_ACCESSIB LEN_RECmPAT_LOG PREDS.d_a PREDS.d_p PREDS.p_a  PREDS.p_p
## 1          new      -0.6780719 0.4729958 0.2570037 0.2051686 0.06483201
## 2          new      -0.5849625 0.4558345 0.2579896 0.2178814 0.06829445
## 3          new       0.0000000 0.3272929 0.2219580 0.3700426 0.08070648
## 4          new       0.7369656 0.1261111 0.1330745 0.6243988 0.11641564
## 5          new      -0.5849625 0.4558345 0.2579896 0.2178814 0.06829445
## 6        given      -2.0000000 0.6460504 0.2493809 0.0530535 0.05151519
##   PREDS.CAT
## 1       d_a
## 2       d_a
## 3       p_a
## 4       p_a
## 5       d_a
## 6       d_a

… and then we evaluate them:

(c.m <- table(             # confusion matrix: cross-tabulate
   "OBS"  =d$TRANSITIVITY, # observed constructions in the rows
   "PREDS"=d$PREDS.CAT))   # predicted constructions in the columns
##      PREDS
## OBS   d_a d_p p_a p_p
##   d_a 736   0  74   0
##   d_p 252   1  96   0
##   p_a 132   1 343   0
##   p_p  58   0  83   0
c(tapply(d$TRANSITIVITY==d$PREDS.CAT, d$TRANSITIVITY, mean), # accuracies for ...
  "overall"=mean(d$TRANSITIVITY==d$PREDS.CAT)) # ... each level & overall
##        d_a        d_p        p_a        p_p    overall 
## 0.90864198 0.00286533 0.72058824 0.00000000 0.60810811

Plus, we need the model’s R2-values. We begin with the widely-used version of Nagelkerke’s R2, but then also add McFadden’s R2:

c("Nagelkerke's R2"=R2(m.final),
  "McFadden's R2"  =(m.00$deviance-m.final$deviance) / m.00$deviance)
## # weights:  8 (3 variable)
## initial  value 2462.058785 
## iter  10 value 2187.711475
## iter  10 value 2187.711469
## final  value 2187.711344 
## converged
## Nagelkerke's R2   McFadden's R2 
##       0.4117980       0.1919103

We can also add PREDS.NUM.obs, like we did for binary logistic regression, but it’s a bit more complex here so I’ll just show how it’s done without more details:

d$PREDS.NUM.obs <- predict(m.final, type="prob")[ # make this column the predictions
   matrix(c(                                # that are in the positions defined here
      seq(nrow(d)),                         # one prediction for each row
      as.numeric(d$TRANSITIVITY)), ncol=2)] # namely the column of the observed result

Why are we doing this? To see whether we can compute the deviance of the final model from its contributions to logloss – i.e., -log(d$PREDS.NUM.obs)) – again. And we can, the logic is the same as before:

sum(-log(d$PREDS.NUM.obs)) * 2
## [1] 3535.734
deviance(m.final)
## [1] 3535.734

Of course, if you paid attention to the book and worked through everything there, much of this you could have done with this:

summarize.multinom(m.final)
## # weights:  8 (3 variable)
## initial  value 2462.058785 
## iter  10 value 2187.711475
## iter  10 value 2187.711469
## final  value 2187.711344 
## converged
## $Coefficients
##                          Estimate  Std.Error       2.5%       97.5%          z
## (Intercept):d_p       -0.07582023 0.17600392 -0.4207816  0.26914111 -0.4307872
## (Intercept):p_a       -0.41438351 0.19505050 -0.7966755 -0.03209156 -2.1244935
## (Intercept):p_p       -0.61758006 0.21995059 -1.0486753 -0.18648482 -2.8078127
## REC_ACCESSIBnew:d_p    0.11929412 0.14302977 -0.1610391  0.39962731  0.8340510
## REC_ACCESSIBnew:p_a    0.70850560 0.15197033  0.4106492  1.00636196  4.6621312
## REC_ACCESSIBnew:p_p    0.61571469 0.21238222  0.1994532  1.03197619  2.8990877
## PAT_ACCESSIBnew:d_p   -0.35644878 0.17327699 -0.6960654 -0.01683212 -2.0571039
## PAT_ACCESSIBnew:p_a   -0.42242110 0.19423101 -0.8031069 -0.04173531 -2.1748386
## PAT_ACCESSIBnew:p_p   -1.33738125 0.22957691 -1.7873437 -0.88741876 -5.8254170
## REC_ANIMinanimate:d_p  0.04390158 0.14541201 -0.2411007  0.32890388  0.3019117
## REC_ANIMinanimate:p_a  0.95956732 0.14926485  0.6670136  1.25212105  6.4286223
## REC_ANIMinanimate:p_p  0.55492462 0.20720324  0.1488137  0.96103551  2.6781657
## LEN_RECmPAT_LOG:d_p    0.43803793 0.05167523  0.3367563  0.53931953  8.4767481
## LEN_RECmPAT_LOG:p_a    1.04259654 0.05876898  0.9274115  1.15778162 17.7405938
## LEN_RECmPAT_LOG:p_p    0.95571035 0.07436727  0.8099532  1.10146753 12.8512225
##                           p2tailed
## (Intercept):d_p       6.666231e-01
## (Intercept):p_a       3.362890e-02
## (Intercept):p_p       4.987923e-03
## REC_ACCESSIBnew:d_p   4.042523e-01
## REC_ACCESSIBnew:p_a   3.129516e-06
## REC_ACCESSIBnew:p_p   3.742502e-03
## PAT_ACCESSIBnew:d_p   3.967623e-02
## PAT_ACCESSIBnew:p_a   2.964221e-02
## PAT_ACCESSIBnew:p_p   5.697018e-09
## REC_ANIMinanimate:d_p 7.627194e-01
## REC_ANIMinanimate:p_a 1.287656e-10
## REC_ANIMinanimate:p_p 7.402659e-03
## LEN_RECmPAT_LOG:d_p   2.315757e-17
## LEN_RECmPAT_LOG:p_a   2.037701e-70
## LEN_RECmPAT_LOG:p_p   8.466167e-38
## 
## $`Confusion matrix`
##      PRED
## OBS   d_a d_p p_a p_p
##   d_a 736   0  74   0
##   d_p 252   1  96   0
##   p_a 132   1 343   0
##   p_p  58   0  83   0
## 
## $`Classification metrics`
##         Baseline     Acc. for d_a     Acc. for d_p     Acc. for p_a 
##     4.560811e-01     9.086420e-01     2.865330e-03     7.205882e-01 
##     Acc. for p_p Acc. for overall           pbinom          logloss 
##     0.000000e+00     6.081081e-01     5.990962e-38     9.954206e-01 
## 
## $`Deviances & R2s`
## Deviance of null model Deviance of this model            McFadden R2 
##           4375.4226877           3535.7339348              0.1919103 
##          Nagelkerke R2 
##              0.4117980 
## 
## $`Model significance test`
##          LR statistic                    Df               P-value 
##          8.396888e+02          1.200000e+01         5.074054e-172 
## Number of data points 
##          1.776000e+03

5 Visual interpretation

Now let’s visualize the results of the predictors based on the predictions, one at a time. We begin with REC_ACCESSIB:

ra.d <- data.frame(   # make ra.d a data frame of
   ra <- effect(      # the effect
      "REC_ACCESSIB", # of REC_ACCESSIB
      m.final))       # in m.final
ra.d[,1:5] # show just the predicted probabilities
##   REC_ACCESSIB  prob.d_a  prob.d_p  prob.p_a   prob.p_p
## 1        given 0.4995828 0.2663792 0.1736107 0.06042732
## 2          new 0.3951903 0.2374149 0.2789172 0.08847765
plot(ra,                       # plot the effect of REC_ACCESSIB
   type="probability",         # but plot predicted probabilities
   ylim=c(0, 1),               # w/ these y-axis limits
   grid=TRUE,                  # & w/ a grid
   multiline=TRUE,             # put all lines in one panel
   confint=list(style="auto"), # keep confidence intervals
   lattice=list(key.args=list( # make the
      columns=4,               # legend have 4 columns &
      cex.title=0)))           # no title

Next, PAT_ACCESSIB:

pa.d <- data.frame(   # make pa.d a data frame of
   pa <- effect(      # the effect
      "PAT_ACCESSIB", # of PAT_ACCESSIB
      m.final))       # in m.final
pa.d[,1:5] # show just the predicted probabilities
##   PAT_ACCESSIB  prob.d_a  prob.d_p  prob.p_a   prob.p_p
## 1        given 0.3442863 0.2595748 0.2316187 0.16452012
## 2          new 0.4774871 0.2520581 0.2105525 0.05990231
plot(pa,                       # plot the effect of PAT_ACCESSIB
   type="probability",         # but plot predicted probabilities
   ylim=c(0, 1),               # w/ these y-axis limits
   grid=TRUE,                  # & w/ a grid
   multiline=TRUE,             # put all lines in one panel
   confint=list(style="auto"), # keep confidence intervals
   lattice=list(key.args=list( # make the
      columns=4,               # legend have 4 columns &
      cex.title=0)))           # no title

REC_ANIM is next:

rca.d <- data.frame( # make rca.d a data frame of
   rca <- effect(     # the effect
      "REC_ANIM",     # of REC_ANIM
      m.final))       # in m.final
rca.d[,1:5] # show just the predicted probabilities
##    REC_ANIM  prob.d_a  prob.d_p  prob.p_a   prob.p_p
## 1   animate 0.4969793 0.2747134 0.1643431 0.06396429
## 2 inanimate 0.3752305 0.2167235 0.3239262 0.08411979
plot(rca,                      # plot the effect of REC_ANIM
   type="probability",         # but plot predicted probabilities
   ylim=c(0, 1),               # w/ these y-axis limits
   grid=TRUE,                  # & w/ a grid
   multiline=TRUE,             # put all lines in one panel
   confint=list(style="auto"), # keep confidence intervals
   lattice=list(key.args=list( # make the
      columns=4,               # legend have 4 columns &
      cex.title=0)))           # no title