Let’s load a data set for an example of a multinomial model:
rm(list=ls(all.names=TRUE))
library(car); library(effects); library(nnet)
source("helpers/202_R2.r"); source("helpers/202_summarize.multinom.r") # load small helper functions
summary(d <- read.delim( # summarize d, the result of loading
file="inputfiles/202_06_DATALT.csv", # this file
stringsAsFactors=TRUE)) # change categorical variables into factors
## CASE TRANSITIVITY REC_LEN PAT_LEN
## Min. : 1.0 d_a:810 Min. : 1.000 Min. : 1.000
## 1st Qu.: 474.8 d_p:349 1st Qu.: 1.000 1st Qu.: 2.000
## Median : 935.5 p_a:476 Median : 2.000 Median : 3.000
## Mean : 939.4 p_p:141 Mean : 3.128 Mean : 4.443
## 3rd Qu.:1412.2 3rd Qu.: 3.000 3rd Qu.: 6.000
## Max. :1871.0 Max. :44.000 Max. :33.000
## REC_ANIM PAT_SEM REC_ACCESSIB PAT_ACCESSIB
## animate :1106 abstract :1265 given:993 given: 312
## inanimate: 670 concrete : 335 new :783 new :1464
## information: 176
##
##
##
Here’s what the columns are/contain:
CASE
is a simple case/row number just so
that each case has a unique ID;TRANSITIVITY
is the response variable: it’s
the construction a speaker chose in a naturalistic setting:
REC_LEN
is a numeric predictor: it’s the
length of the recipient in characters (e.g., her has 3
characters);PAT_LEN
is a numeric predictor: it’s the
length of the patient in characters (e.g., a book has 6
characters (incl. the space));REC_ANIM
is a binary predictor: it’s whether
the recipient is animate (e.g., He gave the customer a
dirty look) or inanimate (e.g., He gave the table a
dirty look);PAT_SEM
is a ternary predictor: it’s whether
the patient was abstract (e.g., He gave the idea some
thought) or concrete (e.g., He gave her a book)
or information (e.g., He told her a story);REC_ACCESSIB
is a binary predictor: it’s
whether the recipient was given (i.e., inferrable from
information in the previous sentence) or new (i.e., maybe newly
introduced to the discourse);PAT_ACCESSIB
is a binary predictor: it’s
patient the recipient was given (i.e., inferrable from
information in the previous sentence) or new (i.e., maybe newly
introduced to the discourse).We want to know whether TRANSITIVITY
is predictable
from
REC_ANIM
, REC_ACCESSIB
,
PAT_ACCESSIB
;REC_ANIM
and a short-before-long
effect.Let’s already compute the baselines for what will be the response
variable, TRANSITIVITY
:
(baselines <- c(
"baseline 1"=max( # make baselines[1] the highest
prop.table( # proportion in the
table(d$TRANSITIVITY))), # frequency table of the response
"baseline 2"=sum( # make baselines[2] the sum of the
prop.table( # proportions in the
table(d$TRANSITIVITY))^2))) # frequency table of the response squared
## baseline 1 baseline 2
## 0.4560811 0.3247625
Let’s also compute the deviance of the null model:
deviance(m.00 <- multinom(TRANSITIVITY ~ 1, data=d, na.action=na.exclude))
## # weights: 8 (3 variable)
## initial value 2462.058785
## iter 10 value 2187.711475
## iter 10 value 2187.711469
## final value 2187.711344
## converged
## [1] 4375.423
Some exploration of the relevant variables and we prepare our length-difference predictor:
# the predictor(s)/response on its/their own
hist(d$REC_LEN)
hist(d$PAT_LEN)
hist(d$LEN_RECmPAT_LOG <- log2(d$REC_LEN)-log2(d$PAT_LEN), main="")
table(d$REC_ANIM)
##
## animate inanimate
## 1106 670
table(d$REC_ACCESSIB)
##
## given new
## 993 783
table(d$PAT_ACCESSIB)
##
## given new
## 312 1464
table(d$REC_ACCESSIB, d$PAT_ACCESSIB)
##
## given new
## given 205 788
## new 107 676
# the predictor(s) w/ the response
spineplot(d$TRANSITIVITY ~ d$LEN_RECmPAT_LOG)
table(d$REC_ANIM, d$TRANSITIVITY)
##
## d_a d_p p_a p_p
## animate 588 242 197 79
## inanimate 222 107 279 62
ftable(d$REC_ACCESSIB, d$PAT_ACCESSIB, d$TRANSITIVITY)
## d_a d_p p_a p_p
##
## given given 98 48 39 20
## new 469 165 117 37
## new given 21 21 36 29
## new 222 115 284 55
Let’s fit our initial regression model:
summary(m.01 <- multinom( # make/summarize the multinomial model m.01:
TRANSITIVITY ~ 1 + # TRANSITIVITY ~ an overall intercept (1)
REC_ACCESSIB + PAT_ACCESSIB + # the accessibility predictors & ...
REC_ACCESSIB:PAT_ACCESSIB + # their interaction
REC_ANIM + LEN_RECmPAT_LOG + # the rec animacy & length diff preds & ...
REC_ANIM:LEN_RECmPAT_LOG, # their interaction
data=d, # those vars are in d
model=TRUE, # & save the model frame (optional)
na.action=na.exclude)) # skip cases with NA/missing data
## # weights: 32 (21 variable)
## initial value 2462.058785
## iter 10 value 1814.322650
## iter 20 value 1767.454974
## final value 1762.521921
## converged
## Call:
## multinom(formula = TRANSITIVITY ~ 1 + REC_ACCESSIB + PAT_ACCESSIB +
## REC_ACCESSIB:PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG +
## REC_ANIM:LEN_RECmPAT_LOG, data = d, na.action = na.exclude,
## model = TRUE)
##
## Coefficients:
## (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p -0.1210011 0.4906650 -0.2429275 -0.06653502
## p_a -0.3860966 0.8360726 -0.4249039 0.88620903
## p_p -0.8663412 1.3627552 -0.8513750 0.41282684
## LEN_RECmPAT_LOG REC_ACCESSIBnew:PAT_ACCESSIBnew
## d_p 0.4666473 -0.4347810
## p_a 1.1615820 -0.1403867
## p_p 1.0006701 -1.0594962
## REC_ANIMinanimate:LEN_RECmPAT_LOG
## d_p -0.07724824
## p_a -0.24365931
## p_p -0.10326591
##
## Std. Errors:
## (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p 0.1989219 0.3658536 0.2039208 0.1787075
## p_a 0.2291365 0.3830872 0.2485331 0.1642522
## p_p 0.2751310 0.4128681 0.3179271 0.2245364
## LEN_RECmPAT_LOG REC_ACCESSIBnew:PAT_ACCESSIBnew
## d_p 0.06357114 0.3951689
## p_a 0.08302891 0.4162775
## p_p 0.10013924 0.4785921
## REC_ANIMinanimate:LEN_RECmPAT_LOG
## d_p 0.1063059
## p_a 0.1164769
## p_p 0.1458351
##
## Residual Deviance: 3525.044
## AIC: 3567.044
Let’s also compute the z-scores for these coefficient values (however uninterpretable those are):
(m.01.zscores <- summary(m.01)$coefficients / summary(m.01)$standard.errors)
## (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p -0.6082847 1.341151 -1.191284 -0.3723124
## p_a -1.6850072 2.182460 -1.709647 5.3954176
## p_p -3.1488316 3.300703 -2.677894 1.8385745
## LEN_RECmPAT_LOG REC_ACCESSIBnew:PAT_ACCESSIBnew
## d_p 7.340553 -1.100241
## p_a 13.990092 -0.337243
## p_p 9.992787 -2.213777
## REC_ANIMinanimate:LEN_RECmPAT_LOG
## d_p -0.7266601
## p_a -2.0919113
## p_p -0.7081005
And from that we compute the p-values:
(m.01.pvalues <- 2*pnorm(abs(m.01.zscores), lower.tail=FALSE))
## (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p 0.542998690 0.1798713698 0.233542271 7.096603e-01
## p_a 0.091987185 0.0290755883 0.087331199 6.836435e-08
## p_p 0.001639247 0.0009644279 0.007408676 6.597780e-02
## LEN_RECmPAT_LOG REC_ACCESSIBnew:PAT_ACCESSIBnew
## d_p 2.127136e-13 0.2712271
## p_a 1.791808e-44 0.7359337
## p_p 1.639074e-23 0.0268441
## REC_ANIMinanimate:LEN_RECmPAT_LOG
## d_p 0.46743422
## p_a 0.03644645
## p_p 0.47888286
So there are definitely significant results. Since we can’t use
drop1
, let’s do anova
two times, once for each
interaction:
summary(m.02a <- update( # make m.02a an update of
m.01, .~. # m.01, namely all of it (.~.), but then
- REC_ACCESSIB:PAT_ACCESSIB)) # remove this interaction
## # weights: 28 (18 variable)
## initial value 2462.058785
## iter 10 value 1801.290105
## iter 20 value 1767.457695
## final value 1765.590290
## converged
## Call:
## multinom(formula = TRANSITIVITY ~ REC_ACCESSIB + PAT_ACCESSIB +
## REC_ANIM + LEN_RECmPAT_LOG + REC_ANIM:LEN_RECmPAT_LOG, data = d,
## na.action = na.exclude, model = TRUE)
##
## Coefficients:
## (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p -0.03493706 0.1158438 -0.3509796 -0.07198044
## p_a -0.40506728 0.7027753 -0.3967182 0.88305548
## p_p -0.55675057 0.6115019 -1.3339509 0.40044051
## LEN_RECmPAT_LOG REC_ANIMinanimate:LEN_RECmPAT_LOG
## d_p 0.4661081 -0.08120209
## p_a 1.1589835 -0.24308442
## p_p 1.0015156 -0.11875462
##
## Std. Errors:
## (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p 0.1803328 0.1431314 0.1737402 0.1786000
## p_a 0.1995348 0.1519139 0.1959591 0.1642183
## p_p 0.2231066 0.2124143 0.2304589 0.2240036
## LEN_RECmPAT_LOG REC_ANIMinanimate:LEN_RECmPAT_LOG
## d_p 0.06351107 0.1061884
## p_a 0.08292303 0.1163086
## p_p 0.10006980 0.1455638
##
## Residual Deviance: 3531.181
## AIC: 3567.181
anova(m.01, m.02a, # compare m.01 to m.02a
test="Chisq") # using a LRT
## Likelihood ratio tests of Multinomial Models
##
## Response: TRANSITIVITY
## Model
## 1 REC_ACCESSIB + PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG + REC_ANIM:LEN_RECmPAT_LOG
## 2 1 + REC_ACCESSIB + PAT_ACCESSIB + REC_ACCESSIB:PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG + REC_ANIM:LEN_RECmPAT_LOG
## Resid. df Resid. Dev Test Df LR stat. Pr(Chi)
## 1 5310 3531.181
## 2 5307 3525.044 1 vs 2 3 6.136739 0.1051438
summary(m.02b <- update( # make m.02b an update of
m.01, .~. # m.01, namely all of it (.~.), but then
- REC_ANIM:LEN_RECmPAT_LOG)) # remove this interaction
## # weights: 28 (18 variable)
## initial value 2462.058785
## iter 10 value 1790.360208
## iter 20 value 1764.915297
## final value 1764.868000
## converged
## Call:
## multinom(formula = TRANSITIVITY ~ REC_ACCESSIB + PAT_ACCESSIB +
## REC_ANIM + LEN_RECmPAT_LOG + REC_ACCESSIB:PAT_ACCESSIB, data = d,
## na.action = na.exclude, model = TRUE)
##
## Coefficients:
## (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p -0.1614717 0.4998229 -0.2469363 0.04513013
## p_a -0.4024288 0.8670756 -0.4384312 0.95737277
## p_p -0.9283095 1.3712701 -0.8553391 0.56971191
## LEN_RECmPAT_LOG REC_ACCESSIBnew:PAT_ACCESSIBnew
## d_p 0.4398474 -0.4412384
## p_a 1.0442234 -0.1707976
## p_p 0.9617214 -1.0607971
##
## Std. Errors:
## (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p 0.1945742 0.3655852 0.2036241 0.1454223
## p_a 0.2249132 0.3805211 0.2468435 0.1492921
## p_p 0.2728389 0.4126677 0.3177536 0.2076899
## LEN_RECmPAT_LOG REC_ACCESSIBnew:PAT_ACCESSIBnew
## d_p 0.05175417 0.3949123
## p_a 0.05886288 0.4139378
## p_p 0.07451965 0.4781441
##
## Residual Deviance: 3529.736
## AIC: 3565.736
anova(m.01, m.02b, # compare m.01 to m.02b
test="Chisq") # using a LRT
## Likelihood ratio tests of Multinomial Models
##
## Response: TRANSITIVITY
## Model
## 1 REC_ACCESSIB + PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG + REC_ACCESSIB:PAT_ACCESSIB
## 2 1 + REC_ACCESSIB + PAT_ACCESSIB + REC_ACCESSIB:PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG + REC_ANIM:LEN_RECmPAT_LOG
## Resid. df Resid. Dev Test Df LR stat. Pr(Chi)
## 1 5310 3529.736
## 2 5307 3525.044 1 vs 2 3 4.692158 0.1957774
Both interactions are ns so we first delete the one with the higher
p-value, i.e. we delete REC_ANIM:LEN_RECmPAT_LOG
and go with m.02b
. But can we also delete the other
interaction?
summary(m.03 <- update( # make m.03 an update of
m.02b, .~. # m.02b, namely all of it (.~.), but then
- REC_ACCESSIB:PAT_ACCESSIB)) # remove this interaction
## # weights: 24 (15 variable)
## initial value 2462.058785
## iter 10 value 1794.993382
## iter 20 value 1767.868111
## final value 1767.866967
## converged
## Call:
## multinom(formula = TRANSITIVITY ~ REC_ACCESSIB + PAT_ACCESSIB +
## REC_ANIM + LEN_RECmPAT_LOG, data = d, na.action = na.exclude,
## model = TRUE)
##
## Coefficients:
## (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p -0.07582023 0.1192941 -0.3564488 0.04390158
## p_a -0.41438351 0.7085056 -0.4224211 0.95956732
## p_p -0.61758006 0.6157147 -1.3373812 0.55492462
## LEN_RECmPAT_LOG
## d_p 0.4380379
## p_a 1.0425965
## p_p 0.9557104
##
## Std. Errors:
## (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p 0.1760039 0.1430298 0.1732770 0.1454120
## p_a 0.1950505 0.1519703 0.1942310 0.1492648
## p_p 0.2199506 0.2123822 0.2295769 0.2072032
## LEN_RECmPAT_LOG
## d_p 0.05167523
## p_a 0.05876898
## p_p 0.07436727
##
## Residual Deviance: 3535.734
## AIC: 3565.734
anova(m.02b, m.03, # compare m.02b to m.03
test="Chisq") # using a LRT
## Likelihood ratio tests of Multinomial Models
##
## Response: TRANSITIVITY
## Model
## 1 REC_ACCESSIB + PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG
## 2 REC_ACCESSIB + PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG + REC_ACCESSIB:PAT_ACCESSIB
## Resid. df Resid. Dev Test Df LR stat. Pr(Chi)
## 1 5313 3535.734
## 2 5310 3529.736 1 vs 2 3 5.997935 0.1117107
Yes, we can so we delete REC_ACCESSIB:PAT_ACCESSIB
and
go with m.03
. But can we also delete main effects?
summary(m.04a <- update( # make m.04a an update of
m.03, .~. # m.03, namely all of it (.~.), but then
- REC_ACCESSIB)) # remove this predictor
## # weights: 20 (12 variable)
## initial value 2462.058785
## iter 10 value 1805.888093
## final value 1780.292471
## converged
## Call:
## multinom(formula = TRANSITIVITY ~ PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG,
## data = d, na.action = na.exclude, model = TRUE)
##
## Coefficients:
## (Intercept) PAT_ACCESSIBnew REC_ANIMinanimate LEN_RECmPAT_LOG
## d_p -0.03385164 -0.3446204 0.04927733 0.4487349
## p_a -0.13661249 -0.3158986 0.97063967 1.1111132
## p_p -0.38134000 -1.2462871 0.56502687 1.0158562
##
## Std. Errors:
## (Intercept) PAT_ACCESSIBnew REC_ANIMinanimate LEN_RECmPAT_LOG
## d_p 0.1694037 0.1723732 0.1453316 0.05021125
## p_a 0.1843444 0.1917693 0.1480772 0.05767235
## p_p 0.2014785 0.2262448 0.2066145 0.07242679
##
## Residual Deviance: 3560.585
## AIC: 3584.585
anova(m.03, m.04a, test="Chisq") # compare models with a LRT
## Likelihood ratio tests of Multinomial Models
##
## Response: TRANSITIVITY
## Model Resid. df Resid. Dev
## 1 PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG 5316 3560.585
## 2 REC_ACCESSIB + PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG 5313 3535.734
## Test Df LR stat. Pr(Chi)
## 1
## 2 1 vs 2 3 24.85101 1.658861e-05
summary(m.04b <- update( # make m.04b an update of
m.03, .~. # m.03, namely all of it (.~.), but then
- PAT_ACCESSIB)) # remove this predictor
## # weights: 20 (12 variable)
## initial value 2462.058785
## iter 10 value 1798.965851
## final value 1784.182896
## converged
## Call:
## multinom(formula = TRANSITIVITY ~ REC_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG,
## data = d, na.action = na.exclude, model = TRUE)
##
## Coefficients:
## (Intercept) REC_ACCESSIBnew REC_ANIMinanimate LEN_RECmPAT_LOG
## d_p -0.3514652 0.09136321 0.02985044 0.4429333
## p_a -0.7382389 0.67129889 0.94339105 1.0495865
## p_p -1.4885262 0.46162804 0.40302393 0.9614303
##
## Std. Errors:
## (Intercept) REC_ACCESSIBnew REC_ANIMinanimate LEN_RECmPAT_LOG
## d_p 0.1155751 0.1420761 0.1449772 0.05162030
## p_a 0.1308031 0.1505184 0.1481442 0.05877521
## p_p 0.1714852 0.2072247 0.2029757 0.07349044
##
## Residual Deviance: 3568.366
## AIC: 3592.366
anova(m.03, m.04b, test="Chisq") # compare models with a LRT
## Likelihood ratio tests of Multinomial Models
##
## Response: TRANSITIVITY
## Model Resid. df Resid. Dev
## 1 REC_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG 5316 3568.366
## 2 REC_ACCESSIB + PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG 5313 3535.734
## Test Df LR stat. Pr(Chi)
## 1
## 2 1 vs 2 3 32.63186 3.851131e-07
summary(m.04c <- update( # make m.04c an update of
m.03, .~. # m.03, namely all of it (.~.), but then
- REC_ANIM)) # remove this predictor
## # weights: 20 (12 variable)
## initial value 2462.058785
## iter 10 value 1817.346357
## final value 1791.948604
## converged
## Call:
## multinom(formula = TRANSITIVITY ~ REC_ACCESSIB + PAT_ACCESSIB +
## LEN_RECmPAT_LOG, data = d, na.action = na.exclude, model = TRUE)
##
## Coefficients:
## (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew LEN_RECmPAT_LOG
## d_p -0.05752161 0.1223786 -0.3639212 0.4382956
## p_a -0.11697191 0.7248194 -0.2874666 1.0718270
## p_p -0.44904006 0.6259620 -1.2894656 0.9707461
##
## Std. Errors:
## (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew LEN_RECmPAT_LOG
## d_p 0.1722890 0.1429678 0.1728674 0.05139989
## p_a 0.1884496 0.1499452 0.1922632 0.05805851
## p_p 0.2116643 0.2121530 0.2267210 0.07405764
##
## Residual Deviance: 3583.897
## AIC: 3607.897
anova(m.03, m.04c, test="Chisq") # compare models with a LRT
## Likelihood ratio tests of Multinomial Models
##
## Response: TRANSITIVITY
## Model Resid. df Resid. Dev
## 1 REC_ACCESSIB + PAT_ACCESSIB + LEN_RECmPAT_LOG 5316 3583.897
## 2 REC_ACCESSIB + PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG 5313 3535.734
## Test Df LR stat. Pr(Chi)
## 1
## 2 1 vs 2 3 48.16327 1.965748e-10
summary(m.04d <- update( # make m.04d an update of
m.03, .~. # m.03, namely all of it (.~.), but then
- LEN_RECmPAT_LOG)) # remove this predictor
## # weights: 20 (12 variable)
## initial value 2462.058785
## iter 10 value 2038.920162
## final value 2021.374467
## converged
## Call:
## multinom(formula = TRANSITIVITY ~ REC_ACCESSIB + PAT_ACCESSIB +
## REC_ANIM, data = d, na.action = na.exclude, model = TRUE)
##
## Coefficients:
## (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p -0.6785844 0.4221001 -0.4274774 0.1569561
## p_a -1.3836692 1.5125323 -0.4844716 1.2427424
## p_p -1.5400404 1.3393672 -1.4252069 0.7773680
##
## Std. Errors:
## (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p 0.1578792 0.1351589 0.1690794 0.1416616
## p_a 0.1705027 0.1289408 0.1735308 0.1283868
## p_p 0.2002157 0.1943860 0.2135040 0.1943153
##
## Residual Deviance: 4042.749
## AIC: 4066.749
anova(m.03, m.04d, test="Chisq") # compare models with a LRT
## Likelihood ratio tests of Multinomial Models
##
## Response: TRANSITIVITY
## Model Resid. df Resid. Dev
## 1 REC_ACCESSIB + PAT_ACCESSIB + REC_ANIM 5316 4042.749
## 2 REC_ACCESSIB + PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG 5313 3535.734
## Test Df LR stat. Pr(Chi)
## 1
## 2 1 vs 2 3 507.015 0
No, we can’t delete any more predictors, we’re done with model selection:
summary(m.final <- m.03); rm(m.04a, m.04b, m.04c, m.04d); invisible(gc())
## Call:
## multinom(formula = TRANSITIVITY ~ REC_ACCESSIB + PAT_ACCESSIB +
## REC_ANIM + LEN_RECmPAT_LOG, data = d, na.action = na.exclude,
## model = TRUE)
##
## Coefficients:
## (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p -0.07582023 0.1192941 -0.3564488 0.04390158
## p_a -0.41438351 0.7085056 -0.4224211 0.95956732
## p_p -0.61758006 0.6157147 -1.3373812 0.55492462
## LEN_RECmPAT_LOG
## d_p 0.4380379
## p_a 1.0425965
## p_p 0.9557104
##
## Std. Errors:
## (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p 0.1760039 0.1430298 0.1732770 0.1454120
## p_a 0.1950505 0.1519703 0.1942310 0.1492648
## p_p 0.2199506 0.2123822 0.2295769 0.2072032
## LEN_RECmPAT_LOG
## d_p 0.05167523
## p_a 0.05876898
## p_p 0.07436727
##
## Residual Deviance: 3535.734
## AIC: 3565.734
(m.final.zscores <- summary(m.final)$coefficients / summary(m.final)$standard.errors)
## (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p -0.4307872 0.834051 -2.057104 0.3019117
## p_a -2.1244935 4.662131 -2.174839 6.4286223
## p_p -2.8078127 2.899088 -5.825417 2.6781657
## LEN_RECmPAT_LOG
## d_p 8.476748
## p_a 17.740594
## p_p 12.851222
(m.final.pvalues <- 2*pnorm(abs(m.final.zscores), lower.tail=FALSE))
## (Intercept) REC_ACCESSIBnew PAT_ACCESSIBnew REC_ANIMinanimate
## d_p 0.666623116 4.042523e-01 3.967623e-02 7.627194e-01
## p_a 0.033628902 3.129516e-06 2.964221e-02 1.287656e-10
## p_p 0.004987923 3.742502e-03 5.697018e-09 7.402659e-03
## LEN_RECmPAT_LOG
## d_p 2.315757e-17
## p_a 2.037701e-70
## p_p 8.466167e-38
Let’s also compute the overall p-value for this model:
anova(m.00, m.final, # compare the null model to m.final
test="Chisq") # using a LRT
## Likelihood ratio tests of Multinomial Models
##
## Response: TRANSITIVITY
## Model Resid. df Resid. Dev
## 1 1 5325 4375.423
## 2 REC_ACCESSIB + PAT_ACCESSIB + REC_ANIM + LEN_RECmPAT_LOG 5313 3535.734
## Test Df LR stat. Pr(Chi)
## 1
## 2 1 vs 2 12 839.6888 0
pchisq( # compute the area under the chi-squared curve
q=839.6888, # for this chi-squared value
df=12, # at 12 df
lower.tail=FALSE) # only using the right/upper tail/side
## [1] 5.073936e-172
On to model quality: We make ‘predictions’ …
d <- cbind(d, # add to d the columns resulting from
predict( # predicting
m.final, # from m.final
type="prob")) # predicted probabilities
# give those columns better names
names(d)[10:13] <- paste0( # rename those columns to the result of pasting
"PREDS.", # "PREDS."
names(d)[10:13]) # in front of the existing names
d <- cbind(d, # add to d a column(d
PREDS.CAT=predict( # called PREDS.CAT, the predictions
m.final, # from m.final, namely
type="class")) # categorical predictions
head(d) # check the result
## CASE TRANSITIVITY REC_LEN PAT_LEN REC_ANIM PAT_SEM REC_ACCESSIB
## 1 1 d_a 5 8 animate abstract new
## 2 2 d_a 2 3 animate abstract new
## 3 3 d_a 2 2 inanimate information given
## 4 4 p_a 5 3 inanimate abstract new
## 5 5 d_a 2 3 animate abstract new
## 6 6 d_a 1 4 animate concrete given
## PAT_ACCESSIB LEN_RECmPAT_LOG PREDS.d_a PREDS.d_p PREDS.p_a PREDS.p_p
## 1 new -0.6780719 0.4729958 0.2570037 0.2051686 0.06483201
## 2 new -0.5849625 0.4558345 0.2579896 0.2178814 0.06829445
## 3 new 0.0000000 0.3272929 0.2219580 0.3700426 0.08070648
## 4 new 0.7369656 0.1261111 0.1330745 0.6243988 0.11641564
## 5 new -0.5849625 0.4558345 0.2579896 0.2178814 0.06829445
## 6 given -2.0000000 0.6460504 0.2493809 0.0530535 0.05151519
## PREDS.CAT
## 1 d_a
## 2 d_a
## 3 p_a
## 4 p_a
## 5 d_a
## 6 d_a
… and then we evaluate them:
(c.m <- table( # confusion matrix: cross-tabulate
"OBS" =d$TRANSITIVITY, # observed constructions in the rows
"PREDS"=d$PREDS.CAT)) # predicted constructions in the columns
## PREDS
## OBS d_a d_p p_a p_p
## d_a 736 0 74 0
## d_p 252 1 96 0
## p_a 132 1 343 0
## p_p 58 0 83 0
c(tapply(d$TRANSITIVITY==d$PREDS.CAT, d$TRANSITIVITY, mean), # accuracies for ...
"overall"=mean(d$TRANSITIVITY==d$PREDS.CAT)) # ... each level & overall
## d_a d_p p_a p_p overall
## 0.90864198 0.00286533 0.72058824 0.00000000 0.60810811
Plus, we need the model’s R2-values. We begin with the widely-used version of Nagelkerke’s R2, but then also add McFadden’s R2:
c("Nagelkerke's R2"=R2(m.final),
"McFadden's R2" =(m.00$deviance-m.final$deviance) / m.00$deviance)
## # weights: 8 (3 variable)
## initial value 2462.058785
## iter 10 value 2187.711475
## iter 10 value 2187.711469
## final value 2187.711344
## converged
## Nagelkerke's R2 McFadden's R2
## 0.4117980 0.1919103
We can also add PREDS.NUM.obs
, like we did for binary
logistic regression, but it’s a bit more complex here so I’ll just show
how it’s done without more details:
d$PREDS.NUM.obs <- predict(m.final, type="prob")[ # make this column the predictions
matrix(c( # that are in the positions defined here
seq(nrow(d)), # one prediction for each row
as.numeric(d$TRANSITIVITY)), ncol=2)] # namely the column of the observed result
Why are we doing this? To see whether we can compute the deviance of
the final model from its contributions to logloss – i.e.,
-log(d$PREDS.NUM.obs))
– again. And we can, the logic is
the same as before:
sum(-log(d$PREDS.NUM.obs)) * 2
## [1] 3535.734
deviance(m.final)
## [1] 3535.734
Of course, if you paid attention to the book and worked through everything there, much of this you could have done with this:
summarize.multinom(m.final)
## # weights: 8 (3 variable)
## initial value 2462.058785
## iter 10 value 2187.711475
## iter 10 value 2187.711469
## final value 2187.711344
## converged
## $Coefficients
## Estimate Std.Error 2.5% 97.5% z
## (Intercept):d_p -0.07582023 0.17600392 -0.4207816 0.26914111 -0.4307872
## (Intercept):p_a -0.41438351 0.19505050 -0.7966755 -0.03209156 -2.1244935
## (Intercept):p_p -0.61758006 0.21995059 -1.0486753 -0.18648482 -2.8078127
## REC_ACCESSIBnew:d_p 0.11929412 0.14302977 -0.1610391 0.39962731 0.8340510
## REC_ACCESSIBnew:p_a 0.70850560 0.15197033 0.4106492 1.00636196 4.6621312
## REC_ACCESSIBnew:p_p 0.61571469 0.21238222 0.1994532 1.03197619 2.8990877
## PAT_ACCESSIBnew:d_p -0.35644878 0.17327699 -0.6960654 -0.01683212 -2.0571039
## PAT_ACCESSIBnew:p_a -0.42242110 0.19423101 -0.8031069 -0.04173531 -2.1748386
## PAT_ACCESSIBnew:p_p -1.33738125 0.22957691 -1.7873437 -0.88741876 -5.8254170
## REC_ANIMinanimate:d_p 0.04390158 0.14541201 -0.2411007 0.32890388 0.3019117
## REC_ANIMinanimate:p_a 0.95956732 0.14926485 0.6670136 1.25212105 6.4286223
## REC_ANIMinanimate:p_p 0.55492462 0.20720324 0.1488137 0.96103551 2.6781657
## LEN_RECmPAT_LOG:d_p 0.43803793 0.05167523 0.3367563 0.53931953 8.4767481
## LEN_RECmPAT_LOG:p_a 1.04259654 0.05876898 0.9274115 1.15778162 17.7405938
## LEN_RECmPAT_LOG:p_p 0.95571035 0.07436727 0.8099532 1.10146753 12.8512225
## p2tailed
## (Intercept):d_p 6.666231e-01
## (Intercept):p_a 3.362890e-02
## (Intercept):p_p 4.987923e-03
## REC_ACCESSIBnew:d_p 4.042523e-01
## REC_ACCESSIBnew:p_a 3.129516e-06
## REC_ACCESSIBnew:p_p 3.742502e-03
## PAT_ACCESSIBnew:d_p 3.967623e-02
## PAT_ACCESSIBnew:p_a 2.964221e-02
## PAT_ACCESSIBnew:p_p 5.697018e-09
## REC_ANIMinanimate:d_p 7.627194e-01
## REC_ANIMinanimate:p_a 1.287656e-10
## REC_ANIMinanimate:p_p 7.402659e-03
## LEN_RECmPAT_LOG:d_p 2.315757e-17
## LEN_RECmPAT_LOG:p_a 2.037701e-70
## LEN_RECmPAT_LOG:p_p 8.466167e-38
##
## $`Confusion matrix`
## PRED
## OBS d_a d_p p_a p_p
## d_a 736 0 74 0
## d_p 252 1 96 0
## p_a 132 1 343 0
## p_p 58 0 83 0
##
## $`Classification metrics`
## Baseline Acc. for d_a Acc. for d_p Acc. for p_a
## 4.560811e-01 9.086420e-01 2.865330e-03 7.205882e-01
## Acc. for p_p Acc. for overall pbinom logloss
## 0.000000e+00 6.081081e-01 5.990962e-38 9.954206e-01
##
## $`Deviances & R2s`
## Deviance of null model Deviance of this model McFadden R2
## 4375.4226877 3535.7339348 0.1919103
## Nagelkerke R2
## 0.4117980
##
## $`Model significance test`
## LR statistic Df P-value
## 8.396888e+02 1.200000e+01 5.074054e-172
## Number of data points
## 1.776000e+03
Now let’s visualize the results of the predictors based on the
predictions, one at a time. We begin with REC_ACCESSIB
:
ra.d <- data.frame( # make ra.d a data frame of
ra <- effect( # the effect
"REC_ACCESSIB", # of REC_ACCESSIB
m.final)) # in m.final
ra.d[,1:5] # show just the predicted probabilities
## REC_ACCESSIB prob.d_a prob.d_p prob.p_a prob.p_p
## 1 given 0.4995828 0.2663792 0.1736107 0.06042732
## 2 new 0.3951903 0.2374149 0.2789172 0.08847765
plot(ra, # plot the effect of REC_ACCESSIB
type="probability", # but plot predicted probabilities
ylim=c(0, 1), # w/ these y-axis limits
grid=TRUE, # & w/ a grid
multiline=TRUE, # put all lines in one panel
confint=list(style="auto"), # keep confidence intervals
lattice=list(key.args=list( # make the
columns=4, # legend have 4 columns &
cex.title=0))) # no title
Next, PAT_ACCESSIB
:
pa.d <- data.frame( # make pa.d a data frame of
pa <- effect( # the effect
"PAT_ACCESSIB", # of PAT_ACCESSIB
m.final)) # in m.final
pa.d[,1:5] # show just the predicted probabilities
## PAT_ACCESSIB prob.d_a prob.d_p prob.p_a prob.p_p
## 1 given 0.3442863 0.2595748 0.2316187 0.16452012
## 2 new 0.4774871 0.2520581 0.2105525 0.05990231
plot(pa, # plot the effect of PAT_ACCESSIB
type="probability", # but plot predicted probabilities
ylim=c(0, 1), # w/ these y-axis limits
grid=TRUE, # & w/ a grid
multiline=TRUE, # put all lines in one panel
confint=list(style="auto"), # keep confidence intervals
lattice=list(key.args=list( # make the
columns=4, # legend have 4 columns &
cex.title=0))) # no title
REC_ANIM
is next:
rca.d <- data.frame( # make rca.d a data frame of
rca <- effect( # the effect
"REC_ANIM", # of REC_ANIM
m.final)) # in m.final
rca.d[,1:5] # show just the predicted probabilities
## REC_ANIM prob.d_a prob.d_p prob.p_a prob.p_p
## 1 animate 0.4969793 0.2747134 0.1643431 0.06396429
## 2 inanimate 0.3752305 0.2167235 0.3239262 0.08411979
plot(rca, # plot the effect of REC_ANIM
type="probability", # but plot predicted probabilities
ylim=c(0, 1), # w/ these y-axis limits
grid=TRUE, # & w/ a grid
multiline=TRUE, # put all lines in one panel
confint=list(style="auto"), # keep confidence intervals
lattice=list(key.args=list( # make the
columns=4, # legend have 4 columns &
cex.title=0))) # no title