rm(list=ls(all=TRUE))
set.seed(060977) # set a random-number generator seed to make the (pseudo-)random parts of the analyses replicable



# Section 2.1: A small artificial data set ########################################################
## a first (made-up) small data set
P1s <- factor(rep(c("b","a","b","a"), c(1,7,9,3)) ) # predictor 1 small
P2s <- factor(rep(c("e","f","e"), c(6,10,4)))       # predictor 2 small
P3s <- factor(rep(c("m","n","m","n"), c(6,4,6,4)))  # predictor 3 small
DVs <- factor(rep(c("x","y"), c(10, 10)))           # response small
summary((D1 <- data.frame(P1s, P2s, P3s, DVs)))
##  P1s    P2s    P3s    DVs
##  a:10   e:10   m:12   x:10
##  b:10   f:10   n: 8   y:10
ftable(P1s, P2s, P3s, DVs)
##             DVs x y
## P1s P2s P3s
## a   e   m       5 0
##         n       0 3
##     f   m       0 0
##         n       2 0
## b   e   m       1 0
##         n       0 1
##     f   m       0 6
##         n       2 0
## if you try to predict DVs from P1s (a -> x, b -> y), you get 70% right
table(P1s, DVs)
##    DVs
## P1s x y
##   a 7 3
##   b 3 7
## if you try to predict DVs from P2s (e -> x, f -> y), you get 60% right
table(P2s, DVs)
##    DVs
## P2s x y
##   e 6 4
##   f 4 6
## if you try to predict DVs from P3s (m -> x, n -> y), you get 50% right, i.e. chance baseline
table(P3s, DVs)
##    DVs
## P3s x y
##   m 6 6
##   n 4 4
## but note: combining the two weaker predictors (P2s and P3s) gives you perfect accuracy:
ftable(P2s, P3s, DVs)
##         DVs x y
## P2s P3s
## e   m       6 0
##     n       0 4
## f   m       0 6
##     n       4 0
table(P2s:P3s, DVs)
##      DVs
##       x y
##   e:m 6 0
##   e:n 0 4
##   f:m 0 6
##   f:n 4 0
## e+m -> x   and   e+n -> y
## f+m -> y   and   f+n -> x
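## a quick sanity check of the four accuracies quoted above (a small helper sketch;
## acc() is not part of the original analyses): within each level (combination) of
## the predictor(s), predict the response level that is most frequent there
acc <- function (pred, resp=DVs) { # hypothetical helper function
   majority <- ave(as.character(resp), pred, # most frequent response per level
      FUN=function (v) names(which.max(table(v))))
   mean(majority==resp) } # proportion of correct 'predictions'
acc(P1s); acc(P2s); acc(P3s) # 0.7, 0.6, 0.5 (ties in P3s resolved towards "x")
acc(P2s:P3s) # 1, i.e. perfect accuracy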



## how do CART approaches handle this data set?
library(tree)
plot(tree.tree <- tree(DVs ~ P1s+P2s+P3s))
   text(tree.tree, pretty=0)

mean(predict(tree.tree, type="class")==DVs)
## [1] 0.75
detach(package:tree)


library(rpart)
plot(tree.rpart <- rpart(DVs ~ P1s+P2s+P3s), margin=0.1)
   text(tree.rpart, all=TRUE, use.n=TRUE)

mean(predict(tree.rpart, type="class")==DVs)
## [1] 0.7
detach(package:rpart)


library(party)
plot(tree.party <- ctree(DVs ~ P1s+P2s+P3s))

   mean(predict(tree.party)==DVs)
## [1] 0.5
detach(package:party)


library(partykit)
plot(tree.partykit <- ctree(DVs ~ P1s+P2s+P3s))

   mean(predict(tree.partykit)==DVs)
## [1] 0.5
detach(package:partykit)
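
## interim summary: on this small data set, none of the four implementations recovers the
## perfectly predictive combination of P2s and P3s - since neither P2s nor P3s looks
## attractive for a first (greedy) split on its own, the trees never reach the interaction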



# Section 2.2: The large version of the artificial data set #######################################
## a second (made-up) larger data set
P1l <- rep(P1s, 10) # predictor 1 large
P2l <- rep(P2s, 10) # predictor 2 large
P3l <- rep(P3s, 10) # predictor 3 large
DVl <- rep(DVs, 10) # response large
summary(D2 <- data.frame(P1l, P2l, P3l, DVl))
##  P1l     P2l     P3l     DVl
##  a:100   e:100   m:120   x:100
##  b:100   f:100   n: 80   y:100
## how do CART approaches handle this data set?
library(tree)
plot(tree.tree <- tree(DVl ~ P1l+P2l+P3l))
   text(tree.tree, pretty=0)

mean(predict(tree.tree, type="class")==DVl)
## [1] 1
### but does this perfect accuracy go away when we prune the tree, cross-validating with misclassification counts?
pruning <- cv.tree(tree.tree, FUN=prune.misclass)
plot(pruning$size, pruning$dev, type="b"); grid()
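### instead of eyeballing the plot, the size can also be picked programmatically
### (a sketch, not part of the original code): the smallest size with the lowest CV deviance
min(pruning$size[pruning$dev==min(pruning$dev)])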

cart.1.pruned <- tree::prune.tree(tree.tree, best=4)
plot(cart.1.pruned)
   text(cart.1.pruned, pretty=0, all=TRUE)

mean(predict(cart.1.pruned, type="class")==DVl)
## [1] 0.85
### and does it go away when we prune the tree, cross-validating with deviance?
pruning <- cv.tree(tree.tree)
plot(pruning$size, pruning$dev, type="b"); grid()

cart.1.pruned <- tree::prune.tree(tree.tree, best=4)
plot(cart.1.pruned)
   text(cart.1.pruned, pretty=0, all=TRUE)

mean(predict(cart.1.pruned, type="class")==DVl)
## [1] 0.85
detach(package:tree)


library(rpart)
plot(tree.rpart <- rpart(DVl ~ P1l+P2l+P3l), margin=0.1)
   text(tree.rpart, all=TRUE, use.n=TRUE)

mean(predict(tree.rpart, type="class")==DVl)
## [1] 1
## but does this go away when we prune the tree using misclassifications?
plotcp(tree.rpart) # no cp value falls below the dotted line, so we pick 0.12

tree.rpart.pruned <- prune(tree.rpart, cp=0.12) # the value we decided on in the previous step
plot(tree.rpart.pruned, margin=0.1)
   text(tree.rpart.pruned, pretty=TRUE, use.n=TRUE)

mean(predict(tree.rpart.pruned, type="class")==DVl)
## [1] 0.85
detach(package:rpart)


library(party)
plot(tree.ctree <- party::ctree(DVl ~ P1l+P2l+P3l))

mean(predict(tree.ctree)==DVl)
## [1] 1
detach(package:party)


library(partykit)
plot(tree.partykit <- ctree(DVl ~ P1l+P2l+P3l))

mean(predict(tree.partykit)==DVl)
## [1] 1
detach(package:partykit)



# Section 2.3: Random / conditional inference forests to the rescue? ##############################
library(randomForest)

## compute a random forest on the small data set
(rfs <- randomForest(DVs ~ P1s+P2s+P3s, mtry=2))
##
## Call:
##  randomForest(formula = DVs ~ P1s + P2s + P3s, mtry = 2)
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
##
##         OOB estimate of  error rate: 10%
## Confusion matrix:
##   x y class.error
## x 9 1         0.1
## y 1 9         0.1
mean(predict(rfs)==DVs) # compute prediction accuracy
## [1] 0.9
importance(rfs) # compute variable importance scores
##     MeanDecreaseGini
## P1s         1.651851
## P2s         2.950552
## P3s         3.618863
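## note: P3s, which on its own predicts only at chance level, receives the highest
## importance score - presumably because the forest can exploit its interaction with P2s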
### compute partial dependence scores
(ps.1 <- partialPlot(rfs, D1, "P1s", ylim=c(-0.7, 0.7), ylab="Impact on DV='x'"))

## $x
## [1] "a" "b"
##
## $y
## [1]  0.6987539 -0.4461811
(ps.2 <- partialPlot(rfs, D1, "P2s", ylim=c(-0.7, 0.7), ylab="Impact on DV='x'"))

## $x
## [1] "e" "f"
##
## $y
## [1]  0.3658665 -0.3192668
(ps.3 <- partialPlot(rfs, D1, "P3s", ylim=c(-0.7, 0.7), ylab="Impact on DV='x'"))

## $x
## [1] "m" "n"
##
## $y
## [1] 0.025000839 0.004314669
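### plot all three predictors' partial dependence scores into one shared panel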
plot(
   c(ps.1$y, ps.2$y, ps.3$y), xlab="",
   ylim=c(-0.7, 0.7), ylab="Impact on the response being 'x'",
   type="n", axes=FALSE)
text(1:6, c(ps.1$y, ps.2$y, ps.3$y), c(ps.1$x, ps.2$x, ps.3$x))
   grid(); abline(h=0, lty=2); axis(2)
   axis(1, at=c(1.5, 3.5, 5.5), labels=paste0("Predictor", 1:3, "'s levels"))

## compute a random forest on the large data set
(rfl <- randomForest(DVl ~ P1l+P2l+P3l, mtry=2))
##
## Call:
##  randomForest(formula = DVl ~ P1l + P2l + P3l, mtry = 2)
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
##
##         OOB estimate of  error rate: 0%
## Confusion matrix:
##     x   y class.error
## x 100   0           0
## y   0 100           0
mean(predict(rfl)==DVl) # compute prediction accuracy
## [1] 1
importance(rfl) # compute variable importance scores
##     MeanDecreaseGini
## P1l         12.93793
## P2l         20.09449
## P3l         51.40719
### compute partial dependence scores
(pl.1 <- partialPlot(rfl, D2, "P1l", ylim=c(-6, 6), ylab="Impact on DV='x'"))

## $x
## [1] "a" "b"
##
## $y
## [1]  5.583007 -5.111478
(pl.2 <- partialPlot(rfl, D2, "P2l", ylim=c(-6, 6), ylab="Impact on DV='x'"))

## $x
## [1] "e" "f"
##
## $y
## [1]  4.409282 -5.749225
(pl.3 <- partialPlot(rfl, D2, "P3l", ylim=c(-6, 6), ylab="Impact on DV='x'"))

## $x
## [1] "m" "n"
##
## $y
## [1] 0.13265772 0.06094591
plot(
   c(pl.1$y, pl.2$y, pl.3$y), xlab="",
   ylim=c(-6, 6), ylab="Impact on the response being 'x'",
   type="n", axes=FALSE)
text(1:6, c(pl.1$y, pl.2$y, pl.3$y), c(pl.1$x, pl.2$x, pl.3$x))
   grid(); abline(h=0, lty=2); axis(2)
   axis(1, at=c(1.5, 3.5, 5.5), labels=paste0("Predictor", 1:3, "'s levels"))

detach(package:randomForest)


library(party)

## compute a conditional random forest on the small data set
cfs <- cforest(DVs ~ P1s+P2s+P3s, controls=cforest_control(mtry=ceiling(sqrt(3))))

mean(predict(cfs)==DVs) # compute prediction accuracy
## [1] 0.7
### compute variable importance scores
varimp(cfs)
##          P1s          P2s          P3s
##  0.053528571 -0.008375469 -0.011546970
varimp(cfs, conditional=TRUE)
##         P1s         P2s         P3s
##  0.04797619 -0.01015079 -0.01155570
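## permutation importances at or below zero indicate uninformative predictors;
## here, only P1s comes out as clearly useful to the small forest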
# varimpAUC(cfs)
# varimpAUC(cfs, conditional=TRUE)

## compute a conditional random forest on the large data set
cfl <- cforest(DVl ~ P1l+P2l+P3l, controls=cforest_control(mtry=ceiling(sqrt(3))))

mean(predict(cfl)==DVl) # compute prediction accuracy
## [1] 1
### compute variable importance scores
varimp(cfl)
##       P1l       P2l       P3l
## 0.1438758 0.2251775 0.3229000
varimp(cfl, conditional=TRUE)
##        P1l        P2l        P3l
## 0.01227272 0.20476215 0.31794746
varimpAUC(cfl)
##       P1l       P2l       P3l
## 0.1562850 0.2571000 0.3534203
varimpAUC(cfl, conditional=TRUE)
##        P1l        P2l        P3l
## 0.01518716 0.23984828 0.35596357
detach(package:party)


library(partykit)

## compute a conditional random forest on the small data set
cfs <- cforest(DVs ~ P1s+P2s+P3s)

mean(predict(cfs)==DVs) # compute prediction accuracy
## [1] 0.5
### compute variable importance scores
varimp(cfs)
## named numeric(0)
varimp(cfs, conditional=TRUE)
## named numeric(0)
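## (partykit's cforest apparently grows no splits on the small data set, which would
## explain both the chance-level accuracy and the empty importance vectors)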
## compute a conditional random forest on the large data set
cfl <- cforest(DVl ~ P1l+P2l+P3l)

mean(predict(cfl)==DVl) # compute prediction accuracy
## [1] 1
### compute variable importance scores
varimp(cfl)
##      P1l      P2l      P3l
## 2.311940 4.010550 5.147123
varimp(cfl, conditional=TRUE)
##       P1l       P2l       P3l
## 0.1238174 3.7095522 5.0656456
detach(package:partykit)



# Section 3.1: Improving trees' and forests' performance: interaction variables as predictors #####
## create interaction variables for both data sets
P1sxP2s <- P1s:P2s; P1sxP3s <- P1s:P3s; P2sxP3s <- P2s:P3s
   summary(D1 <- cbind(D1, P1sxP2s, P1sxP3s, P2sxP3s))
##  P1s    P2s    P3s    DVs    P1sxP2s P1sxP3s P2sxP3s
##  a:10   e:10   m:12   x:10   a:e:8   a:m:5   e:m:6
##  b:10   f:10   n: 8   y:10   a:f:2   a:n:5   e:n:4
##                              b:e:2   b:m:7   f:m:6
##                              b:f:8   b:n:3   f:n:4
P1lxP2l <- P1l:P2l; P1lxP3l <- P1l:P3l; P2lxP3l <- P2l:P3l
   summary(D2 <- cbind(D2, P1lxP2l, P1lxP3l, P2lxP3l))
##  P1l     P2l     P3l     DVl     P1lxP2l  P1lxP3l  P2lxP3l
##  a:100   e:100   m:120   x:100   a:e:80   a:m:50   e:m:60
##  b:100   f:100   n: 80   y:100   a:f:20   a:n:50   e:n:40
##                                  b:e:20   b:m:70   f:m:60
##                                  b:f:80   b:n:30   f:n:40
## how do CART approaches handle the small data set with the interaction variables?
plot(tree.tree <- tree::tree(DVs ~ P1s+P2s+P3s+P1sxP2s+P1sxP3s+P2sxP3s))
   text(tree.tree, pretty=0)

   mean(predict(tree.tree, type="class")==DVs)
## [1] 1
plot(tree.rpart <- rpart::rpart(DVs ~ P1s+P2s+P3s+P1sxP2s+P1sxP3s+P2sxP3s), margin=0.1)
   text(tree.rpart, all=TRUE, use.n=TRUE)

   mean(predict(tree.rpart, type="class")==DVs)
## [1] 1
plot(tree.party <- party::ctree(DVs ~ P1s+P2s+P3s+P1sxP2s+P1sxP3s+P2sxP3s))

   mean(predict(tree.party)==DVs)
## [1] 1
plot(tree.partykit <- partykit::ctree(DVs ~ P1s+P2s+P3s+P1sxP2s+P1sxP3s+P2sxP3s))

   mean(predict(tree.partykit)==DVs)
## [1] 1
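## note: with the interaction variables available as predictors, all four implementations
## now achieve perfect accuracy even on the small data set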
## how do CART approaches handle the large data set with the interaction variables?
plot(tree.tree <- tree::tree(DVl ~ P1l+P2l+P3l+P1lxP2l+P1lxP3l+P2lxP3l))
   text(tree.tree, pretty=0)

   mean(predict(tree.tree, type="class")==DVl)
## [1] 1
plot(tree.rpart <- rpart::rpart(DVl ~ P1l+P2l+P3l+P1lxP2l+P1lxP3l+P2lxP3l), margin=0.1)
   text(tree.rpart, all=TRUE, use.n=TRUE)

   mean(predict(tree.rpart, type="class")==DVl)
## [1] 1
plot(tree.party <- party::ctree(DVl ~ P1l+P2l+P3l+P1lxP2l+P1lxP3l+P2lxP3l))

   mean(predict(tree.party)==DVl)
## [1] 1
plot(tree.partykit <- partykit::ctree(DVl ~ P1l+P2l+P3l+P1lxP2l+P1lxP3l+P2lxP3l))

   mean(predict(tree.partykit)==DVl)
## [1] 1
## how do forests react to this?
library(randomForest)

### compute a random forest on the small data set
(rfs <- randomForest(DVs ~ P1s+P2s+P3s+P1sxP2s+P1sxP3s+P2sxP3s))
##
## Call:
##  randomForest(formula = DVs ~ P1s + P2s + P3s + P1sxP2s + P1sxP3s + P2sxP3s)
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
##
##         OOB estimate of  error rate: 0%
## Confusion matrix:
##    x  y class.error
## x 10  0           0
## y  0 10           0
mean(predict(rfs)==DVs) # compute prediction accuracy
## [1] 1
importance(rfs) # compute variable importance scores
##         MeanDecreaseGini
## P1s            0.3922734
## P2s            0.4098387
## P3s            0.5992698
## P1sxP2s        1.2970844
## P1sxP3s        1.7998302
## P2sxP3s        4.7386206
### compute partial dependence scores for the (by far) most important predictor
(ps.23 <- partialPlot(rfs, D1, "P2sxP3s", ylab="Impact on DV='x'"))

## $x
## [1] "e:m" "e:n" "f:m" "f:n"
##
## $y
## [1]  1.1554860 -0.7337640 -5.4989573  0.6205407
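## the scores recover the pattern from above: e:m and f:n push the prediction towards x,
## e:n and (especially) f:m towards y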
## compute a random forest on the large data set
(rfl <- randomForest(DVl ~ P1l+P2l+P3l+P1lxP2l+P1lxP3l+P2lxP3l))
##
## Call:
##  randomForest(formula = DVl ~ P1l + P2l + P3l + P1lxP2l + P1lxP3l + P2lxP3l)
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
##
##         OOB estimate of  error rate: 0%
## Confusion matrix:
##     x   y class.error
## x 100   0           0
## y   0 100           0
mean(predict(rfl)==DVl) # compute prediction accuracy
## [1] 1
importance(rfl) # compute variable importance scores
##         MeanDecreaseGini
## P1l             2.619442
## P2l             3.887269
## P3l             6.185997
## P1lxP2l        11.790335
## P1lxP3l        17.366405
## P2lxP3l        53.361195
### compute partial dependence scores for the (by far) most important predictor
(pl.23 <- partialPlot(rfl, D2, "P2lxP3l", ylab="Impact on DV='x'"))

## $x
## [1] "e:m" "e:n" "f:m" "f:n"
##
## $y
## [1]  5.0394694 -0.8466043 -5.4883717  0.8304328
detach(package:randomForest)


library(party)

## compute a conditional random forest on the small data set
cfs <- cforest(DVs ~ P1s+P2s+P3s+P1sxP2s+P1sxP3s+P2sxP3s,
   controls=cforest_control(mtry=ceiling(sqrt(6))))

mean(predict(cfs)==DVs) # compute prediction accuracy
## [1] 1
### compute variable importance scores
varimp(cfs)
##           P1s           P2s           P3s       P1sxP2s       P1sxP3s       P2sxP3s
##  0.0051071429 -0.0002142857 -0.0037341270  0.0136595238  0.0513515873  0.2023887446
varimp(cfs, conditional=TRUE)
##           P1s           P2s           P3s       P1sxP2s       P1sxP3s       P2sxP3s
## -0.0023666667  0.0005714286 -0.0022950938  0.0072666667  0.0460063492  0.1783181818
# varimpAUC(cfs)
# varimpAUC(cfs, conditional=TRUE)

## compute a conditional random forest on the large data set
cfl <- cforest(DVl ~ P1l+P2l+P3l+P1lxP2l+P1lxP3l+P2lxP3l,
   controls=cforest_control(mtry=ceiling(sqrt(6))))

mean(predict(cfl)==DVl) # compute prediction accuracy
## [1] 1
### compute variable importance scores
varimp(cfl)
##        P1l        P2l        P3l    P1lxP2l    P1lxP3l    P2lxP3l
## 0.02488977 0.02004302 0.03594787 0.07913391 0.10344119 0.32938993
varimp(cfl, conditional=TRUE)
##         P1l         P2l         P3l     P1lxP2l     P1lxP3l     P2lxP3l
## 0.002842231 0.012528873 0.024065912 0.038540707 0.051880157 0.302427169
varimpAUC(cfl)
##        P1l        P2l        P3l    P1lxP2l    P1lxP3l    P2lxP3l
## 0.02529408 0.02071179 0.03670507 0.07787176 0.10037292 0.32650967
varimpAUC(cfl, conditional=TRUE)
##         P1l         P2l         P3l     P1lxP2l     P1lxP3l     P2lxP3l
## 0.003667529 0.013081199 0.026005910 0.038646316 0.054449572 0.309123440
detach(package:party)



library(partykit)

## compute a conditional random forest on the small data set
cfs <- cforest(DVs ~ P1s+P2s+P3s+P1sxP2s+P1sxP3s+P2sxP3s)

mean(predict(cfs)==DVs) # compute prediction accuracy
## [1] 0.5
### compute variable importance scores
varimp(cfs)
## named numeric(0)
varimp(cfs, conditional=TRUE)
## named numeric(0)
## compute a conditional random forest on the large data set
cfl <- cforest(DVl ~ P1l+P2l+P3l+P1lxP2l+P1lxP3l+P2lxP3l)

mean(predict(cfl)==DVl) # compute prediction accuracy
## [1] 1
### compute variable importance scores
varimp(cfl)
##      P1l      P2l      P3l  P1lxP2l  P1lxP3l  P2lxP3l
## 2.682614 2.508443 3.027808 3.399461 3.445935 6.588357
# varimp(cfl, conditional=TRUE)

detach(package:partykit)


## find interactions
library(randomForestSRC)
rf.src <- rfsrc(DVl ~ P1l+P2l+P3l, data=D2)
find.interaction(rf.src, method="vimp")
## Pairing P1l with P2l
## Pairing P1l with P3l
## Pairing P2l with P3l
##
##                               Method: vimp
##                     No. of variables: 3
##            Variables sorted by VIMP?: TRUE
##    No. of variables used for pairing: 3
##     Total no. of paired interactions: 3
##             Monte Carlo replications: 1
##     Type of noising up used for VIMP: permute
##
##          Var 1  Var 2 Paired Additive Difference
## P1l:P2l 0.1277 0.3724 0.4815   0.5000    -0.0185
## P1l:P3l 0.1277 0.4144 0.4833   0.5420    -0.0587
## P2l:P3l 0.3698 0.4144 0.4371   0.7842    -0.3471
# note the big 'Difference' returned for P2l:P3l: 'Additive' is the sum of the two predictors'
# individual VIMPs (0.3698+0.4144), 'Paired' is their joint VIMP when both are noised up
# together, and the large gap between the two signals an interaction
detach(package:randomForestSRC)

library(edarf)
partial_dependence(rf.src, vars=c("P1l"), n=c(10, 20)) # partial dependence of P1l
##   P1l         x         y
## 1   a 0.6572153 0.3427847
## 2   b 0.3031674 0.6968326
partial_dependence(rf.src, vars=c("P1l", "P2l"), interaction=TRUE, data=D2) # partial dep. of interaction P1l:P2l
##   P1l P2l         x         y
## 1   a   e 0.6113153 0.3886847
## 2   a   f 0.8579153 0.1420847
## 3   b   e 0.5822289 0.4177711
## 4   b   f 0.3959764 0.6040236
partial_dependence(rf.src, vars=c("P2l", "P3l"), interaction=TRUE, data=D2) # partial dep. of interaction P2l:P3l
##   P2l P3l          x          y
## 1   e   m 0.96972505 0.03027495
## 2   e   n 0.03734273 0.96265727
## 3   f   m 0.40129876 0.59870124
## 4   f   n 0.96541644 0.03458356
# note how, in the latter, three of the four predicted probabilities of x/y exceed 95% - that is the interaction at work



# Section 3.2: Representing random forests: representative trees ##################################
forest.4.reprtree <- randomForest::randomForest(DVl ~ P1l+P2l+P3l)
plot(reptree <- reprtree::ReprTree(forest.4.reprtree, D2, metric="d2"))

# Section 3.3: Representing random forests: effects (plots) #######################################
## example with a version of D2 without the interactions (for the sake of simplicity)
D2.maineffects <- D2[,1:4]

### add to the data a column with the combination of predictor levels
D2.maineffects$COMBO <- apply(D2.maineffects[,1:3], 1, paste, collapse="_")

### add to the data a column that states how frequent each row's predictor combination is (just for housekeeping)
# D2.maineffects$COMBOFREQ <- table(D2.maineffects$COMBO)[D2.maineffects$COMBO]

### create all possible combinations of predictor levels
all.possible.combs <- expand.grid(
   P1l=levels(D2.maineffects$P1l),
   P2l=levels(D2.maineffects$P2l),
   P3l=levels(D2.maineffects$P3l)
)

### add to the possible combinations a column with the combination of predictor levels
all.possible.combs$COMBO <- apply(all.possible.combs, 1, paste, collapse="_")

### add to the possible combinations a column that states how frequent each combination is in the real data
all.possible.combs$COMBOFREQINREALDATA <- table(D2.maineffects$COMBO)[all.possible.combs$COMBO]
   all.possible.combs$COMBOFREQINREALDATA[is.na(all.possible.combs$COMBOFREQINREALDATA)] <- 0

### add to the possible combinations a column that provides the predicted probability of DV=="y"
all.possible.combs$PREDPROB <- predict(
   randomForest::randomForest(DVl ~ P1l + P2l + P3l, data=D2.maineffects),
   newdata=all.possible.combs, type="prob")[,2]

### the effect of P1l:a on the predicted probability of DV=="y"
with(
   subset(all.possible.combs, all.possible.combs$P1l=="a"), {
      weighted.mean(PREDPROB, COMBOFREQINREALDATA)
   })
## [1] 0.2618
### the effect of P1l:b on the predicted probability of DV=="y"
with(
   subset(all.possible.combs, all.possible.combs$P1l=="b"), {
      weighted.mean(PREDPROB, COMBOFREQINREALDATA)
   })
## [1] 0.7386
### the predictions for P2l:P3l (for DV=="y")
with(
   subset(all.possible.combs, grepl("e_m", all.possible.combs$COMBO)), {
      weighted.mean(PREDPROB, COMBOFREQINREALDATA)
   })
## [1] 0.1266667
with(
   subset(all.possible.combs, grepl("e_n", all.possible.combs$COMBO)), {
      weighted.mean(PREDPROB, COMBOFREQINREALDATA)
   })
## [1] 0.574
with(
   subset(all.possible.combs, grepl("f_m", all.possible.combs$COMBO)), {
      weighted.mean(PREDPROB, COMBOFREQINREALDATA)
   })
## [1] 0.916
with(
   subset(all.possible.combs, grepl("f_n", all.possible.combs$COMBO)), {
      weighted.mean(PREDPROB, COMBOFREQINREALDATA)
   })
## [1] 0.363
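### the same four numbers can also be computed in one go (a compact alternative sketch,
### not part of the original code):
sapply(c("e_m", "e_n", "f_m", "f_n"), function (combo) with(
   subset(all.possible.combs, grepl(combo, all.possible.combs$COMBO)),
   weighted.mean(PREDPROB, COMBOFREQINREALDATA)))


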
# Section 3.4: Representing random forests: global surrogate models ###############################
library(MASS)
predicted.categories <- predict(rfl, type="class")

summary(gsm <- stepAIC(glm(predicted.categories ~ P1l*P2l*P3l, family=binomial)))
## [omitted some messages/output, STG]
## Call:
## glm(formula = predicted.categories ~ P2l + P3l + P2l:P3l, family = binomial)
##
## Deviance Residuals:
##        Min          1Q      Median          3Q         Max
## -2.409e-06  -2.409e-06   0.000e+00   2.409e-06   2.409e-06
##
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)    -26.57   45975.41  -0.001    1.000
## P2lf            53.13   65019.06   0.001    0.999
## P3ln            53.13   72693.41   0.001    0.999
## P2lf:P3ln     -106.26  102804.08  -0.001    0.999
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 2.7726e+02  on 199  degrees of freedom
## Residual deviance: 1.1603e-09  on 196  degrees of freedom
## AIC: 8
##
## Number of Fisher Scoring iterations: 25
## because of the perfect fit in the simulated data (complete separation), the model does not properly converge and we get non-significant estimates with huge standard errors - with real data, this does not happen, see below
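## the separation can be verified directly (a quick check, not in the original script):
## each P2l:P3l combination should map onto exactly one predicted category
table(P2l:P3l, predicted.categories)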

## the gsm nevertheless reproduces the forest's predictions with perfect accuracy:
table(
   ifelse(predict(gsm)>=0, "y", "x"), # the dichotomized predictions from the gsm
   predicted.categories)              # the actually predicted categories from the random forest
##    predicted.categories
##       x   y
##   x 100   0
##   y   0 100
## the gsm recovers the interaction and nothing else; generate an effects plot of it:
plot(effects::allEffects(gsm), type="response",
   ylim=c(0, 1), ylab="Predicted probability of DV=y",
   multiline=TRUE, grid=TRUE)

## demonstration that, with real data, gsms do not run into the convergence issues from above
rm(list=ls(all=TRUE)); set.seed(314)
# generate the clause-orders data from Gries (2013: Section 5.3)
x<-structure(list(
CASE=c(4777,1698,953,1681,4055,967,2658,2502,4455,2368,1729,4546,586,4608,935,2108,4317,4318,2385,4723,3267,4558,1747,623,4507,138,80,952,2602,599,4037,4467,4324,2928,4728,4473,1688,3805,2472,84,997,998,988,474,4737,4523,4805,4808,4802,1063,4841,4769,4531,1677,1649,4813,4882,4293,4544,4763,4552,4881,3024,2614,4069,4278,4861,2517,4509,941,4838,2619,2530,2485,4878,4495,4458,4468,4859,4610,1017,4568,1638,1256,1938,1724,964,1784,1661,619,4503,4604,4778,3617,450,1117,2383,2018,2574,131,2382,4672,2017,3570,5052,4897,1783,4670,3926,3701,4974,2822,5051,452,1184,3489,4896,3616,2820,3925,2818,5053,2825,1118,2576,141,2819,2456,4673,1182,904,4898,2648,4671,905,4899,130,454,2575,4675,2834,2849,1855,2838,4151,2459,4154,4148,2024,456,1285,2832,62,4144,10,4453,3764,2839,2830,4149,4155,4145,2835,4159,9,2020,2846,3927,2841,223,4143,4156,458,4680,3210,4674,4679,4677,3490,2579,4681,2845,1555,63,3928,2844,2847,3763,2842,2831,2577,3211,4902,2837,2458,4146,4152,4153,2827,4147,2457,2840,1186,4452,2731,332,3547,4092,4078,2409,326,1795,823,2754,4646,2751,4341,324,4087,2746,1796,3546,4633,1809,1432,327,330,341,1799,329,1808,4339,2421,1103,2742,3889,2552,4629,4081,4090,2728,122,1307,2752,4110,2662,2732,4099,351,4097,4089,2729,3171,3166,821,2005,1441,4335,349,316,3542,2550,2748,825,3552,3184,3993,2,2736,3173,3176,1804,4338,4094,2422,2741,4080,4084,3545,1442,3179,1304,3177,3549,3543,1100,1309,2740,328,4100,2418,2554,1810,1440,2423,4098,3170,2758,2405,1779,4107,1803,1308,1811,101,3178,3994,2420,4642,333,339,2663,3169,2543,2553,322,1272,321,4334,2756,4648,2750,3551,4108,3744,2749,822,4103,323,102,2664,3746,4105,1434,348,347,3550,4085,1104,4637,4636,343,345,3168,103,4639,3845,3548,2747,3181,1429,4643,2735,1793,1439,1792,1305,831,2406,4640,2734,4079,4101,820,1800,824,2226,4631,3890,2006,3167,2545,4102,342,3745,352,3180,4093,2551,2425,4106,2640,5043,4647,2737,1433,1797,4109,1798,354,355,2743,3182,4638,2757,829,318,4644,1794,4095,2733,350,208),
ORDER=structure(c(2,1,2,1,2,2,1,2,2,2,2,2,2,1,2,2,1,1,2,1,1,1,2,2,2,2,2,1,1,2,2,2,1,1,2,1,2,1,2,2,1,2,2,2,1,1,2,1,1,2,2,2,1,1,2,2,2,1,2,1,1,1,1,2,2,2,1,2,2,2,2,2,1,2,1,2,2,2,2,2,1,2,1,1,1,1,1,2,1,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,2,2,1,1,2,2,2,1,1,1,2,1,2,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,2,2,2,1,1,2,2,2,1,1,2,1,2,2,1,2,1,1,1,2,2,2,2,1,2,1,1,1,2,2,2,1,2,1,2,2,1,2,1,2,2,2,2,2,2,2,2,2,2,1,1,2,2,2,1,2,1,1,2,2,1,2,2,1,1,2,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,2,1,1,1,1,1,1,1,2,2,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2),.Label=c("mc-sc","sc-mc"),class="factor"),
SUBORDTYPE=structure(c(2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),.Label=c("caus","temp"),class="factor"),
LEN_MC=c(4,7,12,6,9,9,9,4,6,4,15,11,10,5,6,9,10,8,13,6,14,5,6,10,14,11,7,4,2,7,24,4,5,6,5,4,18,11,5,10,3,5,10,8,4,5,4,6,6,18,3,10,10,4,10,10,7,8,15,6,11,10,7,9,10,10,5,16,8,12,17,6,7,17,5,10,9,7,5,9,3,7,5,4,4,3,6,5,8,12,11,4,4,12,17,3,3,8,3,5,5,4,11,11,8,7,11,10,7,19,9,7,6,10,23,5,24,6,12,10,9,9,17,4,7,11,13,6,10,18,6,4,6,9,16,15,10,17,5,4,6,13,8,10,3,16,12,7,7,17,18,10,8,8,16,7,14,6,13,3,14,12,15,9,8,11,7,6,15,6,6,12,12,5,12,10,22,17,15,15,3,13,5,31,5,11,7,12,8,4,4,8,5,9,29,6,8,10,5,22,13,10,9,28,9,9,19,7,8,6,6,7,5,10,15,13,8,4,5,9,8,6,6,11,10,14,8,9,3,6,9,12,5,16,5,16,7,8,14,8,4,3,8,13,15,5,7,9,9,17,6,9,12,4,7,6,15,9,14,10,17,6,7,11,8,5,5,24,6,7,11,26,11,17,19,6,9,16,17,11,7,5,5,6,9,11,5,6,7,7,4,8,13,10,14,6,3,7,6,9,7,18,10,16,31,4,5,9,8,4,9,5,7,6,6,4,23,10,6,10,11,9,27,7,14,7,3,4,12,15,7,6,10,6,9,3,9,3,7,14,10,3,6,11,20,15,4,15,3,18,5,15,5,12,13,7,6,7,9,4,9,10,5,5,6,6,8,5,9,5,5,11,12,10,8,8,11,15,7,4,7,8,8,5,8,5,7,7,19,9,17,8,24,9,7,7,3,9,13,10,8,8,9),
LEN_SC=c(10,6,7,15,5,5,12,2,24,11,4,3,11,11,6,11,11,7,7,13,4,3,4,5,20,5,3,5,9,6,8,10,17,11,9,6,18,15,4,20,14,3,12,11,4,10,4,5,7,7,4,7,4,7,22,7,12,22,6,9,2,6,17,4,2,10,10,4,3,2,8,4,17,4,10,7,5,6,2,7,14,16,5,5,3,3,3,6,5,6,8,3,10,5,14,7,5,5,4,8,6,6,6,9,12,2,19,6,10,13,5,9,13,7,6,8,5,5,9,11,6,8,4,3,3,4,4,13,17,8,4,3,4,8,5,3,5,6,9,8,12,14,7,14,8,5,9,21,14,7,14,7,20,21,15,16,18,4,11,7,8,15,10,6,29,6,14,8,13,5,9,11,21,5,10,12,17,8,10,7,4,14,4,13,4,11,9,5,10,12,6,8,8,7,4,9,5,6,14,7,4,15,6,12,4,7,7,15,7,13,9,11,23,7,14,4,8,17,8,10,22,9,9,20,8,5,5,10,7,15,7,2,5,7,5,15,6,9,19,11,4,3,8,16,6,13,7,7,20,18,8,8,4,11,13,10,7,6,12,7,23,6,10,10,5,6,7,11,10,6,9,13,10,17,10,8,8,13,12,9,5,10,6,18,13,18,10,13,5,8,8,5,15,9,13,9,8,10,5,16,10,9,10,6,16,19,6,4,5,21,29,21,8,5,7,9,5,13,5,7,17,16,17,12,4,9,11,12,9,15,7,9,5,5,6,3,5,5,10,8,6,7,5,11,19,5,11,14,26,5,4,5,10,17,9,8,5,9,10,36,9,16,8,20,14,6,17,12,5,7,6,9,13,15,10,20,8,17,4,3,9,4,8,6,14,2,5,14,9,18,18,9,6,7,6,6,11,11,23,7,5,11,6),
LENGTH_DIFF=c(-6,1,5,-9,4,4,-3,2,-18,-7,11,8,-1,-6,0,-2,-1,1,6,-7,10,2,2,5,-6,6,4,-1,-7,1,16,-6,-12,-5,-4,-2,0,-4,1,-10,-11,2,-2,-3,0,-5,0,1,-1,11,-1,3,6,-3,-12,3,-5,-14,9,-3,9,4,-10,5,8,0,-5,12,5,10,9,2,-10,13,-5,3,4,1,3,2,-11,-9,0,-1,1,0,3,-1,3,6,3,1,-6,7,3,-4,-2,3,-1,-3,-1,-2,5,2,-4,5,-8,4,-3,6,4,-2,-7,3,17,-3,19,1,3,-1,3,1,13,1,4,7,9,-7,-7,10,2,1,2,1,11,12,5,11,-4,-4,-6,-1,1,-4,-5,11,3,-14,-7,10,4,3,-12,-13,1,-9,-4,2,2,-4,6,-3,5,3,-21,5,-7,-2,2,1,-3,1,-9,0,2,-2,5,9,5,8,-1,-1,1,18,1,0,-2,7,-2,-8,-2,0,-3,2,25,-3,3,4,-9,15,9,-5,3,16,5,2,12,-8,1,-7,-3,-4,-18,3,1,9,0,-13,-3,-1,-14,-3,-3,-9,2,9,3,-1,-4,-9,2,10,0,9,0,1,1,-1,-5,-3,0,0,0,-3,9,-8,0,2,-11,-1,-2,1,8,-7,-6,-4,8,3,2,3,-6,0,-3,1,3,-1,-2,13,-4,1,2,13,1,0,9,-2,1,3,5,2,2,-5,-1,-12,-4,-7,-5,-7,2,-1,-4,3,-2,1,1,-3,-5,-3,1,-7,-3,9,0,10,15,-15,-1,5,3,-17,-20,-16,-1,1,-1,-5,18,-3,1,3,-6,-7,10,-5,10,-2,-8,-8,3,0,0,-3,5,1,3,0,4,-2,-3,6,4,-4,1,0,1,10,-7,1,-23,13,1,10,-5,-5,4,-1,1,-2,-1,-32,0,-6,-3,-15,-8,0,-9,-7,4,-2,-1,2,-1,-5,-2,-12,3,-2,3,1,-2,4,0,-1,-6,3,2,-7,10,-9,-1,-1,18,2,1,1,-8,-2,-10,3,3,-3,3),
CONJ=structure(c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4),.Label=c("als/when","bevor/before","nachdem/after","weil/because"),class="factor"),
MORETHAN2CL=structure(c(1,1,2,1,2,2,1,2,1,1,2,1,2,2,2,1,1,1,1,1,2,2,2,2,2,1,2,2,1,2,2,1,2,1,2,2,2,2,1,2,1,1,1,2,2,2,2,1,1,2,2,2,1,2,2,1,2,2,2,2,1,1,2,2,1,1,1,1,1,1,1,1,2,2,2,1,2,2,2,2,2,2,2,1,2,1,1,2,1,2,1,1,2,2,2,2,2,2,2,1,1,2,2,1,1,2,2,1,1,1,1,1,2,1,2,1,2,2,2,2,1,1,2,2,1,1,1,1,1,1,2,1,2,1,1,2,2,1,1,1,2,2,2,2,2,2,2,1,2,1,2,1,1,1,2,2,2,2,2,2,2,1,1,2,2,2,1,1,2,1,2,2,2,1,1,2,1,1,1,1,2,1,1,1,2,1,2,2,2,2,2,2,1,1,1,2,1,2,2,1,2,1,1,2,2,1,2,2,1,1,2,2,2,1,1,1,1,1,1,2,1,1,2,2,1,1,2,2,1,2,2,1,2,1,2,2,2,2,1,1,2,2,2,1,2,1,2,2,2,2,2,2,2,1,1,1,1,2,1,1,2,1,1,2,1,2,2,1,2,2,2,2,1,1,2,1,1,1,1,2,2,1,2,2,2,2,2,2,2,1,2,1,2,2,2,2,1,1,1,1,2,1,1,2,2,2,1,2,1,1,1,1,2,2,2,1,1,2,2,1,1,1,2,2,1,1,1,2,2,2,2,2,1,1,2,2,2,1,1,1,1,2,2,2,1,2,2,2,2,1,1,1,1,2,2,1,2,2,1,2,1,1,1,1,1,2,1,1,1,2,1,2,1,1,2,1,2,1,2,2,1,1,2,1,2,1,2,2,2,2,2,2,2,2,1,2,2,1,1,2,2,1,2),.Label=c("no","yes"),class="factor")),
class="data.frame",row.names=c(NA,-403))
   x$LEN_MC <- log2(x$LEN_MC)
   x$LEN_SC <- log2(x$LEN_SC)
   x$LENGTH_DIFF <- x$LEN_MC-x$LEN_SC
   x$SUBORDTYPE <- factor(x$SUBORDTYPE,
      levels=levels(x$SUBORDTYPE)[2:1])
   x$CONJ <- factor(x$CONJ,
      levels=levels(x$CONJ)[c(4,2,1,3)])

### fit a random forest on these data and add categorical as well as numeric predictions to the data
rf <- randomForest::randomForest(ORDER ~ SUBORDTYPE + LEN_MC + LEN_SC + LENGTH_DIFF + CONJ + MORETHAN2CL, data=x)
x$PREDCAT <- predict(rf, type="class")
x$PREDNUM <- car::logit(predict(rf, type="prob")[,"sc-mc"], adjust=0.01) # adjust keeps probabilities of 0/1 from becoming infinite logits

### fit a gsm on the numeric predictions from the rf
summary(gsm.num.01 <- lm(PREDNUM ~ 1, data=x))
##
## Call:
## lm(formula = PREDNUM ~ 1, data = x)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -3.0903 -2.1124  0.2595  1.7787  5.6928
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  -1.5048     0.1058  -14.22   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.124 on 402 degrees of freedom
(temp <- add1(gsm.num.01,
   scope= ~ (SUBORDTYPE + LEN_MC + LEN_SC + LENGTH_DIFF + CONJ + MORETHAN2CL)^2,
   test="F"))
## Single term additions
##
## Model:
## PREDNUM ~ 1
##             Df Sum of Sq     RSS    AIC   F value    Pr(>F)
## <none>                   1813.56 608.16
## SUBORDTYPE   1   1317.90  495.66  87.40 1066.2055 < 2.2e-16 ***
## LEN_MC       1      4.43 1809.13 609.17    0.9822   0.32226
## LEN_SC       1     80.55 1733.00 591.85   18.6395 1.992e-05 ***
## LENGTH_DIFF  1     75.20 1738.36 593.09   17.3467 3.816e-05 ***
## CONJ         3   1344.82  468.73  68.89  381.5857 < 2.2e-16 ***
## MORETHAN2CL  1     21.53 1792.03 605.34    4.8183   0.02873 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(gsm.num.02 <- lm(PREDNUM ~ CONJ, data=x))
##
## Call:
## lm(formula = PREDNUM ~ CONJ, data = x)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -2.4805 -0.8121 -0.0964  0.5832  3.6394
##
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)
## (Intercept)       -3.33577    0.07683  -43.42   <2e-16 ***
## CONJbevor/before   2.96025    0.17732   16.70   <2e-16 ***
## CONJals/when       3.88434    0.13614   28.53   <2e-16 ***
## CONJnachdem/after  3.69936    0.15484   23.89   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.084 on 399 degrees of freedom
## Multiple R-squared:  0.7415, Adjusted R-squared:  0.7396
## F-statistic: 381.6 on 3 and 399 DF,  p-value: < 2.2e-16
(temp <- add1(gsm.num.02,
   scope= ~ (SUBORDTYPE + LEN_MC + LEN_SC + LENGTH_DIFF + CONJ + MORETHAN2CL)^2,
   test="F"))
## Single term additions
##
## Model:
## PREDNUM ~ CONJ
##             Df Sum of Sq    RSS    AIC F value    Pr(>F)
## <none>                   468.73 68.892
## SUBORDTYPE   0    0.0000 468.73 68.892
## LEN_MC       1    7.3358 461.40 64.535  6.3278 0.0122786 *
## LEN_SC       1   10.6872 458.05 61.597  9.2862 0.0024628 **
## LENGTH_DIFF  1   20.4945 448.24 52.875 18.1975 2.491e-05 ***
## MORETHAN2CL  1   17.3222 451.41 55.717 15.2727 0.0001093 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(gsm.num.03 <- lm(PREDNUM ~ CONJ + LENGTH_DIFF, data=x))
##
## Call:
## lm(formula = PREDNUM ~ CONJ + LENGTH_DIFF, data = x)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -2.2662 -0.7424 -0.1106  0.5625  3.4668
##
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)
## (Intercept)       -3.30836    0.07550 -43.818  < 2e-16 ***
## CONJbevor/before   2.83875    0.17594  16.135  < 2e-16 ***
## CONJals/when       3.83611    0.13378  28.675  < 2e-16 ***
## CONJnachdem/after  3.66800    0.15179  24.165  < 2e-16 ***
## LENGTH_DIFF        0.22810    0.05347   4.266 2.49e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.061 on 398 degrees of freedom
## Multiple R-squared:  0.7528, Adjusted R-squared:  0.7504
## F-statistic: 303.1 on 4 and 398 DF,  p-value: < 2.2e-16
(temp <- add1(gsm.num.03,
   scope= ~ (SUBORDTYPE + LEN_MC + LEN_SC + LENGTH_DIFF + CONJ + MORETHAN2CL)^2,
   test="F"))
## Single term additions
##
## Model:
## PREDNUM ~ CONJ + LENGTH_DIFF
##                  Df Sum of Sq    RSS    AIC F value    Pr(>F)
## <none>                        448.24 52.875
## SUBORDTYPE        0    0.0000 448.24 52.875
## LEN_MC            1    0.0483 448.19 54.831  0.0428    0.8362
## LEN_SC            1    0.0483 448.19 54.831  0.0428    0.8362
## MORETHAN2CL       1   18.0923 430.15 38.271 16.6982 5.305e-05 ***
## LENGTH_DIFF:CONJ  3   29.6663 418.57 31.279  9.3319 5.659e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(gsm.num.04 <- lm(PREDNUM ~ CONJ + LENGTH_DIFF + MORETHAN2CL, data=x))
##
## Call:
## lm(formula = PREDNUM ~ CONJ + LENGTH_DIFF + MORETHAN2CL, data = x)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -2.4499 -0.7632 -0.1308  0.6080  3.2758
##
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)
## (Intercept)       -3.53682    0.09279 -38.116  < 2e-16 ***
## CONJbevor/before   2.86177    0.17266  16.575  < 2e-16 ***
## CONJals/when       3.82607    0.13124  29.153  < 2e-16 ***
## CONJnachdem/after  3.65398    0.14892  24.536  < 2e-16 ***
## LENGTH_DIFF        0.23239    0.05246   4.430 1.22e-05 ***
## MORETHAN2CLyes     0.42584    0.10421   4.086 5.30e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.041 on 397 degrees of freedom
## Multiple R-squared:  0.7628, Adjusted R-squared:  0.7598
## F-statistic: 255.4 on 5 and 397 DF,  p-value: < 2.2e-16
(temp <- add1(gsm.num.04,
   scope= ~ (SUBORDTYPE + LEN_MC + LEN_SC + LENGTH_DIFF + CONJ + MORETHAN2CL)^2,
   test="F"))
## Single term additions
##
## Model:
## PREDNUM ~ CONJ + LENGTH_DIFF + MORETHAN2CL
##                         Df Sum of Sq    RSS    AIC F value    Pr(>F)
## <none>                               430.15 38.271
## SUBORDTYPE               0     0.000 430.15 38.271
## LEN_MC                   1     0.006 430.14 40.265  0.0053   0.94175
## LEN_SC                   1     0.006 430.14 40.265  0.0053   0.94175
## LENGTH_DIFF:CONJ         3    32.633 397.51 12.476 10.7814 8.004e-07 ***
## LENGTH_DIFF:MORETHAN2CL  1     4.726 425.42 35.818  4.3993   0.03659 *
## CONJ:MORETHAN2CL         3     7.455 422.69 37.225  2.3162   0.07526 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(gsm.num.05 <- lm(PREDNUM ~ CONJ * LENGTH_DIFF + MORETHAN2CL, data=x))
##
## Call:
## lm(formula = PREDNUM ~ CONJ * LENGTH_DIFF + MORETHAN2CL, data = x)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -2.1098 -0.7248 -0.1214  0.5886  2.9924
##
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)
## (Intercept)                   -3.584675   0.090046 -39.810  < 2e-16 ***
## CONJbevor/before               3.011301   0.176446  17.066  < 2e-16 ***
## CONJals/when                   3.819745   0.126802  30.124  < 2e-16 ***
## CONJnachdem/after              3.678378   0.143838  25.573  < 2e-16 ***
## LENGTH_DIFF                   -0.008743   0.075435  -0.116  0.90779
## MORETHAN2CLyes                 0.460968   0.100898   4.569 6.58e-06 ***
## CONJbevor/before:LENGTH_DIFF  -0.046060   0.170478  -0.270  0.78716
## CONJals/when:LENGTH_DIFF       0.619720   0.120111   5.160 3.93e-07 ***
## CONJnachdem/after:LENGTH_DIFF  0.440759   0.152566   2.889  0.00408 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.004 on 394 degrees of freedom
## Multiple R-squared:  0.7808, Adjusted R-squared:  0.7764
## F-statistic: 175.4 on 8 and 394 DF,  p-value: < 2.2e-16
(temp <- add1(gsm.num.05,
   scope= ~ (SUBORDTYPE + LEN_MC + LEN_SC + LENGTH_DIFF + CONJ + MORETHAN2CL)^2,
   test="F"))
## Single term additions
##
## Model:
## PREDNUM ~ CONJ * LENGTH_DIFF + MORETHAN2CL
##                         Df Sum of Sq    RSS     AIC F value  Pr(>F)
## <none>                               397.51 12.4756
## SUBORDTYPE               0    0.0000 397.51 12.4756
## LEN_MC                   1    0.0720 397.44 14.4027  0.0712 0.78976
## LEN_SC                   1    0.0720 397.44 14.4027  0.0712 0.78976
## LENGTH_DIFF:MORETHAN2CL  1    5.4045 392.11  8.9590  5.4168 0.02045 *
## CONJ:MORETHAN2CL         3    9.0303 388.48  9.2151  3.0296 0.02934 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(gsm.num.06 <- lm(PREDNUM ~ CONJ * LENGTH_DIFF + MORETHAN2CL + CONJ:MORETHAN2CL, data=x))
##
## Call:
## lm(formula = PREDNUM ~ CONJ * LENGTH_DIFF + MORETHAN2CL + CONJ:MORETHAN2CL,
##     data = x)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -2.1999 -0.7148 -0.1352  0.5763  3.1280
##
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)
## (Intercept)                      -3.424136   0.104441 -32.785  < 2e-16 ***
## CONJbevor/before                  2.697734   0.233250  11.566  < 2e-16 ***
## CONJals/when                      3.536397   0.188184  18.792  < 2e-16 ***
## CONJnachdem/after                 3.300215   0.216885  15.216  < 2e-16 ***
## LENGTH_DIFF                      -0.005828   0.074865  -0.078  0.93799
## MORETHAN2CLyes                    0.163046   0.141747   1.150  0.25074
## CONJbevor/before:LENGTH_DIFF     -0.070114   0.170111  -0.412  0.68044
## CONJals/when:LENGTH_DIFF          0.623488   0.119304   5.226 2.82e-07 ***
## CONJnachdem/after:LENGTH_DIFF     0.470043   0.152615   3.080  0.00222 **
## CONJbevor/before:MORETHAN2CLyes   0.636121   0.328549   1.936  0.05357 .
## CONJals/when:MORETHAN2CLyes       0.516470   0.252265   2.047  0.04129 *
## CONJnachdem/after:MORETHAN2CLyes  0.679252   0.289966   2.343  0.01965 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9968 on 391 degrees of freedom
## Multiple R-squared:  0.7858, Adjusted R-squared:  0.7798
## F-statistic: 130.4 on 11 and 391 DF,  p-value: < 2.2e-16
(temp <- add1(gsm.num.06,
   scope= ~ (SUBORDTYPE + LEN_MC + LEN_SC + LENGTH_DIFF + CONJ + MORETHAN2CL)^2,
   test="F"))
## Single term additions
##
## Model:
## PREDNUM ~ CONJ * LENGTH_DIFF + MORETHAN2CL + CONJ:MORETHAN2CL
##                         Df Sum of Sq    RSS     AIC F value  Pr(>F)
## <none>                               388.48  9.2151
## SUBORDTYPE               0    0.0000 388.48  9.2151
## LEN_MC                   1    0.0953 388.39 11.1162  0.0957 0.75722
## LEN_SC                   1    0.0953 388.39 11.1162  0.0957 0.75722
## LENGTH_DIFF:MORETHAN2CL  1    3.7910 384.69  7.2632  3.8433 0.05066 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(effects::allEffects(gsm.num.06), ylab="Logit of pred. prob. of sc-mc", grid=TRUE)

#### these results are quite compatible with what a regular logistic regression on these data would return
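#### e.g., one might compare them against something like the following (a sketch, not run here):
# summary(glm(ORDER ~ CONJ*LENGTH_DIFF + MORETHAN2CL + CONJ:MORETHAN2CL, family=binomial, data=x))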