rm(list=ls(all=TRUE))
set.seed(060977) # setting a random-number generator seed to make the analyses random, but replicable
# Section 2.1: A small artificial data set ########################################################
## a first (made-up) small data set
P1s <- factor(rep(c("b","a","b","a"), c(1,7,9,3)) ) # predictor 1 small
P2s <- factor(rep(c("e","f","e"), c(6,10,4))) # predictor 2 small
P3s <- factor(rep(c("m","n","m","n"), c(6,4,6,4))) # predictor 3 small
DVs <- factor(rep(c("x","y"), c(10, 10))) # response small
summary((D1 <- data.frame(P1s, P2s, P3s, DVs)))
## P1s P2s P3s DVs
## a:10 e:10 m:12 x:10
## b:10 f:10 n: 8 y:10
ftable(P1s, P2s, P3s, DVs)
## DVs x y
## P1s P2s P3s
## a e m 5 0
## n 0 3
## f m 0 0
## n 2 0
## b e m 1 0
## n 0 1
## f m 0 6
## n 2 0
## if you try to predict DVs from P1s (a -> x, b -> y), you get 70% right
table(P1s, DVs)
## DVs
## P1s x y
## a 7 3
## b 3 7
## if you try to predict DVs from P2s (e -> x, f -> y), you get 60% right
table(P2s, DVs)
## DVs
## P2s x y
## e 6 4
## f 4 6
## if you try to predict DVs from P3s (m -> x, n -> y), you get 50% right, i.e. chance baseline
table(P3s, DVs)
## DVs
## P3s x y
## m 6 6
## n 4 4
## but note: combining the two weak independent variables gives you perfect accuracy:
ftable(P2s, P3s, DVs)
## DVs x y
## P2s P3s
## e m 6 0
## n 0 4
## f m 0 6
## n 4 0
table(P2s:P3s, DVs)
## DVs
## x y
## e:m 6 0
## e:n 0 4
## f:m 0 6
## f:n 4 0
## e+m -> x and e+n -> y
## f+m -> y and f+n -> x
## how do CART approaches handle this data set?
library(tree)
plot(tree.tree <- tree(DVs ~ P1s+P2s+P3s))
text(tree.tree, pretty=0)
mean(predict(tree.tree, type="class")==DVs)
## [1] 0.75
detach(package:tree)
library(rpart)
plot(tree.rpart <- rpart(DVs ~ P1s+P2s+P3s), margin=0.1)
text(tree.rpart, all=TRUE, use.n=TRUE)
mean(predict(tree.rpart, type="class")==DVs)
## [1] 0.7
detach(package:rpart)
library(party)
plot(tree.party <- ctree(DVs ~ P1s+P2s+P3s))
mean(predict(tree.party)==DVs)
## [1] 0.5
detach(package:party)
library(partykit)
plot(tree.partykit <- ctree(DVs ~ P1s+P2s+P3s))
mean(predict(tree.partykit)==DVs)
## [1] 0.5
detach(package:partykit)
# Section 2.2: The large version of the artificial data set #######################################
## a second (made-up) larger data set
P1l <- rep(P1s, 10) # predictor 1 large
P2l <- rep(P2s, 10) # predictor 2 large
P3l <- rep(P3s, 10) # predictor 3 large
DVl <- rep(DVs, 10) # response large
summary(D2 <- data.frame(P1l, P2l, P3l, DVl))
## P1l P2l P3l DVl
## a:100 e:100 m:120 x:100
## b:100 f:100 n: 80 y:100
## how do CART approaches handle this data set?
library(tree)
plot(tree.tree <- tree(DVl ~ P1l+P2l+P3l))
text(tree.tree, pretty=0)
mean(predict(tree.tree, type="class")==DVs)
## [1] 1
### but does this go away when we prune the tree using misclassifications?
pruning <- cv.tree(tree.tree, FUN=prune.misclass)
plot(pruning$size, pruning$dev, type="b"); grid()
cart.1.pruned <- tree::prune.tree(tree.tree, best=4)
plot(cart.1.pruned)
text(cart.1.pruned, pretty=0, all=TRUE)
mean(predict(cart.1.pruned, type="class")==DVl)
## [1] 0.85
### but does this go away when we prune the tree using deviance?
pruning <- cv.tree(tree.tree)
plot(pruning$size, pruning$dev, type="b"); grid()
cart.1.pruned <- tree::prune.tree(tree.tree, best=4)
plot(cart.1.pruned)
text(cart.1.pruned, pretty=0, all=TRUE)
mean(predict(cart.1.pruned, type="class")==DVl)
## [1] 0.85
detach(package:tree)
library(rpart)
plot(tree.rpart <- rpart(DVl ~ P1l+P2l+P3l), margin=0.1)
text(tree.rpart, all=TRUE, use.n=TRUE)
mean(predict(tree.rpart, type="class")==DVs)
## [1] 1
## but does this go away when we prune the tree using misclassifications?
plotcp(tree.rpart) # there is no value below the dotted line, thus, we pick 0.12
tree.rpart.pruned <- prune(tree.rpart, cp=0.12) # what we decided on in the previous code
plot(tree.rpart.pruned, margin=0.1)
text(tree.rpart.pruned, pretty=TRUE, use.n=TRUE)
mean(predict(tree.rpart.pruned, type="class")==DVs)
## [1] 0.85
detach(package:rpart)
library(party)
plot(tree.ctree <- party::ctree(DVl ~ P1l+P2l+P3l))
mean(predict(tree.ctree)==DVs)
## [1] 1
detach(package:party)
library(partykit)
plot(tree.partykit <- ctree(DVl ~ P1l+P2l+P3l))
mean(predict(tree.partykit)==DVl)
## [1] 1
detach(package:partykit)
# Section 2.3: Random / conditional inference forests to the rescue? ##############################
library(randomForest)
## compute a random forest on data set 1 small
(rfs <- randomForest(DVs ~ P1s+P2s+P3s, mtry=2))
##
## Call:
## randomForest(formula = DVs ~ P1s + P2s + P3s, mtry = 2)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 10%
## Confusion matrix:
## x y class.error
## x 9 1 0.1
## y 1 9 0.1
mean(predict(rfs)==DVs) # compute prediction accuracy
## [1] 0.9
importance(rfs) # compute variable importance scores
## MeanDecreaseGini
## P1s 1.651851
## P2s 2.950552
## P3s 3.618863
### compute partial dependency scores
(ps.1 <- partialPlot(rfs, D1, "P1s", ylim=c(-0.7, 0.7), ylab="Impact on DV='x'"))
## $x
## [1] "a" "b"
##
## $y
## [1] 0.6987539 -0.4461811
(ps.2 <- partialPlot(rfs, D1, "P2s", ylim=c(-0.7, 0.7), ylab="Impact on DV='x'"))
## $x
## [1] "e" "f"
##
## $y
## [1] 0.3658665 -0.3192668
(ps.3 <- partialPlot(rfs, D1, "P3s", ylim=c(-0.7, 0.7), ylab="Impact on DV='x'"))
## $x
## [1] "m" "n"
##
## $y
## [1] 0.025000839 0.004314669
plot(
c(ps.1$y, ps.2$y, ps.3$y), xlab="",
ylim=c(-0.7, 0.7), ylab="Impact on the response being 'x'",
type="n", axes=FALSE)
text(1:6, c(ps.1$y, ps.2$y, ps.3$y), c(ps.1$x, ps.2$x, ps.3$x))
grid(); abline(h=0, lty=2); axis(2)
axis(1, at=c(1.5, 3.5, 5.5), labels=paste0("Predictor", 1:3, "'s levels"))
## compute a random forest on data set 1 large
(rfl <- randomForest(DVl ~ P1l+P2l+P3l, mtry=2))
##
## Call:
## randomForest(formula = DVl ~ P1l + P2l + P3l, mtry = 2)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 0%
## Confusion matrix:
## x y class.error
## x 100 0 0
## y 0 100 0
mean(predict(rfl)==DVl) # compute prediction accuracy
## [1] 1
importance(rfl) # compute variable importance scores
## MeanDecreaseGini
## P1l 12.93793
## P2l 20.09449
## P3l 51.40719
### compute partial dependency scores
(pl.1 <- partialPlot(rfl, D2, "P1l", ylim=c(-6, 6), ylab="Impact on DV='x'"))
## $x
## [1] "a" "b"
##
## $y
## [1] 5.583007 -5.111478
(pl.2 <- partialPlot(rfl, D2, "P2l", ylim=c(-6, 6), ylab="Impact on DV='x'"))
## $x
## [1] "e" "f"
##
## $y
## [1] 4.409282 -5.749225
(pl.3 <- partialPlot(rfl, D2, "P3l", ylim=c(-6, 6), ylab="Impact on DV='x'"))
## $x
## [1] "m" "n"
##
## $y
## [1] 0.13265772 0.06094591
plot(
c(pl.1$y, pl.2$y, pl.3$y), xlab="",
ylim=c(-6, 6), ylab="Impact on the response being 'x'",
type="n", axes=FALSE)
text(1:6, c(pl.1$y, pl.2$y, pl.3$y), c(pl.1$x, pl.2$x, pl.3$x))
grid(); abline(h=0, lty=2); axis(2)
axis(1, at=c(1.5, 3.5, 5.5), labels=paste0("Predictor", 1:3, "'s levels"))
detach(package:randomForest)
library(party)
## compute a conditional random forest on data set 1 small
cfs <- cforest(DVs ~ P1s+P2s+P3s, controls=cforest_control(mtry=ceiling(sqrt(3))))
mean(predict(cfs)==DVs) # compute prediction accuracy
## [1] 0.7
### compute variable importance scores
varimp(cfs)
## P1s P2s P3s
## 0.053528571 -0.008375469 -0.011546970
varimp(cfs, conditional=TRUE)
## P1s P2s P3s
## 0.04797619 -0.01015079 -0.01155570
# varimpAUC(cfs)
# varimpAUC(cfs, conditional=TRUE)
## compute a random forest on data set 1 large
cfl <- cforest(DVl ~ P1l+P2l+P3l, controls=cforest_control(mtry=ceiling(sqrt(3))))
mean(predict(cfl)==DVl) # compute prediction accuracy
## [1] 1
### compute variable importance scores
varimp(cfl)
## P1l P2l P3l
## 0.1438758 0.2251775 0.3229000
varimp(cfl, conditional=TRUE)
## P1l P2l P3l
## 0.01227272 0.20476215 0.31794746
varimpAUC(cfl)
## P1l P2l P3l
## 0.1562850 0.2571000 0.3534203
varimpAUC(cfl, conditional=TRUE)
## P1l P2l P3l
## 0.01518716 0.23984828 0.35596357
detach(package:party)
library(partykit)
## compute a conditional random forest on data set 1 small
cfs <- cforest(DVs ~ P1s+P2s+P3s)
mean(predict(cfs)==DVs) # compute prediction accuracy
## [1] 0.5
### compute variable importance scores
varimp(cfs)
## named numeric(0)
varimp(cfs, conditional=TRUE)
## named numeric(0)
## compute a random forest on data set 1 large
cfl <- cforest(DVl ~ P1l+P2l+P3l)
mean(predict(cfl)==DVl) # compute prediction accuracy
## [1] 1
### compute variable importance scores
varimp(cfl)
## P1l P2l P3l
## 2.311940 4.010550 5.147123
varimp(cfl, conditional=TRUE)
## P1l P2l P3l
## 0.1238174 3.7095522 5.0656456
detach(package:partykit)
# Section 3.1: Improving trees' and forests' performance: interaction variables as predictors #####
## create interaction variables for both data sets
P1sxP2s <- P1s:P2s; P1sxP3s <- P1s:P3s; P2sxP3s <- P2s:P3s
summary(D1 <- cbind(D1, P1sxP2s, P1sxP3s, P2sxP3s))
## P1s P2s P3s DVs P1sxP2s P1sxP3s P2sxP3s
## a:10 e:10 m:12 x:10 a:e:8 a:m:5 e:m:6
## b:10 f:10 n: 8 y:10 a:f:2 a:n:5 e:n:4
## b:e:2 b:m:7 f:m:6
## b:f:8 b:n:3 f:n:4
P1lxP2l <- P1l:P2l; P1lxP3l <- P1l:P3l; P2lxP3l <- P2l:P3l
summary(D2 <- cbind(D2, P1lxP2l, P1lxP3l, P2lxP3l))
## P1l P2l P3l DVl P1lxP2l P1lxP3l P2lxP3l
## a:100 e:100 m:120 x:100 a:e:80 a:m:50 e:m:60
## b:100 f:100 n: 80 y:100 a:f:20 a:n:50 e:n:40
## b:e:20 b:m:70 f:m:60
## b:f:80 b:n:30 f:n:40
## how do CART approaches handle the small data set with the interaction variables?
plot(tree.tree <- tree::tree(DVs ~ P1s+P2s+P3s+P1sxP2s+P1sxP3s+P2sxP3s))
text(tree.tree, pretty=0)
mean(predict(tree.tree, type="class")==DVs)
## [1] 1
plot(tree.rpart <- rpart::rpart(DVs ~ P1s+P2s+P3s+P1sxP2s+P1sxP3s+P2sxP3s), margin=0.1)
text(tree.rpart, all=TRUE, use.n=TRUE)
mean(predict(tree.rpart, type="class")==DVs)
## [1] 1
plot(tree.party <- party::ctree(DVs ~ P1s+P2s+P3s+P1sxP2s+P1sxP3s+P2sxP3s))
mean(predict(tree.party)==DVs)
## [1] 1
plot(tree.partykit <- partykit::ctree(DVs ~ P1s+P2s+P3s+P1sxP2s+P1sxP3s+P2sxP3s))
mean(predict(tree.partykit)==DVs)
## [1] 1
## how do CART approaches handle the large data set with the interaction variables?
plot(tree.tree <- tree::tree(DVl ~ P1l+P2l+P3l+P1lxP2l+P1lxP3l+P2lxP3l))
text(tree.tree, pretty=0)
mean(predict(tree.tree, type="class")==DVs)
## [1] 1
plot(tree.rpart <- rpart::rpart(DVl ~ P1l+P2l+P3l+P1lxP2l+P1lxP3l+P2lxP3l), margin=0.1)
text(tree.rpart, all=TRUE, use.n=TRUE)
mean(predict(tree.rpart, type="class")==DVs)
## [1] 1
plot(tree.party <- party::ctree(DVl ~ P1l+P2l+P3l+P1lxP2l+P1lxP3l+P2lxP3l))
mean(predict(tree.party)==DVs)
## [1] 1
plot(tree.partykit <- partykit::ctree(DVl ~ P1l+P2l+P3l+P1lxP2l+P1lxP3l+P2lxP3l))
mean(predict(tree.partykit)==DVs)
## [1] 1
## how do forests react to this?
library(randomForest)
### compute a random forest on data set 1 small
(rfs <- randomForest(DVs ~ P1s+P2s+P3s+P1sxP2s+P1sxP3s+P2sxP3s))
##
## Call:
## randomForest(formula = DVs ~ P1s + P2s + P3s + P1sxP2s + P1sxP3s + P2sxP3s)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 0%
## Confusion matrix:
## x y class.error
## x 10 0 0
## y 0 10 0
mean(predict(rfs)==DVs) # compute prediction accuracy
## [1] 1
importance(rfs) # compute variable importance scores
## MeanDecreaseGini
## P1s 0.3922734
## P2s 0.4098387
## P3s 0.5992698
## P1sxP2s 1.2970844
## P1sxP3s 1.7998302
## P2sxP3s 4.7386206
### compute partial dependency scores for the by far most important effect
(ps.23 <- partialPlot(rfs, D1, "P2sxP3s", ylab="Impact on DV='x'"))
## $x
## [1] "e:m" "e:n" "f:m" "f:n"
##
## $y
## [1] 1.1554860 -0.7337640 -5.4989573 0.6205407
## compute a random forest on data set 1 large
(rfl <- randomForest(DVl ~ P1l+P2l+P3l+P1lxP2l+P1lxP3l+P2lxP3l))
##
## Call:
## randomForest(formula = DVl ~ P1l + P2l + P3l + P1lxP2l + P1lxP3l + P2lxP3l)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 0%
## Confusion matrix:
## x y class.error
## x 100 0 0
## y 0 100 0
mean(predict(rfl)==DVl) # compute prediction accuracy
## [1] 1
importance(rfl) # compute variable importance scores
## MeanDecreaseGini
## P1l 2.619442
## P2l 3.887269
## P3l 6.185997
## P1lxP2l 11.790335
## P1lxP3l 17.366405
## P2lxP3l 53.361195
### compute partial dependency scores for the by far most important effect
(pl.23 <- partialPlot(rfl, D2, "P2lxP3l", ylab="Impact on DV='x'"))
## $x
## [1] "e:m" "e:n" "f:m" "f:n"
##
## $y
## [1] 5.0394694 -0.8466043 -5.4883717 0.8304328
detach(package:randomForest)
library(party)
## compute a conditional random forest on data set 1 small
cfs <- cforest(DVs ~ P1s+P2s+P3s+P1sxP2s+P1sxP3s+P2sxP3s,
controls=cforest_control(mtry=ceiling(sqrt(6))))
mean(predict(cfs)==DVs) # compute prediction accuracy
## [1] 1
### compute variable importance scores
varimp(cfs)
## P1s P2s P3s P1sxP2s P1sxP3s P2sxP3s
## 0.0051071429 -0.0002142857 -0.0037341270 0.0136595238 0.0513515873 0.2023887446
varimp(cfs, conditional=TRUE)
## P1s P2s P3s P1sxP2s P1sxP3s P2sxP3s
## -0.0023666667 0.0005714286 -0.0022950938 0.0072666667 0.0460063492 0.1783181818
# varimpAUC(cfs)
# varimpAUC(cfs, conditional=TRUE)
## compute a conditional random forest on data set 1 large
cfl <- cforest(DVl ~ P1l+P2l+P3l+P1lxP2l+P1lxP3l+P2lxP3l,
controls=cforest_control(mtry=ceiling(sqrt(6))))
mean(predict(cfl)==DVl) # compute prediction accuracy
## [1] 1
### compute variable importance scores
varimp(cfl)
## P1l P2l P3l P1lxP2l P1lxP3l P2lxP3l
## 0.02488977 0.02004302 0.03594787 0.07913391 0.10344119 0.32938993
varimp(cfl, conditional=TRUE)
## P1l P2l P3l P1lxP2l P1lxP3l P2lxP3l
## 0.002842231 0.012528873 0.024065912 0.038540707 0.051880157 0.302427169
varimpAUC(cfl)
## P1l P2l P3l P1lxP2l P1lxP3l P2lxP3l
## 0.02529408 0.02071179 0.03670507 0.07787176 0.10037292 0.32650967
varimpAUC(cfl, conditional=TRUE)
## P1l P2l P3l P1lxP2l P1lxP3l P2lxP3l
## 0.003667529 0.013081199 0.026005910 0.038646316 0.054449572 0.309123440
detach(package:party)
library(partykit)
## compute a conditional random forest on data set 1 small
cfs <- cforest(DVs ~ P1s+P2s+P3s+P1sxP2s+P1sxP3s+P2sxP3s)
mean(predict(cfs)==DVs) # compute prediction accuracy
## [1] 0.5
### compute variable importance scores
varimp(cfs)
## named numeric(0)
varimp(cfs, conditional=TRUE)
## named numeric(0)
## compute a conditional random forest on data set 1 large
cfl <- cforest(DVl ~ P1l+P2l+P3l+P1lxP2l+P1lxP3l+P2lxP3l)
mean(predict(cfl)==DVl) # compute prediction accuracy
## [1] 1
### compute variable importance scores
varimp(cfl)
## P1l P2l P3l P1lxP2l P1lxP3l P2lxP3l
## 2.682614 2.508443 3.027808 3.399461 3.445935 6.588357
# varimp(cfl, conditional=TRUE)
detach(package:partykit)
## find interactions
library(randomForestSRC)
rf.src <- rfsrc(DVl ~ P1l+P2l+P3l, data=D2)
find.interaction(rf.src, method="vimp")
## Pairing P1l with P2l
## Pairing P1l with P3l
## Pairing P2l with P3l
##
## Method: vimp
## No. of variables: 3
## Variables sorted by VIMP?: TRUE
## No. of variables used for pairing: 3
## Total no. of paired interactions: 3
## Monte Carlo replications: 1
## Type of noising up used for VIMP: permute
##
## Var 1 Var 2 Paired Additive Difference
## P1l:P2l 0.1277 0.3724 0.4815 0.5000 -0.0185
## P1l:P3l 0.1277 0.4144 0.4833 0.5420 -0.0587
## P2l:P3l 0.3698 0.4144 0.4371 0.7842 -0.3471
# note the big difference returned for P2l:P3l
detach(package:randomForestSRC)
library(edarf)
partial_dependence(rf.src, vars=c("P1l"), n=c(10, 20)) # partial dependence of P1l
## P1l x y
## 1 a 0.6572153 0.3427847
## 2 b 0.3031674 0.6968326
partial_dependence(rf.src, vars=c("P1l", "P2l"), interaction=TRUE, data=D2) # partial dep. of interaction P1l:P2l
## P1l P2l x y
## 1 a e 0.6113153 0.3886847
## 2 a f 0.8579153 0.1420847
## 3 b e 0.5822289 0.4177711
## 4 b f 0.3959764 0.6040236
partial_dependence(rf.src, vars=c("P2l", "P3l"), interaction=TRUE, data=D2) # partial dep. of interaction P2l:P3l
## P2l P3l x y
## 1 e m 0.96972505 0.03027495
## 2 e n 0.03734273 0.96265727
## 3 f m 0.40129876 0.59870124
## 4 f n 0.96541644 0.03458356
# note how in the latter, three predicted probabilities of x and y exceed 95%, that's the interaction
# Section 3.2: Representing random forests: representative trees ##################################
forest.4.reprtree <- randomForest::randomForest(DVl ~ P1l+P2l+P3l)
plot(reptree <- reprtree::ReprTree(forest.4.reprtree, D2, metric="d2"))
# Section 3.3: Representing random forests: effects (plots) #######################################
## example with a version of D2 without the interactions (for the sake of simplicity)
D2.maineffects <- D2[,1:4]
### add to the data a column with the combination of predictor levels
D2.maineffects$COMBO <- apply(D2.maineffects[,1:3], 1, paste, collapse="_")
### add to the data a column that states how frequent each row's predictor combination is (just for housekeeping)
# D2.maineffects$COMBOFREQ <- table(D2.maineffects$COMBO)[D2.maineffects$COMBO]
### create all possible combinations of predictor levels
all.possible.combs <- expand.grid(
P1l=levels(D2.maineffects$P1l),
P2l=levels(D2.maineffects$P2l),
P3l=levels(D2.maineffects$P3l)
)
### add to the possible combinations a column with the combination of predictor levels
all.possible.combs$COMBO <- apply(all.possible.combs, 1, paste, collapse="_")
### add to the possible combinations a column that states how frequent each row's possible combinations is
all.possible.combs$COMBOFREQINREALDATA <- table(D2.maineffects$COMBO)[all.possible.combs$COMBO]
all.possible.combs$COMBOFREQINREALDATA[is.na(all.possible.combs$COMBOFREQINREALDATA)] <- 0
### add to the possible combinations a column that provides the predicted probability of DV=="y"
all.possible.combs$PREDPROB <- predict(
randomForest::randomForest(DVl ~ P1l + P2l + P3l, data=D2.maineffects),
newdata=all.possible.combs, type="prob")[,2]
### the effect of P1l:a on the predicted probability of DV=="y"
with(
subset(all.possible.combs, all.possible.combs$P1l=="a"), {
weighted.mean(PREDPROB, COMBOFREQINREALDATA)
})
## [1] 0.2618
### the effect of P1l:b on the predicted probability of DV=="y"
with(
subset(all.possible.combs, all.possible.combs$P1l=="b"), {
weighted.mean(PREDPROB, COMBOFREQINREALDATA)
})
## [1] 0.7386
### the predictions for P2:P3 (for DV=="y")
with(
subset(all.possible.combs, grepl("e_m", all.possible.combs$COMBO)), {
weighted.mean(PREDPROB, COMBOFREQINREALDATA)
})
## [1] 0.1266667
with(
subset(all.possible.combs, grepl("e_n", all.possible.combs$COMBO)), {
weighted.mean(PREDPROB, COMBOFREQINREALDATA)
})
## [1] 0.574
with(
subset(all.possible.combs, grepl("f_m", all.possible.combs$COMBO)), {
weighted.mean(PREDPROB, COMBOFREQINREALDATA)
})
## [1] 0.916
with(
subset(all.possible.combs, grepl("f_n", all.possible.combs$COMBO)), {
weighted.mean(PREDPROB, COMBOFREQINREALDATA)
})
## [1] 0.363
# Section 3.4: Representing random forests: global surrogate models ###############################
library(MASS)
predicted.categories <- predict(rfl, type="class")
summary(gsm <- stepAIC(glm(predicted.categories ~ P1l*P2l*P3l, family=binomial)))
## [omitted some messages/output, STG]
## Call:
## glm(formula = predicted.categories ~ P2l + P3l + P2l:P3l, family = binomial)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.409e-06 -2.409e-06 0.000e+00 2.409e-06 2.409e-06
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -26.57 45975.41 -0.001 1.000
## P2lf 53.13 65019.06 0.001 0.999
## P3ln 53.13 72693.41 0.001 0.999
## P2lf:P3ln -106.26 102804.08 -0.001 0.999
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2.7726e+02 on 199 degrees of freedom
## Residual deviance: 1.1603e-09 on 196 degrees of freedom
## AIC: 8
##
## Number of Fisher Scoring iterations: 25
## because of the perfect fit in the simulated data, we're running into convergence problems and are getting ns results with huge CIs - in real data, this does not happen, see below
## perfect accuracy
table(
ifelse(predict(gsm)>=0, "y", "x"), # the dichotomized predictions from the gsm
predicted.categories) # the actually predicted categories from the random forest
## predicted.categories
## x y
## x 100 0
## y 0 100
## recovers the interaction and nothing else: generate an effects plot of the gsm
plot(effects::allEffects(gsm), type="response",
ylim=c(0, 1), ylab="Predicted probability of DV=y",
multiline=TRUE, grid=TRUE)
## demonstration that, with real data, gsms do not run into the convergence issues from above
rm(list=ls(all=TRUE)); set.seed(314)
# generate the clause-orders data from Gries (2013: Section 5.3)
x<-structure(list(
CASE=c(4777,1698,953,1681,4055,967,2658,2502,4455,2368,1729,4546,586,4608,935,2108,4317,4318,2385,4723,3267,4558,1747,623,4507,138,80,952,2602,599,4037,4467,4324,2928,4728,4473,1688,3805,2472,84,997,998,988,474,4737,4523,4805,4808,4802,1063,4841,4769,4531,1677,1649,4813,4882,4293,4544,4763,4552,4881,3024,2614,4069,4278,4861,2517,4509,941,4838,2619,2530,2485,4878,4495,4458,4468,4859,4610,1017,4568,1638,1256,1938,1724,964,1784,1661,619,4503,4604,4778,3617,450,1117,2383,2018,2574,131,2382,4672,2017,3570,5052,4897,1783,4670,3926,3701,4974,2822,5051,452,1184,3489,4896,3616,2820,3925,2818,5053,2825,1118,2576,141,2819,2456,4673,1182,904,4898,2648,4671,905,4899,130,454,2575,4675,2834,2849,1855,2838,4151,2459,4154,4148,2024,456,1285,2832,62,4144,10,4453,3764,2839,2830,4149,4155,4145,2835,4159,9,2020,2846,3927,2841,223,4143,4156,458,4680,3210,4674,4679,4677,3490,2579,4681,2845,1555,63,3928,2844,2847,3763,2842,2831,2577,3211,4902,2837,2458,4146,4152,4153,2827,4147,2457,2840,1186,4452,2731,332,3547,4092,4078,2409,326,1795,823,2754,4646,2751,4341,324,4087,2746,1796,3546,4633,1809,1432,327,330,341,1799,329,1808,4339,2421,1103,2742,3889,2552,4629,4081,4090,2728,122,1307,2752,4110,2662,2732,4099,351,4097,4089,2729,3171,3166,821,2005,1441,4335,349,316,3542,2550,2748,825,3552,3184,3993,2,2736,3173,3176,1804,4338,4094,2422,2741,4080,4084,3545,1442,3179,1304,3177,3549,3543,1100,1309,2740,328,4100,2418,2554,1810,1440,2423,4098,3170,2758,2405,1779,4107,1803,1308,1811,101,3178,3994,2420,4642,333,339,2663,3169,2543,2553,322,1272,321,4334,2756,4648,2750,3551,4108,3744,2749,822,4103,323,102,2664,3746,4105,1434,348,347,3550,4085,1104,4637,4636,343,345,3168,103,4639,3845,3548,2747,3181,1429,4643,2735,1793,1439,1792,1305,831,2406,4640,2734,4079,4101,820,1800,824,2226,4631,3890,2006,3167,2545,4102,342,3745,352,3180,4093,2551,2425,4106,2640,5043,4647,2737,1433,1797,4109,1798,354,355,2743,3182,4638,2757,829,318,4644,1794,4095,2733,350,208),
ORDER=structure(c(2,1,2,1,2,2,1,2,2,2,2,2,2,1,2,2,1,1,2,1,1,1,2,2,2,2,2,1,1,2,2,2,1,1,2,1,2,1,2,2,1,2,2,2,1,1,2,1,1,2,2,2,1,1,2,2,2,1,2,1,1,1,1,2,2,2,1,2,2,2,2,2,1,2,1,2,2,2,2,2,1,2,1,1,1,1,1,2,1,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,2,2,1,1,2,2,2,1,1,1,2,1,2,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,2,2,2,1,1,2,2,2,1,1,2,1,2,2,1,2,1,1,1,2,2,2,2,1,2,1,1,1,2,2,2,1,2,1,2,2,1,2,1,2,2,2,2,2,2,2,2,2,2,1,1,2,2,2,1,2,1,1,2,2,1,2,2,1,1,2,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,2,1,1,1,1,1,1,1,2,2,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2),.Label=c("mc-sc","sc-mc"),class="factor"),
SUBORDTYPE=structure(c(2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),.Label=c("caus","temp"),class="factor"),
LEN_MC=c(4,7,12,6,9,9,9,4,6,4,15,11,10,5,6,9,10,8,13,6,14,5,6,10,14,11,7,4,2,7,24,4,5,6,5,4,18,11,5,10,3,5,10,8,4,5,4,6,6,18,3,10,10,4,10,10,7,8,15,6,11,10,7,9,10,10,5,16,8,12,17,6,7,17,5,10,9,7,5,9,3,7,5,4,4,3,6,5,8,12,11,4,4,12,17,3,3,8,3,5,5,4,11,11,8,7,11,10,7,19,9,7,6,10,23,5,24,6,12,10,9,9,17,4,7,11,13,6,10,18,6,4,6,9,16,15,10,17,5,4,6,13,8,10,3,16,12,7,7,17,18,10,8,8,16,7,14,6,13,3,14,12,15,9,8,11,7,6,15,6,6,12,12,5,12,10,22,17,15,15,3,13,5,31,5,11,7,12,8,4,4,8,5,9,29,6,8,10,5,22,13,10,9,28,9,9,19,7,8,6,6,7,5,10,15,13,8,4,5,9,8,6,6,11,10,14,8,9,3,6,9,12,5,16,5,16,7,8,14,8,4,3,8,13,15,5,7,9,9,17,6,9,12,4,7,6,15,9,14,10,17,6,7,11,8,5,5,24,6,7,11,26,11,17,19,6,9,16,17,11,7,5,5,6,9,11,5,6,7,7,4,8,13,10,14,6,3,7,6,9,7,18,10,16,31,4,5,9,8,4,9,5,7,6,6,4,23,10,6,10,11,9,27,7,14,7,3,4,12,15,7,6,10,6,9,3,9,3,7,14,10,3,6,11,20,15,4,15,3,18,5,15,5,12,13,7,6,7,9,4,9,10,5,5,6,6,8,5,9,5,5,11,12,10,8,8,11,15,7,4,7,8,8,5,8,5,7,7,19,9,17,8,24,9,7,7,3,9,13,10,8,8,9),
LEN_SC=c(10,6,7,15,5,5,12,2,24,11,4,3,11,11,6,11,11,7,7,13,4,3,4,5,20,5,3,5,9,6,8,10,17,11,9,6,18,15,4,20,14,3,12,11,4,10,4,5,7,7,4,7,4,7,22,7,12,22,6,9,2,6,17,4,2,10,10,4,3,2,8,4,17,4,10,7,5,6,2,7,14,16,5,5,3,3,3,6,5,6,8,3,10,5,14,7,5,5,4,8,6,6,6,9,12,2,19,6,10,13,5,9,13,7,6,8,5,5,9,11,6,8,4,3,3,4,4,13,17,8,4,3,4,8,5,3,5,6,9,8,12,14,7,14,8,5,9,21,14,7,14,7,20,21,15,16,18,4,11,7,8,15,10,6,29,6,14,8,13,5,9,11,21,5,10,12,17,8,10,7,4,14,4,13,4,11,9,5,10,12,6,8,8,7,4,9,5,6,14,7,4,15,6,12,4,7,7,15,7,13,9,11,23,7,14,4,8,17,8,10,22,9,9,20,8,5,5,10,7,15,7,2,5,7,5,15,6,9,19,11,4,3,8,16,6,13,7,7,20,18,8,8,4,11,13,10,7,6,12,7,23,6,10,10,5,6,7,11,10,6,9,13,10,17,10,8,8,13,12,9,5,10,6,18,13,18,10,13,5,8,8,5,15,9,13,9,8,10,5,16,10,9,10,6,16,19,6,4,5,21,29,21,8,5,7,9,5,13,5,7,17,16,17,12,4,9,11,12,9,15,7,9,5,5,6,3,5,5,10,8,6,7,5,11,19,5,11,14,26,5,4,5,10,17,9,8,5,9,10,36,9,16,8,20,14,6,17,12,5,7,6,9,13,15,10,20,8,17,4,3,9,4,8,6,14,2,5,14,9,18,18,9,6,7,6,6,11,11,23,7,5,11,6),
LENGTH_DIFF=c(-6,1,5,-9,4,4,-3,2,-18,-7,11,8,-1,-6,0,-2,-1,1,6,-7,10,2,2,5,-6,6,4,-1,-7,1,16,-6,-12,-5,-4,-2,0,-4,1,-10,-11,2,-2,-3,0,-5,0,1,-1,11,-1,3,6,-3,-12,3,-5,-14,9,-3,9,4,-10,5,8,0,-5,12,5,10,9,2,-10,13,-5,3,4,1,3,2,-11,-9,0,-1,1,0,3,-1,3,6,3,1,-6,7,3,-4,-2,3,-1,-3,-1,-2,5,2,-4,5,-8,4,-3,6,4,-2,-7,3,17,-3,19,1,3,-1,3,1,13,1,4,7,9,-7,-7,10,2,1,2,1,11,12,5,11,-4,-4,-6,-1,1,-4,-5,11,3,-14,-7,10,4,3,-12,-13,1,-9,-4,2,2,-4,6,-3,5,3,-21,5,-7,-2,2,1,-3,1,-9,0,2,-2,5,9,5,8,-1,-1,1,18,1,0,-2,7,-2,-8,-2,0,-3,2,25,-3,3,4,-9,15,9,-5,3,16,5,2,12,-8,1,-7,-3,-4,-18,3,1,9,0,-13,-3,-1,-14,-3,-3,-9,2,9,3,-1,-4,-9,2,10,0,9,0,1,1,-1,-5,-3,0,0,0,-3,9,-8,0,2,-11,-1,-2,1,8,-7,-6,-4,8,3,2,3,-6,0,-3,1,3,-1,-2,13,-4,1,2,13,1,0,9,-2,1,3,5,2,2,-5,-1,-12,-4,-7,-5,-7,2,-1,-4,3,-2,1,1,-3,-5,-3,1,-7,-3,9,0,10,15,-15,-1,5,3,-17,-20,-16,-1,1,-1,-5,18,-3,1,3,-6,-7,10,-5,10,-2,-8,-8,3,0,0,-3,5,1,3,0,4,-2,-3,6,4,-4,1,0,1,10,-7,1,-23,13,1,10,-5,-5,4,-1,1,-2,-1,-32,0,-6,-3,-15,-8,0,-9,-7,4,-2,-1,2,-1,-5,-2,-12,3,-2,3,1,-2,4,0,-1,-6,3,2,-7,10,-9,-1,-1,18,2,1,1,-8,-2,-10,3,3,-3,3),
CONJ=structure(c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4),.Label=c("als/when","bevor/before","nachdem/after","weil/because"),class="factor"),
MORETHAN2CL=structure(c(1,1,2,1,2,2,1,2,1,1,2,1,2,2,2,1,1,1,1,1,2,2,2,2,2,1,2,2,1,2,2,1,2,1,2,2,2,2,1,2,1,1,1,2,2,2,2,1,1,2,2,2,1,2,2,1,2,2,2,2,1,1,2,2,1,1,1,1,1,1,1,1,2,2,2,1,2,2,2,2,2,2,2,1,2,1,1,2,1,2,1,1,2,2,2,2,2,2,2,1,1,2,2,1,1,2,2,1,1,1,1,1,2,1,2,1,2,2,2,2,1,1,2,2,1,1,1,1,1,1,2,1,2,1,1,2,2,1,1,1,2,2,2,2,2,2,2,1,2,1,2,1,1,1,2,2,2,2,2,2,2,1,1,2,2,2,1,1,2,1,2,2,2,1,1,2,1,1,1,1,2,1,1,1,2,1,2,2,2,2,2,2,1,1,1,2,1,2,2,1,2,1,1,2,2,1,2,2,1,1,2,2,2,1,1,1,1,1,1,2,1,1,2,2,1,1,2,2,1,2,2,1,2,1,2,2,2,2,1,1,2,2,2,1,2,1,2,2,2,2,2,2,2,1,1,1,1,2,1,1,2,1,1,2,1,2,2,1,2,2,2,2,1,1,2,1,1,1,1,2,2,1,2,2,2,2,2,2,2,1,2,1,2,2,2,2,1,1,1,1,2,1,1,2,2,2,1,2,1,1,1,1,2,2,2,1,1,2,2,1,1,1,2,2,1,1,1,2,2,2,2,2,1,1,2,2,2,1,1,1,1,2,2,2,1,2,2,2,2,1,1,1,1,2,2,1,2,2,1,2,1,1,1,1,1,2,1,1,1,2,1,2,1,1,2,1,2,1,2,2,1,1,2,1,2,1,2,2,2,2,2,2,2,2,1,2,2,1,1,2,2,1,2),.Label=c("no","yes"),class="factor")),
class="data.frame",row.names=c(NA,-403))
x$LEN_MC <- log2(x$LEN_MC)
x$LEN_SC <- log2(x$LEN_SC)
x$LENGTH_DIFF <- x$LEN_MC-x$LEN_SC
x$SUBORDTYPE <- factor(x$SUBORDTYPE,
levels=levels(x$SUBORDTYPE)[2:1])
x$CONJ <- factor(x$CONJ,
levels=levels(x$CONJ)[c(4,2,1,3)])
### fit a random forest on these data and add categorical as well as numeric predictions to the data
rf <- randomForest::randomForest(ORDER ~ SUBORDTYPE + LEN_MC + LEN_SC + LENGTH_DIFF + CONJ + MORETHAN2CL, data=x)
x$PREDCAT <- predict(rf, type="class")
x$PREDNUM <- car::logit(predict(rf, type="prob")[,"sc-mc"], adjust=0.01)
### fit a gsm on the numeric predictions from the rf
summary(gsm.num.01 <- lm(PREDNUM ~ 1, data=x))
##
## Call:
## lm(formula = PREDNUM ~ 1, data = x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0903 -2.1124 0.2595 1.7787 5.6928
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.5048 0.1058 -14.22 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.124 on 402 degrees of freedom
(temp <- add1(gsm.num.01,
scope= ~ (SUBORDTYPE + LEN_MC + LEN_SC + LENGTH_DIFF + CONJ + MORETHAN2CL)^2,
test="F"))
## Single term additions
##
## Model:
## PREDNUM ~ 1
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 1813.56 608.16
## SUBORDTYPE 1 1317.90 495.66 87.40 1066.2055 < 2.2e-16 ***
## LEN_MC 1 4.43 1809.13 609.17 0.9822 0.32226
## LEN_SC 1 80.55 1733.00 591.85 18.6395 1.992e-05 ***
## LENGTH_DIFF 1 75.20 1738.36 593.09 17.3467 3.816e-05 ***
## CONJ 3 1344.82 468.73 68.89 381.5857 < 2.2e-16 ***
## MORETHAN2CL 1 21.53 1792.03 605.34 4.8183 0.02873 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(gsm.num.02 <- lm(PREDNUM ~ CONJ, data=x))
##
## Call:
## lm(formula = PREDNUM ~ CONJ, data = x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4805 -0.8121 -0.0964 0.5832 3.6394
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.33577 0.07683 -43.42 <2e-16 ***
## CONJbevor/before 2.96025 0.17732 16.70 <2e-16 ***
## CONJals/when 3.88434 0.13614 28.53 <2e-16 ***
## CONJnachdem/after 3.69936 0.15484 23.89 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.084 on 399 degrees of freedom
## Multiple R-squared: 0.7415, Adjusted R-squared: 0.7396
## F-statistic: 381.6 on 3 and 399 DF, p-value: < 2.2e-16
(temp <- add1(gsm.num.02,
scope= ~ (SUBORDTYPE + LEN_MC + LEN_SC + LENGTH_DIFF + CONJ + MORETHAN2CL)^2,
test="F"))
## Single term additions
##
## Model:
## PREDNUM ~ CONJ
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 468.73 68.892
## SUBORDTYPE 0 0.0000 468.73 68.892
## LEN_MC 1 7.3358 461.40 64.535 6.3278 0.0122786 *
## LEN_SC 1 10.6872 458.05 61.597 9.2862 0.0024628 **
## LENGTH_DIFF 1 20.4945 448.24 52.875 18.1975 2.491e-05 ***
## MORETHAN2CL 1 17.3222 451.41 55.717 15.2727 0.0001093 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(gsm.num.03 <- lm(PREDNUM ~ CONJ + LENGTH_DIFF, data=x))
##
## Call:
## lm(formula = PREDNUM ~ CONJ + LENGTH_DIFF, data = x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.2662 -0.7424 -0.1106 0.5625 3.4668
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.30836 0.07550 -43.818 < 2e-16 ***
## CONJbevor/before 2.83875 0.17594 16.135 < 2e-16 ***
## CONJals/when 3.83611 0.13378 28.675 < 2e-16 ***
## CONJnachdem/after 3.66800 0.15179 24.165 < 2e-16 ***
## LENGTH_DIFF 0.22810 0.05347 4.266 2.49e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.061 on 398 degrees of freedom
## Multiple R-squared: 0.7528, Adjusted R-squared: 0.7504
## F-statistic: 303.1 on 4 and 398 DF, p-value: < 2.2e-16
(temp <- add1(gsm.num.03,
scope= ~ (SUBORDTYPE + LEN_MC + LEN_SC + LENGTH_DIFF + CONJ + MORETHAN2CL)^2,
test="F"))
## Single term additions
##
## Model:
## PREDNUM ~ CONJ + LENGTH_DIFF
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 448.24 52.875
## SUBORDTYPE 0 0.0000 448.24 52.875
## LEN_MC 1 0.0483 448.19 54.831 0.0428 0.8362
## LEN_SC 1 0.0483 448.19 54.831 0.0428 0.8362
## MORETHAN2CL 1 18.0923 430.15 38.271 16.6982 5.305e-05 ***
## LENGTH_DIFF:CONJ 3 29.6663 418.57 31.279 9.3319 5.659e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(gsm.num.04 <- lm(PREDNUM ~ CONJ + LENGTH_DIFF + MORETHAN2CL, data=x))
##
## Call:
## lm(formula = PREDNUM ~ CONJ + LENGTH_DIFF + MORETHAN2CL, data = x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4499 -0.7632 -0.1308 0.6080 3.2758
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.53682 0.09279 -38.116 < 2e-16 ***
## CONJbevor/before 2.86177 0.17266 16.575 < 2e-16 ***
## CONJals/when 3.82607 0.13124 29.153 < 2e-16 ***
## CONJnachdem/after 3.65398 0.14892 24.536 < 2e-16 ***
## LENGTH_DIFF 0.23239 0.05246 4.430 1.22e-05 ***
## MORETHAN2CLyes 0.42584 0.10421 4.086 5.30e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.041 on 397 degrees of freedom
## Multiple R-squared: 0.7628, Adjusted R-squared: 0.7598
## F-statistic: 255.4 on 5 and 397 DF, p-value: < 2.2e-16
(temp <- add1(gsm.num.04,
scope= ~ (SUBORDTYPE + LEN_MC + LEN_SC + LENGTH_DIFF + CONJ + MORETHAN2CL)^2,
test="F"))
## Single term additions
##
## Model:
## PREDNUM ~ CONJ + LENGTH_DIFF + MORETHAN2CL
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 430.15 38.271
## SUBORDTYPE 0 0.000 430.15 38.271
## LEN_MC 1 0.006 430.14 40.265 0.0053 0.94175
## LEN_SC 1 0.006 430.14 40.265 0.0053 0.94175
## LENGTH_DIFF:CONJ 3 32.633 397.51 12.476 10.7814 8.004e-07 ***
## LENGTH_DIFF:MORETHAN2CL 1 4.726 425.42 35.818 4.3993 0.03659 *
## CONJ:MORETHAN2CL 3 7.455 422.69 37.225 2.3162 0.07526 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(gsm.num.05 <- lm(PREDNUM ~ CONJ * LENGTH_DIFF + MORETHAN2CL, data=x))
##
## Call:
## lm(formula = PREDNUM ~ CONJ * LENGTH_DIFF + MORETHAN2CL, data = x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1098 -0.7248 -0.1214 0.5886 2.9924
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.584675 0.090046 -39.810 < 2e-16 ***
## CONJbevor/before 3.011301 0.176446 17.066 < 2e-16 ***
## CONJals/when 3.819745 0.126802 30.124 < 2e-16 ***
## CONJnachdem/after 3.678378 0.143838 25.573 < 2e-16 ***
## LENGTH_DIFF -0.008743 0.075435 -0.116 0.90779
## MORETHAN2CLyes 0.460968 0.100898 4.569 6.58e-06 ***
## CONJbevor/before:LENGTH_DIFF -0.046060 0.170478 -0.270 0.78716
## CONJals/when:LENGTH_DIFF 0.619720 0.120111 5.160 3.93e-07 ***
## CONJnachdem/after:LENGTH_DIFF 0.440759 0.152566 2.889 0.00408 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.004 on 394 degrees of freedom
## Multiple R-squared: 0.7808, Adjusted R-squared: 0.7764
## F-statistic: 175.4 on 8 and 394 DF, p-value: < 2.2e-16
(temp <- add1(gsm.num.05,
scope= ~ (SUBORDTYPE + LEN_MC + LEN_SC + LENGTH_DIFF + CONJ + MORETHAN2CL)^2,
test="F"))
## Single term additions
##
## Model:
## PREDNUM ~ CONJ * LENGTH_DIFF + MORETHAN2CL
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 397.51 12.4756
## SUBORDTYPE 0 0.0000 397.51 12.4756
## LEN_MC 1 0.0720 397.44 14.4027 0.0712 0.78976
## LEN_SC 1 0.0720 397.44 14.4027 0.0712 0.78976
## LENGTH_DIFF:MORETHAN2CL 1 5.4045 392.11 8.9590 5.4168 0.02045 *
## CONJ:MORETHAN2CL 3 9.0303 388.48 9.2151 3.0296 0.02934 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(gsm.num.06 <- lm(PREDNUM ~ CONJ * LENGTH_DIFF + MORETHAN2CL + CONJ:MORETHAN2CL, data=x))
##
## Call:
## lm(formula = PREDNUM ~ CONJ * LENGTH_DIFF + MORETHAN2CL + CONJ:MORETHAN2CL,
## data = x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1999 -0.7148 -0.1352 0.5763 3.1280
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.424136 0.104441 -32.785 < 2e-16 ***
## CONJbevor/before 2.697734 0.233250 11.566 < 2e-16 ***
## CONJals/when 3.536397 0.188184 18.792 < 2e-16 ***
## CONJnachdem/after 3.300215 0.216885 15.216 < 2e-16 ***
## LENGTH_DIFF -0.005828 0.074865 -0.078 0.93799
## MORETHAN2CLyes 0.163046 0.141747 1.150 0.25074
## CONJbevor/before:LENGTH_DIFF -0.070114 0.170111 -0.412 0.68044
## CONJals/when:LENGTH_DIFF 0.623488 0.119304 5.226 2.82e-07 ***
## CONJnachdem/after:LENGTH_DIFF 0.470043 0.152615 3.080 0.00222 **
## CONJbevor/before:MORETHAN2CLyes 0.636121 0.328549 1.936 0.05357 .
## CONJals/when:MORETHAN2CLyes 0.516470 0.252265 2.047 0.04129 *
## CONJnachdem/after:MORETHAN2CLyes 0.679252 0.289966 2.343 0.01965 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9968 on 391 degrees of freedom
## Multiple R-squared: 0.7858, Adjusted R-squared: 0.7798
## F-statistic: 130.4 on 11 and 391 DF, p-value: < 2.2e-16
(temp <- add1(gsm.num.06,
scope= ~ (SUBORDTYPE + LEN_MC + LEN_SC + LENGTH_DIFF + CONJ + MORETHAN2CL)^2,
test="F"))
## Single term additions
##
## Model:
## PREDNUM ~ CONJ * LENGTH_DIFF + MORETHAN2CL + CONJ:MORETHAN2CL
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 388.48 9.2151
## SUBORDTYPE 0 0.0000 388.48 9.2151
## LEN_MC 1 0.0953 388.39 11.1162 0.0957 0.75722
## LEN_SC 1 0.0953 388.39 11.1162 0.0957 0.75722
## LENGTH_DIFF:MORETHAN2CL 1 3.7910 384.69 7.2632 3.8433 0.05066 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(effects::allEffects(gsm.num.06), ylab="Logit of pred. prob. of sc-mc", grid=TRUE)
#### these results are quite compatible with what a regular logistic regression on these data would return