library(ggplot2)
library(reshape2)
library(tidyr)
library(broom)
library(sigr)
library(vtreat)
library(dplyr)
library(lme4)
library(nlme)
library(caret)
library(gbm)
library(foreach)
library(doParallel)
library(magrittr)
library(plyr)
library(mosaic)
library(rgp)
head(esoph)
## agegp alcgp tobgp ncases ncontrols
## 1 25-34 0-39g/day 0-9g/day 0 40
## 2 25-34 0-39g/day 10-19 0 10
## 3 25-34 0-39g/day 20-29 0 6
## 4 25-34 0-39g/day 30+ 0 5
## 5 25-34 40-79 0-9g/day 0 27
## 6 25-34 40-79 10-19 0 7
#To get more details about esoph data
summary(esoph)
## agegp alcgp tobgp ncases ncontrols
## 25-34:15 0-39g/day:23 0-9g/day:24 Min. : 0.000 Min. : 1.00
## 35-44:15 40-79 :23 10-19 :24 1st Qu.: 0.000 1st Qu.: 3.00
## 45-54:16 80-119 :21 20-29 :20 Median : 1.000 Median : 6.00
## 55-64:16 120+ :21 30+ :20 Mean : 2.273 Mean :11.08
## 65-74:15 3rd Qu.: 4.000 3rd Qu.:14.00
## 75+ :11 Max. :17.000 Max. :60.00
# To get the number of rows in esoph (N)
(N <- nrow(esoph))
## [1] 88
# To calculate how many rows 75% of N should be
(target <- round(N * 0.75))
## [1] 66
# Create the vector of N uniform random variables: gp
gp <- runif(N)
gp
## [1] 0.84455558 0.75266807 0.87659491 0.08160445 0.22964153 0.46501945
## [7] 0.43888290 0.68947289 0.75853355 0.20274126 0.43576669 0.84643398
## [13] 0.60752594 0.43160490 0.05491417 0.60359378 0.46922520 0.78351553
## [19] 0.44419482 0.20470638 0.76787927 0.64873136 0.31414706 0.43604833
## [25] 0.83742075 0.93028240 0.92365829 0.06415928 0.70467872 0.77326632
## [31] 0.18951858 0.62143217 0.65035832 0.80839303 0.81930239 0.53236792
## [37] 0.14291715 0.90485467 0.16720439 0.44795169 0.62117093 0.87896760
## [43] 0.40420279 0.70132144 0.10062946 0.90405978 0.65723640 0.38109454
## [49] 0.64671354 0.35756566 0.80162525 0.92256414 0.39249797 0.42999366
## [55] 0.03404520 0.66425676 0.55631162 0.40547093 0.35324091 0.51876022
## [61] 0.54337798 0.77003480 0.07210496 0.29021050 0.61989996 0.22352684
## [67] 0.13597678 0.09202109 0.09645160 0.36934047 0.87044296 0.36012224
## [73] 0.44112632 0.50073335 0.06665930 0.50178258 0.34434535 0.75619184
## [79] 0.99150712 0.21970860 0.16021603 0.72905527 0.77010852 0.59204446
## [85] 0.65534637 0.05458133 0.60734861 0.32210665
# Use gp to create the training set: esoph_train (75% of data) and esoph_test (25% of data)
esoph_train <- esoph[gp < 0.75, ]
esoph_test <- esoph[gp >= 0.75, ]
# To examine the number of elements in esoph_train and esoph_test
nrow(esoph_train)
## [1] 65
nrow(esoph_test)
## [1] 23
### I created a formula to express esoph canser ncases as a function of tobgp (tobacco consumption), alcgp(alcohol consumption) and age without interaction
(fmla <- ncases ~ agegp+alcgp+tobgp)
## ncases ~ agegp + alcgp + tobgp
##Now I use lm() to build a model esoph_model from esoph that predicts ncases from agegp+alcgp+tobgp
esoph_add <- lm(fmla, data = esoph_train)
# predict ncases from for the training set
esoph_train$prediction <- predict(esoph_add)
# predict ncases for the test set
esoph_test$prediction <- predict(esoph_add, newdata = esoph_test)
# Evaluate the root mean squared error on both training and test data
(rmse_train <- rmse(esoph_train$prediction, esoph_train$ncases))
## [1] 1.963961
(rmse_test <- rmse(esoph_test$prediction, esoph_test$ncases))
## [1] 1.467446
# Evaluate the rsquared on both training and test data
(rsq_train <- rsquared(esoph_train$prediction, esoph_train$ncases))
## [1] 0.01873516
(rsq_test <- rsquared(esoph_test$prediction, esoph_test$ncases))
## [1] 0.6116932
# Plot the predictions (on the x-axis) against the outcome (esoph cases) on the train data
ggplot(esoph_train, aes(x = prediction, y = ncases)) +
geom_point() +
geom_abline()
library(rpart)
esoph_add <- rpart(ncases ~ ., data = esoph_train, method = "class", control = rpart.control(cp = 0))
esoph_test$predictions <- predict(esoph_add, esoph_test, type = "class")
table(esoph_test$predictions, esoph_test$ncases)
##
## 0 1 2 3 4 5 6 9
## 0 9 0 1 1 0 0 0 0
## 1 1 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0
## 3 1 1 2 0 1 2 2 1
## 4 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0
## 6 0 0 0 0 1 0 0 0
## 8 0 0 0 0 0 0 0 0
## 9 0 0 0 0 0 0 0 0
## 17 0 0 0 0 0 0 0 0
mean(esoph_test$predictions == esoph_test$ncases)
## [1] 0.3913043
## I use summary() to examine the model
summary(esoph_add)
## Call:
## rpart(formula = ncases ~ ., data = esoph_train, method = "class",
## control = rpart.control(cp = 0))
## n= 65
##
## CP nsplit rel error xerror xstd
## 1 0.1914894 0 1.0000000 1.0000000 0.07675924
## 2 0.1276596 1 0.8085106 0.8085106 0.08453161
## 3 0.0212766 2 0.6808511 0.7234043 0.08567730
## 4 0.0000000 4 0.6382979 0.8510638 0.08345367
##
## Variable importance
## agegp prediction ncontrols alcgp tobgp
## 36 29 23 8 4
##
## Node number 1: 65 observations, complexity param=0.1914894
## predicted class=0 expected loss=0.7230769 P(node) =1
## class counts: 18 15 8 8 6 4 3 1 1 1
## probabilities: 0.277 0.231 0.123 0.123 0.092 0.062 0.046 0.015 0.015 0.015
## left son=2 (19 obs) right son=3 (46 obs)
## Primary splits:
## agegp splits as LLRRRR, improve=8.142334, (0 missing)
## prediction < 2.574494 to the left, improve=7.057143, (0 missing)
## ncontrols < 1.5 to the left, improve=4.436364, (0 missing)
## tobgp splits as RRRL, improve=1.423529, (0 missing)
## alcgp splits as LLRR, improve=1.037500, (0 missing)
## Surrogate splits:
## prediction < 0.4694616 to the left, agree=0.892, adj=0.632, (0 split)
##
## Node number 2: 19 observations
## predicted class=0 expected loss=0.2105263 P(node) =0.2923077
## class counts: 15 3 1 0 0 0 0 0 0 0
## probabilities: 0.789 0.158 0.053 0.000 0.000 0.000 0.000 0.000 0.000 0.000
##
## Node number 3: 46 observations, complexity param=0.1276596
## predicted class=1 expected loss=0.7391304 P(node) =0.7076923
## class counts: 3 12 7 8 6 4 3 1 1 1
## probabilities: 0.065 0.261 0.152 0.174 0.130 0.087 0.065 0.022 0.022 0.022
## left son=6 (15 obs) right son=7 (31 obs)
## Primary splits:
## ncontrols < 3.5 to the left, improve=5.3035060, (0 missing)
## prediction < 2.53541 to the left, improve=4.3615710, (0 missing)
## agegp splits as RRRRRL, improve=3.0366130, (0 missing)
## tobgp splits as RRRL, improve=1.1024030, (0 missing)
## alcgp splits as LLRR, improve=0.9472991, (0 missing)
## Surrogate splits:
## prediction < 2.53541 to the left, agree=0.826, adj=0.467, (0 split)
## agegp splits as RRRRRL, agree=0.804, adj=0.400, (0 split)
## alcgp splits as RRRL, agree=0.717, adj=0.133, (0 split)
## tobgp splits as RRRL, agree=0.717, adj=0.133, (0 split)
##
## Node number 6: 15 observations
## predicted class=1 expected loss=0.3333333 P(node) =0.2307692
## class counts: 1 10 4 0 0 0 0 0 0 0
## probabilities: 0.067 0.667 0.267 0.000 0.000 0.000 0.000 0.000 0.000 0.000
##
## Node number 7: 31 observations, complexity param=0.0212766
## predicted class=3 expected loss=0.7419355 P(node) =0.4769231
## class counts: 2 2 3 8 6 4 3 1 1 1
## probabilities: 0.065 0.065 0.097 0.258 0.194 0.129 0.097 0.032 0.032 0.032
## left son=14 (9 obs) right son=15 (22 obs)
## Primary splits:
## ncontrols < 6.5 to the left, improve=1.1407620, (0 missing)
## prediction < 3.072553 to the left, improve=1.0749620, (0 missing)
## alcgp splits as LLRR, improve=0.9721533, (0 missing)
## agegp splits as LLLRRR, improve=0.7892473, (0 missing)
## tobgp splits as RLLL, improve=0.6680352, (0 missing)
## Surrogate splits:
## tobgp splits as RRRL, agree=0.806, adj=0.333, (0 split)
## alcgp splits as RRRL, agree=0.742, adj=0.111, (0 split)
##
## Node number 14: 9 observations
## predicted class=3 expected loss=0.5555556 P(node) =0.1384615
## class counts: 0 1 1 4 3 0 0 0 0 0
## probabilities: 0.000 0.111 0.111 0.444 0.333 0.000 0.000 0.000 0.000 0.000
##
## Node number 15: 22 observations, complexity param=0.0212766
## predicted class=3 expected loss=0.8181818 P(node) =0.3384615
## class counts: 2 1 2 4 3 4 3 1 1 1
## probabilities: 0.091 0.045 0.091 0.182 0.136 0.182 0.136 0.045 0.045 0.045
## left son=30 (15 obs) right son=31 (7 obs)
## Primary splits:
## alcgp splits as LLRR, improve=1.5056280, (0 missing)
## prediction < 3.168174 to the left, improve=0.8008658, (0 missing)
## ncontrols < 16.5 to the right, improve=0.7886558, (0 missing)
## tobgp splits as RLLL, improve=0.7175325, (0 missing)
## agegp splits as LLLRRR, improve=0.6675325, (0 missing)
## Surrogate splits:
## prediction < 4.146616 to the left, agree=0.773, adj=0.286, (0 split)
##
## Node number 30: 15 observations
## predicted class=3 expected loss=0.8 P(node) =0.2307692
## class counts: 2 1 2 3 3 3 0 0 0 1
## probabilities: 0.133 0.067 0.133 0.200 0.200 0.200 0.000 0.000 0.000 0.067
##
## Node number 31: 7 observations
## predicted class=6 expected loss=0.5714286 P(node) =0.1076923
## class counts: 0 0 0 1 0 1 3 1 1 0
## probabilities: 0.000 0.000 0.000 0.143 0.000 0.143 0.429 0.143 0.143 0.000
### The second model is including an additional variable: alcohol and tobacco consumption with interaction
(fmla2 <- ncases ~ agegp+alcgp+tobgp+alcgp:tobgp)
## ncases ~ agegp + alcgp + tobgp + alcgp:tobgp
fmla2
## ncases ~ agegp + alcgp + tobgp + alcgp:tobgp
##Now I use lm() to build a model esoph_model from esoph that predicts ncases
esoph_interaction <- lm(fmla2, data = esoph_train)
esoph_interaction
##
## Call:
## lm(formula = fmla2, data = esoph_train)
##
## Coefficients:
## (Intercept) agegp.L agegp.Q agegp.C
## 1.82562 1.85603 -3.50302 -2.43505
## agegp^4 agegp^5 alcgp.L alcgp.Q
## 0.68603 -0.05258 0.18266 -1.42506
## alcgp.C tobgp.L tobgp.Q tobgp.C
## 0.83068 -1.80968 0.63251 0.51644
## alcgp.L:tobgp.L alcgp.Q:tobgp.L alcgp.C:tobgp.L alcgp.L:tobgp.Q
## -1.11884 1.47633 -0.34827 -0.37799
## alcgp.Q:tobgp.Q alcgp.C:tobgp.Q alcgp.L:tobgp.C alcgp.Q:tobgp.C
## -0.11236 0.69555 0.48398 0.42841
## alcgp.C:tobgp.C
## -2.09980
# predict ncases from tobgp for the training set
esoph_train$prediction <- predict(esoph_interaction)
# predict ncases from tobgp for the test set
esoph_test$prediction <- predict(esoph_interaction, newdata = esoph_test)
# Evaluate the root mean squared error on both training and test data and print them
(rmse_train2 <- rmse(esoph_train$prediction, esoph_train$ncases))
## [1] 1.819777
(rmse_test2 <- rmse(esoph_test$prediction, esoph_test$ncases))
## [1] 1.821575
# Evaluate the rsquared on both training and test data and print them
(rsq_train2 <- rsquared(esoph_train$prediction, esoph_train$ncases))
## [1] 0.2602022
(rsq_test2 <- rsquared(esoph_test$prediction, esoph_test$ncases))
## [1] 0.5767596
# Plot the predictions (on the x-axis) against the outcome (esoph cases) on the train data
ggplot(esoph_train, aes(x = prediction, y = ncases)) +
geom_point() +
geom_abline()
library(rpart)
esoph_interaction <- rpart(ncases ~ ., data = esoph_train, method = "class", control = rpart.control(cp = 0))
esoph_test$predictions <- predict(esoph_interaction, esoph_test, type = "class")
table(esoph_test$predictions, esoph_test$ncases)
##
## 0 1 2 3 4 5 6 9
## 0 9 0 1 1 0 0 0 0
## 1 1 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0
## 3 1 1 2 0 1 1 0 0
## 4 0 0 0 0 0 1 0 0
## 5 0 0 0 0 0 0 0 0
## 6 0 0 0 0 1 0 2 1
## 8 0 0 0 0 0 0 0 0
## 9 0 0 0 0 0 0 0 0
## 17 0 0 0 0 0 0 0 0
mean(esoph_test$predictions == esoph_test$ncases)
## [1] 0.4782609
## I use summary() to examine the model
summary(esoph_interaction)
## Call:
## rpart(formula = ncases ~ ., data = esoph_train, method = "class",
## control = rpart.control(cp = 0))
## n= 65
##
## CP nsplit rel error xerror xstd
## 1 0.19148936 0 1.0000000 1.0000000 0.07675924
## 2 0.12765957 1 0.8085106 0.8723404 0.08278335
## 3 0.04255319 2 0.6808511 0.7446809 0.08551457
## 4 0.02127660 3 0.6382979 0.8297872 0.08403582
## 5 0.00000000 4 0.6170213 0.8297872 0.08403582
##
## Variable importance
## agegp prediction ncontrols tobgp alcgp
## 39 35 20 3 3
##
## Node number 1: 65 observations, complexity param=0.1914894
## predicted class=0 expected loss=0.7230769 P(node) =1
## class counts: 18 15 8 8 6 4 3 1 1 1
## probabilities: 0.277 0.231 0.123 0.123 0.092 0.062 0.046 0.015 0.015 0.015
## left son=2 (19 obs) right son=3 (46 obs)
## Primary splits:
## agegp splits as LLRRRR, improve=8.142334, (0 missing)
## prediction < 2.495813 to the left, improve=5.984470, (0 missing)
## ncontrols < 1.5 to the left, improve=4.436364, (0 missing)
## tobgp splits as RRRL, improve=1.423529, (0 missing)
## alcgp splits as LLRR, improve=1.037500, (0 missing)
## Surrogate splits:
## prediction < 0.08124933 to the left, agree=0.862, adj=0.526, (0 split)
##
## Node number 2: 19 observations
## predicted class=0 expected loss=0.2105263 P(node) =0.2923077
## class counts: 15 3 1 0 0 0 0 0 0 0
## probabilities: 0.789 0.158 0.053 0.000 0.000 0.000 0.000 0.000 0.000 0.000
##
## Node number 3: 46 observations, complexity param=0.1276596
## predicted class=1 expected loss=0.7391304 P(node) =0.7076923
## class counts: 3 12 7 8 6 4 3 1 1 1
## probabilities: 0.065 0.261 0.152 0.174 0.130 0.087 0.065 0.022 0.022 0.022
## left son=6 (15 obs) right son=7 (31 obs)
## Primary splits:
## ncontrols < 3.5 to the left, improve=5.3035060, (0 missing)
## prediction < 2.792507 to the left, improve=5.1435470, (0 missing)
## agegp splits as RRRRRL, improve=3.0366130, (0 missing)
## tobgp splits as RRRL, improve=1.1024030, (0 missing)
## alcgp splits as LLRR, improve=0.9472991, (0 missing)
## Surrogate splits:
## agegp splits as RRRRRL, agree=0.804, adj=0.400, (0 split)
## prediction < 2.495813 to the left, agree=0.804, adj=0.400, (0 split)
## alcgp splits as RRRL, agree=0.717, adj=0.133, (0 split)
## tobgp splits as RRRL, agree=0.717, adj=0.133, (0 split)
##
## Node number 6: 15 observations
## predicted class=1 expected loss=0.3333333 P(node) =0.2307692
## class counts: 1 10 4 0 0 0 0 0 0 0
## probabilities: 0.067 0.667 0.267 0.000 0.000 0.000 0.000 0.000 0.000 0.000
##
## Node number 7: 31 observations, complexity param=0.04255319
## predicted class=3 expected loss=0.7419355 P(node) =0.4769231
## class counts: 2 2 3 8 6 4 3 1 1 1
## probabilities: 0.065 0.065 0.097 0.258 0.194 0.129 0.097 0.032 0.032 0.032
## left son=14 (21 obs) right son=15 (10 obs)
## Primary splits:
## prediction < 4.513865 to the left, improve=1.6463900, (0 missing)
## ncontrols < 6.5 to the left, improve=1.1407620, (0 missing)
## alcgp splits as LLRR, improve=0.9721533, (0 missing)
## agegp splits as LLLRRR, improve=0.7892473, (0 missing)
## tobgp splits as RLLL, improve=0.6680352, (0 missing)
## Surrogate splits:
## alcgp splits as LLRR, agree=0.71, adj=0.1, (0 split)
##
## Node number 14: 21 observations, complexity param=0.0212766
## predicted class=3 expected loss=0.6666667 P(node) =0.3230769
## class counts: 2 2 3 7 5 2 0 0 0 0
## probabilities: 0.095 0.095 0.143 0.333 0.238 0.095 0.000 0.000 0.000 0.000
## left son=28 (8 obs) right son=29 (13 obs)
## Primary splits:
## prediction < 2.964276 to the left, improve=1.49542100, (0 missing)
## agegp splits as LLLRRR, improve=1.03174600, (0 missing)
## alcgp splits as LRRR, improve=0.63982680, (0 missing)
## ncontrols < 14.5 to the left, improve=0.59157510, (0 missing)
## tobgp splits as RRLL, improve=0.03174603, (0 missing)
## Surrogate splits:
## agegp splits as LLLRRR, agree=0.667, adj=0.125, (0 split)
## tobgp splits as RRLL, agree=0.667, adj=0.125, (0 split)
##
## Node number 15: 10 observations
## predicted class=6 expected loss=0.7 P(node) =0.1538462
## class counts: 0 0 0 1 1 2 3 1 1 1
## probabilities: 0.000 0.000 0.000 0.100 0.100 0.200 0.300 0.100 0.100 0.100
##
## Node number 28: 8 observations
## predicted class=3 expected loss=0.625 P(node) =0.1230769
## class counts: 2 2 1 3 0 0 0 0 0 0
## probabilities: 0.250 0.250 0.125 0.375 0.000 0.000 0.000 0.000 0.000 0.000
##
## Node number 29: 13 observations
## predicted class=4 expected loss=0.6153846 P(node) =0.2
## class counts: 0 0 2 4 5 2 0 0 0 0
## probabilities: 0.000 0.000 0.154 0.308 0.385 0.154 0.000 0.000 0.000 0.000
This is the comparison of the performance of the interaction model to the performance of a main-effects only model. Because this data set is small, we will use cross-validation to simulate making predictions on out-of-sample data. Cross-validation predicts how well a model built from all the data will perform on new data. As with the test/train split, for a good modeling procedure, cross-validation performance and training performance should be close.
# Create the formula with main effects only
(esoph_add <- ncases ~ agegp+ alcgp + tobgp)
## ncases ~ agegp + alcgp + tobgp
# Create the formula with interactions
(esoph_interaction <- ncases ~ agegp+alcgp+tobgp+alcgp:tobgp)
## ncases ~ agegp + alcgp + tobgp + alcgp:tobgp
set.seed(34) # set the seed for reproducibility
splitPlan <- kWayCrossValidation(nrow(esoph), 3, NULL, NULL)
# Sample code: Get cross-val predictions for main-effects only model
esoph$predictions_add <- 0 # initialize the prediction vector
for(i in 1:3) {
split <- splitPlan[[i]]
esoph_add <- lm(esoph_add, data = esoph[split$train, ])
esoph$predictions_add[split$app] <- predict(esoph_add, newdata = esoph[split$app, ])
}
# Get the cross-val predictions for the model with interactions
esoph$predictions_interaction <- 0 # initialize the prediction vector
for(i in 1:3) {
split <- splitPlan[[i]]
esoph_interaction <- lm(esoph_interaction, data = esoph[split$train, ])
esoph$predictions_interaction[split$app] <- predict(esoph_interaction, newdata = esoph[split$app, ])
}
esoph %>%
gather(key = modeltype, value = predictions, predictions_add, predictions_interaction) %>%
mutate(residuals = ncases - predictions) %>%
group_by(modeltype) %>%
summarize(rmse = sqrt(mean(residuals^2)))
## rmse
## 1 2.17695
#(RMSE) is a frequently used measure of the differences between values (sample and population values) predicted by a model or an estimator and the values actually observed.