esoph

R Markdown

library(ggplot2)
library(reshape2)
library(tidyr)
library(broom)
library(sigr)
library(vtreat)
library(dplyr)
library(lme4)
library(nlme)
library(caret)
library(gbm)
library(foreach)
library(doParallel)
library(magrittr)
library(plyr)
library(mosaic)
library(rgp)

head(esoph)

##   agegp     alcgp    tobgp ncases ncontrols
## 1 25-34 0-39g/day 0-9g/day      0        40
## 2 25-34 0-39g/day    10-19      0        10
## 3 25-34 0-39g/day    20-29      0         6
## 4 25-34 0-39g/day      30+      0         5
## 5 25-34     40-79 0-9g/day      0        27
## 6 25-34     40-79    10-19      0         7

#To get more details about esoph data
summary(esoph)

##    agegp          alcgp         tobgp        ncases         ncontrols    
##  25-34:15   0-39g/day:23   0-9g/day:24   Min.   : 0.000   Min.   : 1.00  
##  35-44:15   40-79    :23   10-19   :24   1st Qu.: 0.000   1st Qu.: 3.00  
##  45-54:16   80-119   :21   20-29   :20   Median : 1.000   Median : 6.00  
##  55-64:16   120+     :21   30+     :20   Mean   : 2.273   Mean   :11.08  
##  65-74:15                                3rd Qu.: 4.000   3rd Qu.:14.00  
##  75+  :11                                Max.   :17.000   Max.   :60.00

# To get the number of rows in esoph (N)
(N <- nrow(esoph))

## [1] 88

# To calculate how many rows 75% of N should be
(target <- round(N * 0.75))

## [1] 66

# Create the vector of N uniform random variables: gp
gp <- runif(N)
gp

##  [1] 0.84455558 0.75266807 0.87659491 0.08160445 0.22964153 0.46501945
##  [7] 0.43888290 0.68947289 0.75853355 0.20274126 0.43576669 0.84643398
## [13] 0.60752594 0.43160490 0.05491417 0.60359378 0.46922520 0.78351553
## [19] 0.44419482 0.20470638 0.76787927 0.64873136 0.31414706 0.43604833
## [25] 0.83742075 0.93028240 0.92365829 0.06415928 0.70467872 0.77326632
## [31] 0.18951858 0.62143217 0.65035832 0.80839303 0.81930239 0.53236792
## [37] 0.14291715 0.90485467 0.16720439 0.44795169 0.62117093 0.87896760
## [43] 0.40420279 0.70132144 0.10062946 0.90405978 0.65723640 0.38109454
## [49] 0.64671354 0.35756566 0.80162525 0.92256414 0.39249797 0.42999366
## [55] 0.03404520 0.66425676 0.55631162 0.40547093 0.35324091 0.51876022
## [61] 0.54337798 0.77003480 0.07210496 0.29021050 0.61989996 0.22352684
## [67] 0.13597678 0.09202109 0.09645160 0.36934047 0.87044296 0.36012224
## [73] 0.44112632 0.50073335 0.06665930 0.50178258 0.34434535 0.75619184
## [79] 0.99150712 0.21970860 0.16021603 0.72905527 0.77010852 0.59204446
## [85] 0.65534637 0.05458133 0.60734861 0.32210665

# Use gp to create the training set: esoph_train (75% of data) and esoph_test (25% of data)
esoph_train <- esoph[gp < 0.75, ]
esoph_test <- esoph[gp >= 0.75, ]

# To examine the number of elements in esoph_train and esoph_test
nrow(esoph_train)

## [1] 65

nrow(esoph_test)

## [1] 23

FIRST MODEL: esoph cases ~ age + alcohol + tobacco

### I created a formula to express esoph canser ncases as a function of tobgp (tobacco consumption), alcgp(alcohol consumption) and age without interaction
(fmla <- ncases ~ agegp+alcgp+tobgp)

## ncases ~ agegp + alcgp + tobgp

##Now I use lm() to build a model esoph_model from esoph that predicts ncases from agegp+alcgp+tobgp

esoph_add <- lm(fmla, data = esoph_train)

# predict ncases from for the training set
esoph_train$prediction <- predict(esoph_add)

# predict ncases for the test set
esoph_test$prediction <- predict(esoph_add, newdata = esoph_test)

# Evaluate the root mean squared error on both training and test data
(rmse_train <- rmse(esoph_train$prediction, esoph_train$ncases))

## [1] 1.963961

(rmse_test <- rmse(esoph_test$prediction, esoph_test$ncases))

## [1] 1.467446

# Evaluate the rsquared on both training and test data
(rsq_train <- rsquared(esoph_train$prediction, esoph_train$ncases))

## [1] 0.01873516

(rsq_test <- rsquared(esoph_test$prediction, esoph_test$ncases))

## [1] 0.6116932

# Plot the predictions (on the x-axis) against the outcome (esoph cases) on the train data
ggplot(esoph_train, aes(x = prediction, y = ncases)) + 
  geom_point() + 
  geom_abline()

library(rpart)
esoph_add <- rpart(ncases ~ ., data = esoph_train, method = "class", control = rpart.control(cp = 0))

esoph_test$predictions <- predict(esoph_add, esoph_test, type = "class")
table(esoph_test$predictions, esoph_test$ncases)

##     
##      0 1 2 3 4 5 6 9
##   0  9 0 1 1 0 0 0 0
##   1  1 0 0 0 0 0 0 0
##   2  0 0 0 0 0 0 0 0
##   3  1 1 2 0 1 2 2 1
##   4  0 0 0 0 0 0 0 0
##   5  0 0 0 0 0 0 0 0
##   6  0 0 0 0 1 0 0 0
##   8  0 0 0 0 0 0 0 0
##   9  0 0 0 0 0 0 0 0
##   17 0 0 0 0 0 0 0 0

mean(esoph_test$predictions == esoph_test$ncases)

## [1] 0.3913043

## I use summary() to examine the model
summary(esoph_add)

## Call:
## rpart(formula = ncases ~ ., data = esoph_train, method = "class", 
##     control = rpart.control(cp = 0))
##   n= 65 
## 
##          CP nsplit rel error    xerror       xstd
## 1 0.1914894      0 1.0000000 1.0000000 0.07675924
## 2 0.1276596      1 0.8085106 0.8085106 0.08453161
## 3 0.0212766      2 0.6808511 0.7234043 0.08567730
## 4 0.0000000      4 0.6382979 0.8510638 0.08345367
## 
## Variable importance
##      agegp prediction  ncontrols      alcgp      tobgp 
##         36         29         23          8          4 
## 
## Node number 1: 65 observations,    complexity param=0.1914894
##   predicted class=0  expected loss=0.7230769  P(node) =1
##     class counts:    18    15     8     8     6     4     3     1     1     1
##    probabilities: 0.277 0.231 0.123 0.123 0.092 0.062 0.046 0.015 0.015 0.015 
##   left son=2 (19 obs) right son=3 (46 obs)
##   Primary splits:
##       agegp      splits as  LLRRRR, improve=8.142334, (0 missing)
##       prediction < 2.574494  to the left,  improve=7.057143, (0 missing)
##       ncontrols  < 1.5       to the left,  improve=4.436364, (0 missing)
##       tobgp      splits as  RRRL, improve=1.423529, (0 missing)
##       alcgp      splits as  LLRR, improve=1.037500, (0 missing)
##   Surrogate splits:
##       prediction < 0.4694616 to the left,  agree=0.892, adj=0.632, (0 split)
## 
## Node number 2: 19 observations
##   predicted class=0  expected loss=0.2105263  P(node) =0.2923077
##     class counts:    15     3     1     0     0     0     0     0     0     0
##    probabilities: 0.789 0.158 0.053 0.000 0.000 0.000 0.000 0.000 0.000 0.000 
## 
## Node number 3: 46 observations,    complexity param=0.1276596
##   predicted class=1  expected loss=0.7391304  P(node) =0.7076923
##     class counts:     3    12     7     8     6     4     3     1     1     1
##    probabilities: 0.065 0.261 0.152 0.174 0.130 0.087 0.065 0.022 0.022 0.022 
##   left son=6 (15 obs) right son=7 (31 obs)
##   Primary splits:
##       ncontrols  < 3.5       to the left,  improve=5.3035060, (0 missing)
##       prediction < 2.53541   to the left,  improve=4.3615710, (0 missing)
##       agegp      splits as  RRRRRL, improve=3.0366130, (0 missing)
##       tobgp      splits as  RRRL, improve=1.1024030, (0 missing)
##       alcgp      splits as  LLRR, improve=0.9472991, (0 missing)
##   Surrogate splits:
##       prediction < 2.53541   to the left,  agree=0.826, adj=0.467, (0 split)
##       agegp      splits as  RRRRRL, agree=0.804, adj=0.400, (0 split)
##       alcgp      splits as  RRRL, agree=0.717, adj=0.133, (0 split)
##       tobgp      splits as  RRRL, agree=0.717, adj=0.133, (0 split)
## 
## Node number 6: 15 observations
##   predicted class=1  expected loss=0.3333333  P(node) =0.2307692
##     class counts:     1    10     4     0     0     0     0     0     0     0
##    probabilities: 0.067 0.667 0.267 0.000 0.000 0.000 0.000 0.000 0.000 0.000 
## 
## Node number 7: 31 observations,    complexity param=0.0212766
##   predicted class=3  expected loss=0.7419355  P(node) =0.4769231
##     class counts:     2     2     3     8     6     4     3     1     1     1
##    probabilities: 0.065 0.065 0.097 0.258 0.194 0.129 0.097 0.032 0.032 0.032 
##   left son=14 (9 obs) right son=15 (22 obs)
##   Primary splits:
##       ncontrols  < 6.5       to the left,  improve=1.1407620, (0 missing)
##       prediction < 3.072553  to the left,  improve=1.0749620, (0 missing)
##       alcgp      splits as  LLRR, improve=0.9721533, (0 missing)
##       agegp      splits as  LLLRRR, improve=0.7892473, (0 missing)
##       tobgp      splits as  RLLL, improve=0.6680352, (0 missing)
##   Surrogate splits:
##       tobgp splits as  RRRL, agree=0.806, adj=0.333, (0 split)
##       alcgp splits as  RRRL, agree=0.742, adj=0.111, (0 split)
## 
## Node number 14: 9 observations
##   predicted class=3  expected loss=0.5555556  P(node) =0.1384615
##     class counts:     0     1     1     4     3     0     0     0     0     0
##    probabilities: 0.000 0.111 0.111 0.444 0.333 0.000 0.000 0.000 0.000 0.000 
## 
## Node number 15: 22 observations,    complexity param=0.0212766
##   predicted class=3  expected loss=0.8181818  P(node) =0.3384615
##     class counts:     2     1     2     4     3     4     3     1     1     1
##    probabilities: 0.091 0.045 0.091 0.182 0.136 0.182 0.136 0.045 0.045 0.045 
##   left son=30 (15 obs) right son=31 (7 obs)
##   Primary splits:
##       alcgp      splits as  LLRR, improve=1.5056280, (0 missing)
##       prediction < 3.168174  to the left,  improve=0.8008658, (0 missing)
##       ncontrols  < 16.5      to the right, improve=0.7886558, (0 missing)
##       tobgp      splits as  RLLL, improve=0.7175325, (0 missing)
##       agegp      splits as  LLLRRR, improve=0.6675325, (0 missing)
##   Surrogate splits:
##       prediction < 4.146616  to the left,  agree=0.773, adj=0.286, (0 split)
## 
## Node number 30: 15 observations
##   predicted class=3  expected loss=0.8  P(node) =0.2307692
##     class counts:     2     1     2     3     3     3     0     0     0     1
##    probabilities: 0.133 0.067 0.133 0.200 0.200 0.200 0.000 0.000 0.000 0.067 
## 
## Node number 31: 7 observations
##   predicted class=6  expected loss=0.5714286  P(node) =0.1076923
##     class counts:     0     0     0     1     0     1     3     1     1     0
##    probabilities: 0.000 0.000 0.000 0.143 0.000 0.143 0.429 0.143 0.143 0.000

SECOND MODEL: esoph cases ~ age + alcohol + tobacco + alcohol:tobacco

### The second model is including an additional variable: alcohol and tobacco consumption with interaction
(fmla2 <- ncases ~ agegp+alcgp+tobgp+alcgp:tobgp)

## ncases ~ agegp + alcgp + tobgp + alcgp:tobgp

fmla2

## ncases ~ agegp + alcgp + tobgp + alcgp:tobgp

##Now I use lm() to build a model esoph_model from esoph that predicts ncases 
esoph_interaction <- lm(fmla2, data = esoph_train)
esoph_interaction

## 
## Call:
## lm(formula = fmla2, data = esoph_train)
## 
## Coefficients:
##     (Intercept)          agegp.L          agegp.Q          agegp.C  
##         1.82562          1.85603         -3.50302         -2.43505  
##         agegp^4          agegp^5          alcgp.L          alcgp.Q  
##         0.68603         -0.05258          0.18266         -1.42506  
##         alcgp.C          tobgp.L          tobgp.Q          tobgp.C  
##         0.83068         -1.80968          0.63251          0.51644  
## alcgp.L:tobgp.L  alcgp.Q:tobgp.L  alcgp.C:tobgp.L  alcgp.L:tobgp.Q  
##        -1.11884          1.47633         -0.34827         -0.37799  
## alcgp.Q:tobgp.Q  alcgp.C:tobgp.Q  alcgp.L:tobgp.C  alcgp.Q:tobgp.C  
##        -0.11236          0.69555          0.48398          0.42841  
## alcgp.C:tobgp.C  
##        -2.09980

# predict ncases from tobgp for the training set
esoph_train$prediction <- predict(esoph_interaction)

# predict ncases from tobgp for the test set
esoph_test$prediction <- predict(esoph_interaction, newdata = esoph_test)

# Evaluate the root mean squared error on both training and test data and print them
(rmse_train2 <- rmse(esoph_train$prediction, esoph_train$ncases))

## [1] 1.819777

(rmse_test2 <- rmse(esoph_test$prediction, esoph_test$ncases))

## [1] 1.821575

# Evaluate the rsquared on both training and test data and print them
(rsq_train2 <- rsquared(esoph_train$prediction, esoph_train$ncases))

## [1] 0.2602022

(rsq_test2 <- rsquared(esoph_test$prediction, esoph_test$ncases))

## [1] 0.5767596

# Plot the predictions (on the x-axis) against the outcome (esoph cases) on the train data
ggplot(esoph_train, aes(x = prediction, y = ncases)) + 
  geom_point() + 
  geom_abline()

library(rpart)
esoph_interaction <- rpart(ncases ~ ., data = esoph_train, method = "class", control = rpart.control(cp = 0))

esoph_test$predictions <- predict(esoph_interaction, esoph_test, type = "class")
table(esoph_test$predictions, esoph_test$ncases)

##     
##      0 1 2 3 4 5 6 9
##   0  9 0 1 1 0 0 0 0
##   1  1 0 0 0 0 0 0 0
##   2  0 0 0 0 0 0 0 0
##   3  1 1 2 0 1 1 0 0
##   4  0 0 0 0 0 1 0 0
##   5  0 0 0 0 0 0 0 0
##   6  0 0 0 0 1 0 2 1
##   8  0 0 0 0 0 0 0 0
##   9  0 0 0 0 0 0 0 0
##   17 0 0 0 0 0 0 0 0

mean(esoph_test$predictions == esoph_test$ncases)

## [1] 0.4782609

## I use summary() to examine the model
summary(esoph_interaction)

## Call:
## rpart(formula = ncases ~ ., data = esoph_train, method = "class", 
##     control = rpart.control(cp = 0))
##   n= 65 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.19148936      0 1.0000000 1.0000000 0.07675924
## 2 0.12765957      1 0.8085106 0.8723404 0.08278335
## 3 0.04255319      2 0.6808511 0.7446809 0.08551457
## 4 0.02127660      3 0.6382979 0.8297872 0.08403582
## 5 0.00000000      4 0.6170213 0.8297872 0.08403582
## 
## Variable importance
##      agegp prediction  ncontrols      tobgp      alcgp 
##         39         35         20          3          3 
## 
## Node number 1: 65 observations,    complexity param=0.1914894
##   predicted class=0  expected loss=0.7230769  P(node) =1
##     class counts:    18    15     8     8     6     4     3     1     1     1
##    probabilities: 0.277 0.231 0.123 0.123 0.092 0.062 0.046 0.015 0.015 0.015 
##   left son=2 (19 obs) right son=3 (46 obs)
##   Primary splits:
##       agegp      splits as  LLRRRR, improve=8.142334, (0 missing)
##       prediction < 2.495813   to the left,  improve=5.984470, (0 missing)
##       ncontrols  < 1.5        to the left,  improve=4.436364, (0 missing)
##       tobgp      splits as  RRRL, improve=1.423529, (0 missing)
##       alcgp      splits as  LLRR, improve=1.037500, (0 missing)
##   Surrogate splits:
##       prediction < 0.08124933 to the left,  agree=0.862, adj=0.526, (0 split)
## 
## Node number 2: 19 observations
##   predicted class=0  expected loss=0.2105263  P(node) =0.2923077
##     class counts:    15     3     1     0     0     0     0     0     0     0
##    probabilities: 0.789 0.158 0.053 0.000 0.000 0.000 0.000 0.000 0.000 0.000 
## 
## Node number 3: 46 observations,    complexity param=0.1276596
##   predicted class=1  expected loss=0.7391304  P(node) =0.7076923
##     class counts:     3    12     7     8     6     4     3     1     1     1
##    probabilities: 0.065 0.261 0.152 0.174 0.130 0.087 0.065 0.022 0.022 0.022 
##   left son=6 (15 obs) right son=7 (31 obs)
##   Primary splits:
##       ncontrols  < 3.5        to the left,  improve=5.3035060, (0 missing)
##       prediction < 2.792507   to the left,  improve=5.1435470, (0 missing)
##       agegp      splits as  RRRRRL, improve=3.0366130, (0 missing)
##       tobgp      splits as  RRRL, improve=1.1024030, (0 missing)
##       alcgp      splits as  LLRR, improve=0.9472991, (0 missing)
##   Surrogate splits:
##       agegp      splits as  RRRRRL, agree=0.804, adj=0.400, (0 split)
##       prediction < 2.495813   to the left,  agree=0.804, adj=0.400, (0 split)
##       alcgp      splits as  RRRL, agree=0.717, adj=0.133, (0 split)
##       tobgp      splits as  RRRL, agree=0.717, adj=0.133, (0 split)
## 
## Node number 6: 15 observations
##   predicted class=1  expected loss=0.3333333  P(node) =0.2307692
##     class counts:     1    10     4     0     0     0     0     0     0     0
##    probabilities: 0.067 0.667 0.267 0.000 0.000 0.000 0.000 0.000 0.000 0.000 
## 
## Node number 7: 31 observations,    complexity param=0.04255319
##   predicted class=3  expected loss=0.7419355  P(node) =0.4769231
##     class counts:     2     2     3     8     6     4     3     1     1     1
##    probabilities: 0.065 0.065 0.097 0.258 0.194 0.129 0.097 0.032 0.032 0.032 
##   left son=14 (21 obs) right son=15 (10 obs)
##   Primary splits:
##       prediction < 4.513865   to the left,  improve=1.6463900, (0 missing)
##       ncontrols  < 6.5        to the left,  improve=1.1407620, (0 missing)
##       alcgp      splits as  LLRR, improve=0.9721533, (0 missing)
##       agegp      splits as  LLLRRR, improve=0.7892473, (0 missing)
##       tobgp      splits as  RLLL, improve=0.6680352, (0 missing)
##   Surrogate splits:
##       alcgp splits as  LLRR, agree=0.71, adj=0.1, (0 split)
## 
## Node number 14: 21 observations,    complexity param=0.0212766
##   predicted class=3  expected loss=0.6666667  P(node) =0.3230769
##     class counts:     2     2     3     7     5     2     0     0     0     0
##    probabilities: 0.095 0.095 0.143 0.333 0.238 0.095 0.000 0.000 0.000 0.000 
##   left son=28 (8 obs) right son=29 (13 obs)
##   Primary splits:
##       prediction < 2.964276   to the left,  improve=1.49542100, (0 missing)
##       agegp      splits as  LLLRRR, improve=1.03174600, (0 missing)
##       alcgp      splits as  LRRR, improve=0.63982680, (0 missing)
##       ncontrols  < 14.5       to the left,  improve=0.59157510, (0 missing)
##       tobgp      splits as  RRLL, improve=0.03174603, (0 missing)
##   Surrogate splits:
##       agegp splits as  LLLRRR, agree=0.667, adj=0.125, (0 split)
##       tobgp splits as  RRLL,   agree=0.667, adj=0.125, (0 split)
## 
## Node number 15: 10 observations
##   predicted class=6  expected loss=0.7  P(node) =0.1538462
##     class counts:     0     0     0     1     1     2     3     1     1     1
##    probabilities: 0.000 0.000 0.000 0.100 0.100 0.200 0.300 0.100 0.100 0.100 
## 
## Node number 28: 8 observations
##   predicted class=3  expected loss=0.625  P(node) =0.1230769
##     class counts:     2     2     1     3     0     0     0     0     0     0
##    probabilities: 0.250 0.250 0.125 0.375 0.000 0.000 0.000 0.000 0.000 0.000 
## 
## Node number 29: 13 observations
##   predicted class=4  expected loss=0.6153846  P(node) =0.2
##     class counts:     0     0     2     4     5     2     0     0     0     0
##    probabilities: 0.000 0.000 0.154 0.308 0.385 0.154 0.000 0.000 0.000 0.000

NEW MODEL COMPARISON

This is the comparison of the performance of the interaction model to the performance of a main-effects only model. Because this data set is small, we will use cross-validation to simulate making predictions on out-of-sample data. Cross-validation predicts how well a model built from all the data will perform on new data. As with the test/train split, for a good modeling procedure, cross-validation performance and training performance should be close.

# Create the formula with main effects only
(esoph_add <- ncases ~ agegp+ alcgp + tobgp)

## ncases ~ agegp + alcgp + tobgp

# Create the formula with interactions
(esoph_interaction <- ncases ~ agegp+alcgp+tobgp+alcgp:tobgp)

## ncases ~ agegp + alcgp + tobgp + alcgp:tobgp

set.seed(34)  # set the seed for reproducibility
splitPlan <- kWayCrossValidation(nrow(esoph), 3, NULL, NULL)

# Sample code: Get cross-val predictions for main-effects only model
esoph$predictions_add <- 0  # initialize the prediction vector
for(i in 1:3) {
  split <- splitPlan[[i]]
  esoph_add <- lm(esoph_add, data = esoph[split$train, ])
  esoph$predictions_add[split$app] <- predict(esoph_add, newdata = esoph[split$app, ])
}
# Get the cross-val predictions for the model with interactions
esoph$predictions_interaction <- 0 # initialize the prediction vector
for(i in 1:3) {
  split <- splitPlan[[i]]
  esoph_interaction <- lm(esoph_interaction, data = esoph[split$train, ])
  esoph$predictions_interaction[split$app] <- predict(esoph_interaction, newdata = esoph[split$app, ])
}

esoph %>% 
  gather(key = modeltype, value = predictions, predictions_add, predictions_interaction) %>%
  mutate(residuals = ncases - predictions) %>%
  group_by(modeltype) %>%
  summarize(rmse = sqrt(mean(residuals^2)))

##      rmse
## 1 2.17695

#(RMSE) is a frequently used measure of the differences between values (sample and population values) predicted by a model or an estimator and the values actually observed.

esoph

Bilge Uzuner

4 December 2017

R Markdown

FIRST MODEL: esoph cases ~ age + alcohol + tobacco

SECOND MODEL: esoph cases ~ age + alcohol + tobacco + alcohol:tobacco

NEW MODEL COMPARISON