Introduction

This assignment can be reached from here. The dataset is available from the lecture notes; it comes from the UCI database. See the documentation on the website for further detail.

In brief, the aim is to build a CART model to detect spam mail using UCI’s Spambase data and to analyze it. Performance is measured by how correctly spam and non-spam mails are classified in the test subset.


1. Prepare Dataset

library(dplyr)
library(ggplot2)
library(rpart)
library(rpart.plot) 
library(rattle)
library(readr)
library(tidyverse)

# Load the prepared Spambase data from the local file system.
# NOTE(review): absolute Windows path makes this non-portable — consider a
# relative path (or the here package) so the script runs on other machines.
load("C:/Users/ecetp/Downloads/MEF/BDA 503 - R/spam_data.RData")

#head(spam_data)  --review data structure

# Check dimensions of the dataset: 4601 observations x 59 variables.
dim(spam_data)
## [1] 4601   59

Note: Make sure all the categorical variables are converted into factors. The function rpart will run a regression tree if the response variable is numeric, and a classification tree if it is a factor.

# Check column properties.
#glimpse(spam_data)

# The response column spam_or_not is stored as an integer. rpart fits a
# regression tree for a numeric response and a classification tree for a
# factor, so convert it to a factor to get a classification tree.
spam_data$spam_or_not <- as.factor(spam_data$spam_or_not)

# Check the train/test split sizes (train_test: 0 = train, 1 = test).
table(spam_data$train_test==0)    # TRUE = train (4101 rows), FALSE = test (500 rows)
## 
## FALSE  TRUE 
##   500  4101
# Check the class balance of the response (spam_or_not: 1 = spam, 0 = not spam).
table(spam_data$spam_or_not==0)   # TRUE = not spam (2788), FALSE = spam (1813)
## 
## FALSE  TRUE 
##  1813  2788
traindata<-subset(spam_data,train_test==0)
testdata<-subset(spam_data,train_test==1)

#glimpse(traindata)

2. Build a Model

# Fit a classification tree (response is a factor) using all other columns
# as predictors. minsplit = 30: a node must contain at least 30 observations
# before a split is attempted, which limits overfitting to tiny nodes.
spam_tree <- rpart(spam_or_not~.,data = traindata,control=rpart.control(minsplit = 30))

# Draw an annotated tree diagram of the fitted model (rattle package).
fancyRpartPlot(spam_tree)

3. Validation and Explication

# Display the complexity-parameter (cp) table: cross-validated error (xerror)
# at each tree size, used to choose how far to prune.
printcp(spam_tree)
## 
## Classification tree:
## rpart(formula = spam_or_not ~ ., data = traindata, control = rpart.control(minsplit = 30))
## 
## Variables actually used in tree construction:
## [1] V16 V25 V52 V53 V57 V7 
## 
## Root node error: 1605/4101 = 0.39137
## 
## n= 4101 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.481620      0   1.00000 1.00000 0.019473
## 2 0.143925      1   0.51838 0.54829 0.016380
## 3 0.049221      2   0.37445 0.43863 0.015046
## 4 0.037383      3   0.32523 0.34829 0.013690
## 5 0.030530      4   0.28785 0.31277 0.013077
## 6 0.011838      5   0.25732 0.28224 0.012507
## 7 0.010000      6   0.24548 0.25857 0.012033
plotcp(spam_tree)

# Prune at the cp value with the lowest cross-validated error.
# NOTE(review): in this run the minimum xerror is at the last row (cp = 0.01),
# so pruning leaves the tree unchanged — the printcp output below is identical.
bestcp <- spam_tree$cptable[which.min(spam_tree$cptable[,"xerror"]),"CP"]
pruned_tree <- prune(spam_tree, cp = bestcp)
printcp(pruned_tree)
## 
## Classification tree:
## rpart(formula = spam_or_not ~ ., data = traindata, control = rpart.control(minsplit = 30))
## 
## Variables actually used in tree construction:
## [1] V16 V25 V52 V53 V57 V7 
## 
## Root node error: 1605/4101 = 0.39137
## 
## n= 4101 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.481620      0   1.00000 1.00000 0.019473
## 2 0.143925      1   0.51838 0.54829 0.016380
## 3 0.049221      2   0.37445 0.43863 0.015046
## 4 0.037383      3   0.32523 0.34829 0.013690
## 5 0.030530      4   0.28785 0.31277 0.013077
## 6 0.011838      5   0.25732 0.28224 0.012507
## 7 0.010000      6   0.24548 0.25857 0.012033
#plotcp(pruned_tree)
# Confusion matrix on the TRAINING data (in-sample fit).
# NOTE(review): for true performance, predict on testdata instead — this
# matrix measures resubstitution accuracy, which is optimistic.
conf.matrix <- table(traindata$spam_or_not, predict(pruned_tree,type="class"))
rownames(conf.matrix) <- paste("Actual", rownames(conf.matrix), sep = ":")
colnames(conf.matrix) <- paste("Pred", colnames(conf.matrix), sep = ":")
print(conf.matrix)
##           
##            Pred:0 Pred:1
##   Actual:0   2381    115
##   Actual:1    279   1326
# Plot the pruned tree to aid interpretation.
# faclen = 0 means to use full names of the factor labels
# extra = 1 adds number of observations at each node; equivalent to using use.n = TRUE in plot.rpart
prp(pruned_tree, faclen = 0, cex = 0.8, extra = 1)