This assignment can be reached from here. The dataset is available from the lecture notes; it comes from the UCI database. See the documentation on the website for further detail.
In brief, the aim is to build a CART model that detects spam mail using UCI's Spambase data and to analyse it. Performance is judged by how correctly spam and non-spam mails are classified in the test subset.
library(dplyr)
library(ggplot2)
library(rpart)
library(rpart.plot) 
library(rattle)
library(readr)
library(tidyverse)
# Load the data from the file system (brings `spam_data` into the workspace).
load("C:/Users/ecetp/Downloads/MEF/BDA 503 - R/spam_data.RData")
# head(spam_data)  # review data structure

# Check the dimensions of the dataset.
dim(spam_data)
## [1] 4601   59

# Note: make sure all categorical variables are converted into factors.
# rpart() fits a regression tree if the response variable is numeric and a
# classification tree if it is a factor.

# Check column properties.
# glimpse(spam_data)

# The response column spam_or_not is stored as an integer, so convert it to
# a factor to make rpart() build a classification tree.
spam_data$spam_or_not <- as.factor(spam_data$spam_or_not)
# Check the number of train and test rows in the dataset.
table(spam_data$train_test == 0)  # TRUE = train, FALSE = test
## FALSE  TRUE 
##   500  4101

# Check the number of spam and non-spam rows in the dataset.
# NOTE(review): TRUE here means spam_or_not == 0. The original comment
# labelled TRUE as "Spam", but under the usual UCI coding 0 = not spam —
# the labels appear inverted; verify against the data documentation.
table(spam_data$spam_or_not == 0)
## FALSE  TRUE 
##  1813  2788

# Split into train (train_test == 0) and test (train_test == 1) subsets.
# BUG FIX: the traindata assignment was fused onto a pasted-output comment
# line (starting with ##) and therefore never executed, leaving traindata
# undefined for the model fit below. It now runs on its own line.
traindata <- subset(spam_data, train_test == 0)
testdata  <- subset(spam_data, train_test == 1)
# glimpse(traindata)
# Build the model: a classification tree (the response is a factor),
# requiring at least 30 observations in a node before attempting a split.
spam_tree <- rpart(spam_or_not ~ ., data = traindata,
                   control = rpart.control(minsplit = 30))

# A fancy graph of the result.
fancyRpartPlot(spam_tree)

# BUG FIX: printcp() was fused onto the same line as fancyRpartPlot()
# (a syntax error); the two calls are now separate statements.
printcp(spam_tree)
## Classification tree:
## rpart(formula = spam_or_not ~ ., data = traindata, control = rpart.control(minsplit = 30))
## 
## Variables actually used in tree construction:
## [1] V16 V25 V52 V53 V57 V7 
## 
## Root node error: 1605/4101 = 0.39137
## 
## n= 4101 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.481620      0   1.00000 1.00000 0.019473
## 2 0.143925      1   0.51838 0.54829 0.016380
## 3 0.049221      2   0.37445 0.43863 0.015046
## 4 0.037383      3   0.32523 0.34829 0.013690
## 5 0.030530      4   0.28785 0.31277 0.013077
## 6 0.011838      5   0.25732 0.28224 0.012507
## 7 0.010000      6   0.24548 0.25857 0.012033

# Plot the cross-validated error against the complexity parameter.
# BUG FIX: plotcp() was fused into the pasted-output comment line above and
# never executed; it now runs as its own statement.
plotcp(spam_tree)

# I want the cp value (with a simpler tree) that minimises xerror, so find
# the best CP and generate the pruned classification tree. Pruning is not
# strictly needed for this fit, but it keeps the workflow reproducible for
# larger or smaller versions of this dataset.
bestcp <- spam_tree$cptable[which.min(spam_tree$cptable[, "xerror"]), "CP"]
pruned_tree <- prune(spam_tree, cp = bestcp)
# Print the cp table of the pruned tree. It matches the unpruned tree's
# table because the minimum-xerror cp (0.010000, row 7) corresponds to the
# full tree, so prune() removed no splits.
printcp(pruned_tree)
## Classification tree:
## rpart(formula = spam_or_not ~ ., data = traindata, control = rpart.control(minsplit = 30))
## 
## Variables actually used in tree construction:
## [1] V16 V25 V52 V53 V57 V7 
## 
## Root node error: 1605/4101 = 0.39137
## 
## n= 4101 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.481620      0   1.00000 1.00000 0.019473
## 2 0.143925      1   0.51838 0.54829 0.016380
## 3 0.049221      2   0.37445 0.43863 0.015046
## 4 0.037383      3   0.32523 0.34829 0.013690
## 5 0.030530      4   0.28785 0.31277 0.013077
## 6 0.011838      5   0.25732 0.28224 0.012507
## 7 0.010000      6   0.24548 0.25857 0.012033
# plotcp(pruned_tree)
# Confusion matrices follow.
# Confusion matrix: actual classes (rows) versus classes predicted by the
# pruned tree on the training data (columns).
conf.matrix <- table(traindata$spam_or_not, predict(pruned_tree, type = "class"))
# Prefix the labels so the printed table is self-explanatory.
rownames(conf.matrix) <- paste0("Actual:", rownames(conf.matrix))
colnames(conf.matrix) <- paste0("Pred:", colnames(conf.matrix))
print(conf.matrix)
##            Pred:0 Pred:1
##   Actual:0   2381    115
##   Actual:1    279   1326

# Plot the pruned tree to aid interpretation:
#   faclen = 0 -> use full names of the factor labels
#   extra  = 1 -> show the number of observations at each node
#                 (equivalent to use.n = TRUE in plot.rpart)
prp(pruned_tree, faclen = 0, cex = 0.8, extra = 1)