This assignment can be reached from here. The dataset is available via the lecture notes; it comes from the UCI Machine Learning Repository (Spambase). See the documentation on the website for further detail.
In brief, the aim is building a CART model to detect spam mail using UCI's Spambase data and analyzing it. Performance depends on how correctly spam/non-spam mails are classified in the test subset.
library(dplyr)
library(ggplot2)
library(rpart)
library(rpart.plot)
library(rattle)
library(readr)
library(tidyverse)
# Load the prepared Spambase dataset (provides a `spam_data` data frame) from
# the local file system.
# NOTE(review): hard-coded absolute Windows path -- consider a relative path
# (or a project-rooted path) so the script is reproducible on other machines.
load("C:/Users/ecetp/Downloads/MEF/BDA 503 - R/spam_data.RData")
#head(spam_data) --review data structure
# Check dimensions of the dataset: 4601 observations x 59 columns.
dim(spam_data)
## [1] 4601 59
Note: Make sure all the categorical variables are converted into factors. The function rpart will run a regression tree if the response variable is numeric, and a classification tree if it is a factor.
#Check column properties.
#glimpse(spam_data)
# The response column spam_or_not is stored as an integer. rpart fits a
# regression tree to a numeric response and a classification tree to a
# factor, so convert it to a factor to get a classification tree.
spam_data$spam_or_not <- as.factor(spam_data$spam_or_not)
#Check number of train and test rows of dataset.
table(spam_data$train_test==0) #TRUE = Train (train_test == 0), FALSE = Test
##
## FALSE TRUE
## 500 4101
# Check the class balance of the response.
# NOTE(review): original comment had the labels inverted -- spam_or_not == 0
# is the NOT-spam class, so TRUE below means not spam.
table(spam_data$spam_or_not==0) #TRUE = Not Spam (spam_or_not == 0), FALSE = Spam
##
## FALSE TRUE
## 1813 2788
# Split into training (train_test == 0, 4101 rows) and test (== 1, 500 rows).
traindata<-subset(spam_data,train_test==0)
testdata<-subset(spam_data,train_test==1)
#glimpse(traindata)
# Build the CART model on the training set.
# minsplit = 30: a node must contain at least 30 observations before a split
# is attempted, which limits tree growth and helps avoid overfitting.
spam_tree <- rpart(spam_or_not~.,data = traindata,control=rpart.control(minsplit = 30))
# Fancy annotated plot of the fitted tree (rattle).
fancyRpartPlot(spam_tree)
# Display the complexity-parameter (CP) table: relative and cross-validated
# error (xerror) at each pruning step.
printcp(spam_tree)
##
## Classification tree:
## rpart(formula = spam_or_not ~ ., data = traindata, control = rpart.control(minsplit = 30))
##
## Variables actually used in tree construction:
## [1] V16 V25 V52 V53 V57 V7
##
## Root node error: 1605/4101 = 0.39137
##
## n= 4101
##
## CP nsplit rel error xerror xstd
## 1 0.481620 0 1.00000 1.00000 0.019473
## 2 0.143925 1 0.51838 0.54829 0.016380
## 3 0.049221 2 0.37445 0.43863 0.015046
## 4 0.037383 3 0.32523 0.34829 0.013690
## 5 0.030530 4 0.28785 0.31277 0.013077
## 6 0.011838 5 0.25732 0.28224 0.012507
## 7 0.010000 6 0.24548 0.25857 0.012033
# Plot cross-validated error against CP to visualize the pruning point.
plotcp(spam_tree)
Hence I want the CP value (yielding a simpler tree) that minimizes the cross-validated error (xerror). So, find the best CP and generate the pruned classification tree to move on.
(I know I don't strictly need to prune this tree, but I want the workflow to carry over to bigger or smaller versions of this dataset — for compatibility, reproducibility, and my honour …)
# Pick the CP value whose cross-validated error (xerror) is minimal, then
# prune the tree back to that complexity.
bestcp <- spam_tree$cptable[which.min(spam_tree$cptable[,"xerror"]),"CP"]
pruned_tree <- prune(spam_tree, cp = bestcp)
# In this run the min-xerror CP coincides with the last (default 0.01) row of
# the CP table, so the pruned tree's table below matches the unpruned one.
printcp(pruned_tree)
##
## Classification tree:
## rpart(formula = spam_or_not ~ ., data = traindata, control = rpart.control(minsplit = 30))
##
## Variables actually used in tree construction:
## [1] V16 V25 V52 V53 V57 V7
##
## Root node error: 1605/4101 = 0.39137
##
## n= 4101
##
## CP nsplit rel error xerror xstd
## 1 0.481620 0 1.00000 1.00000 0.019473
## 2 0.143925 1 0.51838 0.54829 0.016380
## 3 0.049221 2 0.37445 0.43863 0.015046
## 4 0.037383 3 0.32523 0.34829 0.013690
## 5 0.030530 4 0.28785 0.31277 0.013077
## 6 0.011838 5 0.25732 0.28224 0.012507
## 7 0.010000 6 0.24548 0.25857 0.012033
#plotcp(pruned_tree)
# Confusion matrices.
# In-sample (training) confusion matrix for the pruned tree. predict() with
# no newdata scores the data the model was fitted on.
conf.matrix <- table(traindata$spam_or_not, predict(pruned_tree,type="class"))
rownames(conf.matrix) <- paste("Actual", rownames(conf.matrix), sep = ":")
colnames(conf.matrix) <- paste("Pred", colnames(conf.matrix), sep = ":")
print(conf.matrix)
##
## Pred:0 Pred:1
## Actual:0 2381 115
## Actual:1 279 1326
# Out-of-sample evaluation: the stated aim is performance on the test subset,
# but testdata was never scored above -- evaluate the pruned tree on it here.
test_pred <- predict(pruned_tree, newdata = testdata, type = "class")
conf.matrix.test <- table(testdata$spam_or_not, test_pred)
rownames(conf.matrix.test) <- paste("Actual", rownames(conf.matrix.test), sep = ":")
colnames(conf.matrix.test) <- paste("Pred", colnames(conf.matrix.test), sep = ":")
print(conf.matrix.test)
# Overall test-set accuracy (correct predictions / all test rows).
print(sum(diag(conf.matrix.test)) / sum(conf.matrix.test))
# Plot the pruned tree to aid interpretation.
# faclen = 0 means to use full names of the factor labels
# extra = 1 adds number of observations at each node; equivalent to using use.n = TRUE in plot.rpart
prp(pruned_tree, faclen = 0, cex = 0.8, extra = 1)