First of all, we need to load our libraries.
library(data.table)
library(ggplot2)
library(ggthemes)
library(tidyverse)
library(highcharter)
library(plotly)
library(xgboost)
library(randomForest)
library(caret)
library(rpart)
library(rpart.plot)
library(rattle)
library(dplyr)
The train and test data sets are prepared: 20% of the diamonds data (sampled within each cut, color and clarity group) goes to the test set, and the remaining rows form the train set.
set.seed(555)
diamonds_test2 <- diamonds %>% mutate(diamond_id = row_number()) %>%
group_by(cut, color, clarity) %>% sample_frac(0.2) %>% ungroup()
diamonds_train2 <- anti_join(diamonds %>% mutate(diamond_id = row_number()),
diamonds_test2, by = "diamond_id")
diamonds_train2<-data.table(diamonds_train2)
diamonds_test2<-data.table(diamonds_test2)
You can see the summary of the train and test data sets below.
summary(diamonds_train2)
## carat cut color clarity
## Min. :0.2000 Fair : 1285 D:5416 SI1 :10449
## 1st Qu.:0.4000 Good : 3923 E:7835 VS2 : 9806
## Median :0.7000 Very Good: 9662 F:7629 SI2 : 7354
## Mean :0.7977 Premium :11036 G:9037 VS1 : 6538
## 3rd Qu.:1.0400 Ideal :17237 H:6646 VVS2 : 4052
## Max. :5.0100 I:4336 VVS1 : 2923
## J:2244 (Other): 2021
## depth table price x
## Min. :43.00 Min. :43.00 Min. : 326.0 Min. : 0.00
## 1st Qu.:61.00 1st Qu.:56.00 1st Qu.: 947.5 1st Qu.: 4.71
## Median :61.80 Median :57.00 Median : 2398.0 Median : 5.70
## Mean :61.75 Mean :57.46 Mean : 3933.1 Mean : 5.73
## 3rd Qu.:62.50 3rd Qu.:59.00 3rd Qu.: 5325.0 3rd Qu.: 6.54
## Max. :79.00 Max. :95.00 Max. :18823.0 Max. :10.74
##
## y z diamond_id
## Min. : 0.000 Min. : 0.000 Min. : 1
## 1st Qu.: 4.720 1st Qu.: 2.910 1st Qu.:13457
## Median : 5.710 Median : 3.520 Median :26997
## Mean : 5.734 Mean : 3.538 Mean :26955
## 3rd Qu.: 6.540 3rd Qu.: 4.040 3rd Qu.:40415
## Max. :58.900 Max. :31.800 Max. :53940
##
summary(diamonds_test2)
## carat cut color clarity depth
## Min. :0.2000 Fair : 325 D:1359 SI1 :2616 Min. :43.00
## 1st Qu.:0.4000 Good : 983 E:1962 VS2 :2452 1st Qu.:61.00
## Median :0.7000 Very Good:2420 F:1913 SI2 :1840 Median :61.80
## Mean :0.7989 Premium :2755 G:2255 VS1 :1633 Mean :61.75
## 3rd Qu.:1.0400 Ideal :4314 H:1658 VVS2 :1014 3rd Qu.:62.50
## Max. :4.1300 I:1086 VVS1 : 732 Max. :73.60
## J: 564 (Other): 510
## table price x y
## Min. :49.00 Min. : 327 Min. : 3.810 Min. :3.780
## 1st Qu.:56.00 1st Qu.: 956 1st Qu.: 4.720 1st Qu.:4.730
## Median :57.00 Median : 2434 Median : 5.700 Median :5.720
## Mean :57.46 Mean : 3932 Mean : 5.734 Mean :5.736
## 3rd Qu.:59.00 3rd Qu.: 5324 3rd Qu.: 6.540 3rd Qu.:6.540
## Max. :69.00 Max. :18818 Max. :10.010 Max. :9.940
##
## z diamond_id
## Min. :2.240 Min. : 3
## 1st Qu.:2.910 1st Qu.:13648
## Median :3.530 Median :26870
## Mean :3.541 Mean :27033
## 3rd Qu.:4.030 3rd Qu.:40627
## Max. :6.430 Max. :53937
##
As you can see, there are no NA values; some of the variables are numeric and some are categorical.
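A quick way to confirm this (a small sketch using only base R) is to count the NA values per column and list each column's class:
# count missing values per column and report each column's class
sapply(diamonds_train2, function(col) sum(is.na(col)))
sapply(diamonds_train2, class)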
diamonds_train2[,length(diamond_id)]
## [1] 43143
diamonds_train2[,length(unique(diamond_id))]
## [1] 43143
When we check the ids, there is no problem with uniqueness: every diamond_id appears exactly once.
ggplot(diamonds_train2, aes(x = price)) +
geom_histogram(color = "blue", fill = "Blue", binwidth = 25) +
scale_x_continuous(breaks = seq(0, 5000, 500)) +
theme(axis.text.x = element_text(angle = 90)) +
coord_cartesian(c(0,5000)) +
facet_grid(cut~color) +
xlab("Price") + ylab("Count")
In this graph we can see that Ideal cut diamonds are the most frequent at every colour.
diamonds_train2[x<=0 & y<=0 & z<=0,]
## carat cut color clarity depth table price x y z diamond_id
## 1: 1.00 Very Good H VS2 63.3 53 5139 0 0 0 11964
## 2: 1.14 Fair G VS1 57.5 67 6381 0 0 0 15952
## 3: 1.56 Ideal G VS2 62.2 54 12800 0 0 0 24521
## 4: 1.20 Premium D VVS1 62.1 59 15686 0 0 0 26244
## 5: 2.25 Premium H SI2 62.8 59 18034 0 0 0 27430
## 6: 0.71 Good F SI2 64.1 60 2130 0 0 0 49557
## 7: 0.71 Good F SI2 64.1 60 2130 0 0 0 49558
There are some diamonds whose x, y and z values are all zero. It is better to remove them.
diamonds_train2<-diamonds_train2[x>0 | y>0 | z>0,]
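As a small sanity check (a sketch), the same condition should now match zero rows:
# should return 0 after the filter above
diamonds_train2[x<=0 & y<=0 & z<=0, .N]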
summary(diamonds$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 950 2401 3933 5324 18823
ggplot(diamonds_train2, aes(x = price)) +
geom_histogram(color = "black", fill = "Blue", binwidth = 500) +
scale_x_continuous(breaks = seq(0, 20000, 500)) +
theme(axis.text.x = element_text(angle = 90)) +
xlab("Price") + ylab("Count")
When we check the price we can see that its range is quite wide and strongly right-skewed, so we should model the log of price.
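As a quick sketch of why the log helps, the same histogram drawn on a log10 price axis is much closer to symmetric:
# price histogram on a log10 axis (sketch)
ggplot(diamonds_train2, aes(x = price)) +
geom_histogram(color = "black", fill = "Blue", bins = 40) +
scale_x_log10() +
xlab("Price (log10 scale)") + ylab("Count")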
diamonds_train2[,cut1:=ifelse(cut=="Fair",1,ifelse(cut=="Good",2,ifelse(cut=="Very Good",3,ifelse(cut=="Premium",4,5))))]
diamonds_train2[,color1:=ifelse(color=="D",1,ifelse(color=="E",2,ifelse(color=="F",3,ifelse(color=="G",4,ifelse(color=="H",5,ifelse(color=="I",6,7))))))]
diamonds_train2[,clarity1:=ifelse(clarity=="I1",1,ifelse(clarity=="SI2",2,ifelse(clarity=="SI1",3,ifelse(clarity=="VS2",4,ifelse(clarity=="VS1",5,ifelse(clarity=="VVS2",6,ifelse(clarity=="VVS1",7,8)))))))]
diamonds_test2[,cut1:=ifelse(cut=="Fair",1,ifelse(cut=="Good",2,ifelse(cut=="Very Good",3,ifelse(cut=="Premium",4,5))))]
diamonds_test2[,color1:=ifelse(color=="D",1,ifelse(color=="E",2,ifelse(color=="F",3,ifelse(color=="G",4,ifelse(color=="H",5,ifelse(color=="I",6,7))))))]
diamonds_test2[,clarity1:=ifelse(clarity=="I1",1,ifelse(clarity=="SI2",2,ifelse(clarity=="SI1",3,ifelse(clarity=="VS2",4,ifelse(clarity=="VS1",5,ifelse(clarity=="VVS2",6,ifelse(clarity=="VVS1",7,8)))))))]
Also, for modelling, the categorical variables in the train and test sets are converted to numeric codes; for example, colour D becomes 1, E becomes 2, and so on up to J as 7. After this conversion we change the class of these new variables from numeric to factor.
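As a side note, the same encoding can be written more compactly by relying on the ordered factor levels already stored in the diamonds data; this sketch produces exactly the same codes as the nested ifelse calls above:
# cut levels are Fair < Good < Very Good < Premium < Ideal, so as.integer gives 1..5
diamonds_train2[, cut1 := as.integer(cut)]
# color levels are D < E < F < G < H < I < J, so D becomes 1 and J becomes 7
diamonds_train2[, color1 := as.integer(color)]
# clarity levels are I1 < SI2 < SI1 < VS2 < VS1 < VVS2 < VVS1 < IF
diamonds_train2[, clarity1 := as.integer(clarity)]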
diamonds_train2[, carat_log:=log(carat),.(diamond_id)]
diamonds_test2[, carat_log:=log(carat),.(diamond_id)]
diamonds_train<-diamonds_train2[,list(carat_log,cut1,color1,clarity1,depth,x,y,z,price,diamond_id)]
diamonds_test<-diamonds_test2[,list(carat_log,cut1,color1,clarity1,depth,x,y,z,price,diamond_id)]
diamonds_train$cut1<-as.factor(diamonds_train$cut1)
diamonds_train$color1<-as.factor(diamonds_train$color1)
diamonds_train$clarity1<-as.factor(diamonds_train$clarity1)
diamonds_test$cut1<-as.factor(diamonds_test$cut1)
diamonds_test$color1<-as.factor(diamonds_test$color1)
diamonds_test$clarity1<-as.factor(diamonds_test$clarity1)
Carat is also an important variable; after looking at its summary, we use its logarithm (carat_log) instead of the raw value.
diamonds_train<-diamonds_train[color1==4 & clarity1==8 & cut1==5,]
diamonds_test<-diamonds_test[color1==4 & clarity1==8 & cut1==5,]
Clustering can give better results in tree models, but clustering was not done in this study, so a small subset was selected using the existing categorical variables (color1 == 4, clarity1 == 8, cut1 == 5).
The performance measurement function (root mean square percentage error, named RMPSE in the code) is below. It transforms the predictions and labels back from the log scale and measures the relative error.
RMPSE<- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
elab<-exp(as.numeric(labels))-1
epreds<-exp(as.numeric(preds))-1
err <- sqrt(mean((epreds/elab-1)^2))
return(list(metric = "RMPSE", value = err))
}
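To make the metric concrete, here is a tiny worked example with toy numbers (not from the data): the error is the root mean square of the relative deviations pred/actual - 1.
actual <- c(100, 200, 400)
pred   <- c(110, 180, 420)
# relative deviations are 0.10, -0.10 and 0.05, so the result is about 0.087
sqrt(mean((pred/actual - 1)^2))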
We hold out a small validation sample from the training data; here it is 10 rows out of the 393-row training subset.
nrow(diamonds_train)
## [1] 393
h<-sample(nrow(diamonds_train),10)
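If one preferred a percentage-based holdout instead of a fixed 10 rows, a sketch would be the following (h_alt is a hypothetical name; the rest of the script keeps using h):
# hold out roughly 10% of the training rows (sketch, not used below)
h_alt <- sample(nrow(diamonds_train), round(0.1 * nrow(diamonds_train)))
length(h_alt)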
Creating the xgboost matrices. We add 1 to price before taking the log so that a zero price would not produce -Inf.
dval<-xgb.DMatrix(data=data.matrix(diamonds_train[h,]),label=log(diamonds_train$price+1)[h])
dtrain<-xgb.DMatrix(data=data.matrix(diamonds_train[-h,]),label=log(diamonds_train$price+1)[-h])
watchlist<-list(val=dval,trainset=dtrain)
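One thing worth noting: data.matrix(diamonds_train[h,]) keeps all ten columns, including price and diamond_id, while the prediction step later uses only the other eight columns. A sketch that builds the matrices from exactly those feature columns (dval_alt and dtrain_alt are hypothetical names, not used below) would be:
# build DMatrix objects from the feature columns only (sketch)
feat <- setdiff(names(diamonds_train), c("price", "diamond_id"))
dval_alt <- xgb.DMatrix(data = data.matrix(diamonds_train[h, feat, with = FALSE]),
                        label = log(diamonds_train$price + 1)[h])
dtrain_alt <- xgb.DMatrix(data = data.matrix(diamonds_train[-h, feat, with = FALSE]),
                          label = log(diamonds_train$price + 1)[-h])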
Setting the parameters
param <- list( objective = "reg:linear",
booster = "gbtree",
eta = 0.45, # 0.40, #0.3, #0.2, #0.1, changed from 0.45
max_depth = 15,
subsample = 0.8,
colsample_bytree = 0.8
)
Finding the best seed number for maximum accuracy
selectseed<-function(a,b){
bestseed2 = data.table(s = 201, x = 1.1, y = 1.1)
for (s in a:b){
set.seed(s)
xgbmodel <- xgb.train( params = param,
data = dtrain,
nrounds = 30, #10, #20, #30, #100, # changed from 30
verbose = 0,
early.stop.round = 25,
watchlist = watchlist,
maximize = FALSE,
feval=RMPSE
)
testset<-data.frame(diamonds_test)
feature.names <- names(diamonds_test)[-c(9,10)]
forecasts <- exp(predict(xgbmodel, data.matrix(testset[,feature.names]))) -1
forecasts<-data.table(forecasts)
submission <- data.frame(diamond_id=testset$diamond_id, price=forecasts)
submission<-data.table(submission)
testset<-data.table(testset)
setkey(submission,diamond_id)
setkey(testset,diamond_id)
final<-testset[submission]
final[,Err:=abs(forecasts-price)/price]
final2<-final[,list(sum(price),sum(abs(forecasts-price)),sum(price)),.(diamond_id)]
setnames(final2,"V1","TWS")
setnames(final2,"V2","TE")
setnames(final2,"V3","TP")
x<-final2[,sum(TE)/sum(TWS)]
y<-final[,mean(Err)]
bestseed<-data.table(s,x,y)
bestseed2<-rbind(bestseed,bestseed2)
}
seed<-head(bestseed2[order(x)],1)[]$s
return(seed)
}
Selecting the best seed among the numbers from a to b.
a=1
b=50
seed=selectseed(a,b)
set.seed(seed)
Training the xgboost model
xgbmodel <- xgb.train( params = param,
data = dtrain,
nrounds = 30, #10, #20, #30, #100, # changed from 30
verbose = 1,
early.stop.round = 25,
watchlist = watchlist,
maximize = FALSE,
feval=RMPSE
)
## Warning: 'early.stop.round' is deprecated.
## Use 'early_stopping_rounds' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] val-RMPSE:0.976484 trainset-RMPSE:0.974441
## Multiple eval metrics are present. Will use trainset_RMPSE for early stopping.
## Will train until trainset_RMPSE hasn't improved in 25 rounds.
##
## [2] val-RMPSE:0.875599 trainset-RMPSE:0.867573
## [3] val-RMPSE:0.691290 trainset-RMPSE:0.678613
## [4] val-RMPSE:0.483656 trainset-RMPSE:0.471111
## [5] val-RMPSE:0.317719 trainset-RMPSE:0.303876
## [6] val-RMPSE:0.209534 trainset-RMPSE:0.191597
## [7] val-RMPSE:0.142639 trainset-RMPSE:0.120045
## [8] val-RMPSE:0.105805 trainset-RMPSE:0.077998
## [9] val-RMPSE:0.080789 trainset-RMPSE:0.052816
## [10] val-RMPSE:0.069423 trainset-RMPSE:0.038279
## [11] val-RMPSE:0.070516 trainset-RMPSE:0.029056
## [12] val-RMPSE:0.069985 trainset-RMPSE:0.023453
## [13] val-RMPSE:0.068528 trainset-RMPSE:0.018080
## [14] val-RMPSE:0.066506 trainset-RMPSE:0.014908
## [15] val-RMPSE:0.066628 trainset-RMPSE:0.012023
## [16] val-RMPSE:0.065402 trainset-RMPSE:0.010428
## [17] val-RMPSE:0.065398 trainset-RMPSE:0.008980
## [18] val-RMPSE:0.065973 trainset-RMPSE:0.007404
## [19] val-RMPSE:0.066313 trainset-RMPSE:0.006479
## [20] val-RMPSE:0.066140 trainset-RMPSE:0.005651
## [21] val-RMPSE:0.065900 trainset-RMPSE:0.004804
## [22] val-RMPSE:0.066061 trainset-RMPSE:0.004132
## [23] val-RMPSE:0.065737 trainset-RMPSE:0.003777
## [24] val-RMPSE:0.065572 trainset-RMPSE:0.003134
## [25] val-RMPSE:0.065406 trainset-RMPSE:0.002603
## [26] val-RMPSE:0.065297 trainset-RMPSE:0.002446
## [27] val-RMPSE:0.065111 trainset-RMPSE:0.002066
## [28] val-RMPSE:0.065104 trainset-RMPSE:0.001725
## [29] val-RMPSE:0.065085 trainset-RMPSE:0.001427
## [30] val-RMPSE:0.065210 trainset-RMPSE:0.001223
Setting the feature names in the test data set (all columns except price and diamond_id).
testset<-data.frame(diamonds_test)
feature.names <- names(testset)[-c(9,10)]
Generating the forecasts with the predict function; the exp function converts the values back from the log scale.
forecasts <- exp(predict(xgbmodel, data.matrix(testset[,feature.names]))) -1
forecasts<-data.table(forecasts)
Creating the final table, which contains the forecasts and the actual values.
submission <- data.frame(diamond_id=testset$diamond_id, price=forecasts)
submission<-data.table(submission)
testset<-data.table(testset)
setkey(submission,diamond_id)
setkey(testset,diamond_id)
final<-testset[submission]
Measuring the performance
final[,Err:=abs(forecasts-price)/price]
final2<-final[,list(sum(price),sum(abs(forecasts-price)),sum(forecasts)),.(diamond_id)]
setnames(final2,"V1","TWS") #Total Weekly Sales
setnames(final2,"V2","TE") #Total Errors
setnames(final2,"V3","TF") #Total Forecasts
final2[,sum(TE)/sum(TWS)]
## [1] 0.1574091
final[,mean(Err)]
## [1] 0.1299294
We can see that the average error is only around 15%, which is reasonable. Let's look at it on a graph.
g=ggplot(final,aes(x=forecasts,y=price),)
g=g+xlab("forecasts")
g=g+ylab("Price $")
g=g+geom_point(size=6,colour="black",alpha=0.2)
g=g+geom_point(size=5,colour="blue",alpha=0.2)
g=g+geom_smooth(method="lm",colour="black")
g
As can be seen on the graph, high-priced diamonds are the weak side of our model. Parameter optimization or adding new variables might help reach better results.
And finally, this is the importance matrix of our model. We can see that y is the most important variable; the logarithm of carat and the x and z variables are important too.
model <- xgb.dump(xgbmodel, with.stats = T)
## Warning: 'with.stats' is deprecated.
## Use 'with_stats' instead.
## See help("Deprecated") and help("xgboost-deprecated").
names <- dimnames(data.matrix(testset[,-c(9,10)]))[[2]]
importance_matrix <- xgb.importance(names, model = xgbmodel)
xgb.plot.importance(importance_matrix[1:5,])
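The same information can also be read as a table; xgb.importance returns a data.table in which Gain is the main importance measure:
# top five features as a table (sketch)
importance_matrix[1:5,]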