Diamonds Price Prediction

First of all, we load the required libraries.

library(data.table)
library(ggplot2)
library(ggthemes)
library(tidyverse)
library(highcharter)
library(plotly)
library(xgboost)
library(randomForest)
library(caret)
library(rpart)
library(rpart.plot)
library(rattle)

Next, the train and test data sets are prepared: a stratified 20% sample (within each cut/color/clarity group) becomes the test set, and the remaining rows become the training set.

set.seed(555)

diamonds_test2 <- diamonds %>% mutate(diamond_id = row_number()) %>%
  group_by(cut, color, clarity) %>% sample_frac(0.2) %>% ungroup()

diamonds_train2 <- anti_join(diamonds %>% mutate(diamond_id = row_number()),
                            diamonds_test2, by = "diamond_id")

diamonds_train2<-data.table(diamonds_train2)
diamonds_test2<-data.table(diamonds_test2)
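
As a quick sanity check (a sketch, not part of the original output), we can confirm that roughly 20% of every cut/color/clarity stratum landed in the test set:

# hypothetical check: share of each stratum that went to the test set
split_check <- merge(
  diamonds_train2[, .N, by = .(cut, color, clarity)],
  diamonds_test2[, .N, by = .(cut, color, clarity)],
  by = c("cut", "color", "clarity"), suffixes = c("_train", "_test")
)
split_check[, test_share := N_test / (N_train + N_test)]
summary(split_check$test_share)  # should be close to 0.20 in every stratum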

You can see the summary of the train and test data sets below.

summary(diamonds_train2)
##      carat               cut        color       clarity     
##  Min.   :0.2000   Fair     : 1285   D:5416   SI1    :10449  
##  1st Qu.:0.4000   Good     : 3923   E:7835   VS2    : 9806  
##  Median :0.7000   Very Good: 9662   F:7629   SI2    : 7354  
##  Mean   :0.7977   Premium  :11036   G:9037   VS1    : 6538  
##  3rd Qu.:1.0400   Ideal    :17237   H:6646   VVS2   : 4052  
##  Max.   :5.0100                     I:4336   VVS1   : 2923  
##                                     J:2244   (Other): 2021  
##      depth           table           price               x        
##  Min.   :43.00   Min.   :43.00   Min.   :  326.0   Min.   : 0.00  
##  1st Qu.:61.00   1st Qu.:56.00   1st Qu.:  947.5   1st Qu.: 4.71  
##  Median :61.80   Median :57.00   Median : 2398.0   Median : 5.70  
##  Mean   :61.75   Mean   :57.46   Mean   : 3933.1   Mean   : 5.73  
##  3rd Qu.:62.50   3rd Qu.:59.00   3rd Qu.: 5325.0   3rd Qu.: 6.54  
##  Max.   :79.00   Max.   :95.00   Max.   :18823.0   Max.   :10.74  
##                                                                   
##        y                z            diamond_id   
##  Min.   : 0.000   Min.   : 0.000   Min.   :    1  
##  1st Qu.: 4.720   1st Qu.: 2.910   1st Qu.:13457  
##  Median : 5.710   Median : 3.520   Median :26997  
##  Mean   : 5.734   Mean   : 3.538   Mean   :26955  
##  3rd Qu.: 6.540   3rd Qu.: 4.040   3rd Qu.:40415  
##  Max.   :58.900   Max.   :31.800   Max.   :53940  
## 
summary(diamonds_test2)
##      carat               cut       color       clarity         depth      
##  Min.   :0.2000   Fair     : 325   D:1359   SI1    :2616   Min.   :43.00  
##  1st Qu.:0.4000   Good     : 983   E:1962   VS2    :2452   1st Qu.:61.00  
##  Median :0.7000   Very Good:2420   F:1913   SI2    :1840   Median :61.80  
##  Mean   :0.7989   Premium  :2755   G:2255   VS1    :1633   Mean   :61.75  
##  3rd Qu.:1.0400   Ideal    :4314   H:1658   VVS2   :1014   3rd Qu.:62.50  
##  Max.   :4.1300                    I:1086   VVS1   : 732   Max.   :73.60  
##                                    J: 564   (Other): 510                  
##      table           price             x                y        
##  Min.   :49.00   Min.   :  327   Min.   : 3.810   Min.   :3.780  
##  1st Qu.:56.00   1st Qu.:  956   1st Qu.: 4.720   1st Qu.:4.730  
##  Median :57.00   Median : 2434   Median : 5.700   Median :5.720  
##  Mean   :57.46   Mean   : 3932   Mean   : 5.734   Mean   :5.736  
##  3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540   3rd Qu.:6.540  
##  Max.   :69.00   Max.   :18818   Max.   :10.010   Max.   :9.940  
##                                                                  
##        z           diamond_id   
##  Min.   :2.240   Min.   :    3  
##  1st Qu.:2.910   1st Qu.:13648  
##  Median :3.530   Median :26870  
##  Mean   :3.541   Mean   :27033  
##  3rd Qu.:4.030   3rd Qu.:40627  
##  Max.   :6.430   Max.   :53937  
## 

As you can see there are no NA values; some of the variables are numeric and some are categorical.

diamonds_train2[,length(diamond_id)]
## [1] 43143
diamonds_train2[,length(unique(diamond_id))]
## [1] 43143

When we check the IDs there is no problem with uniqueness: every diamond_id appears exactly once.

ggplot(diamonds_train2, aes(x = price)) +
  geom_histogram(color = "blue", fill = "blue", binwidth = 25) +
  scale_x_continuous(breaks = seq(0, 5000, 500)) +
  theme(axis.text.x = element_text(angle = 90)) +
  coord_cartesian(c(0, 5000)) +
  facet_grid(cut ~ color) +
  xlab("Price") + ylab("Count")

In this graph we can see that the Ideal cut is the most common cut at every color.

diamonds_train2[x<=0 & y<=0 & z<=0,]
##    carat       cut color clarity depth table price x y z diamond_id
## 1:  1.00 Very Good     H     VS2  63.3    53  5139 0 0 0      11964
## 2:  1.14      Fair     G     VS1  57.5    67  6381 0 0 0      15952
## 3:  1.56     Ideal     G     VS2  62.2    54 12800 0 0 0      24521
## 4:  1.20   Premium     D    VVS1  62.1    59 15686 0 0 0      26244
## 5:  2.25   Premium     H     SI2  62.8    59 18034 0 0 0      27430
## 6:  0.71      Good     F     SI2  64.1    60  2130 0 0 0      49557
## 7:  0.71      Good     F     SI2  64.1    60  2130 0 0 0      49558

There are some diamonds whose x, y and z values are all zero. It is better to remove them.

diamonds_train2 <- diamonds_train2[x > 0 | y > 0 | z > 0, ]  # drops the all-zero rows found above
summary(diamonds$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     950    2401    3933    5324   18823
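
Note that the filter above only removes rows where all three dimensions are zero. A stricter cleanup (a sketch, not applied in this analysis) would also drop rows where any single dimension is zero:

# hypothetical stricter filter: require every dimension to be positive
diamonds_train2_strict <- diamonds_train2[x > 0 & y > 0 & z > 0, ]
nrow(diamonds_train2) - nrow(diamonds_train2_strict)  # rows that would also be dropped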
ggplot(diamonds_train2, aes(x = price)) +
  geom_histogram(color = "black", fill = "Blue", binwidth = 500) +
  scale_x_continuous(breaks = seq(0, 20000, 500)) +
  theme(axis.text.x = element_text(angle = 90)) +
  xlab("Price") + ylab("Count")

When we check the price we can see that its range is quite wide and the distribution is right-skewed, so we take the log of price.
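
A quick companion plot (a sketch, not in the original output) shows that the log-price distribution is much closer to symmetric:

# hypothetical companion plot: histogram of log(price)
ggplot(diamonds_train2, aes(x = log(price))) +
  geom_histogram(color = "black", fill = "blue", bins = 50) +
  xlab("log(Price)") + ylab("Count")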

# cut, color and clarity are ordered factors in ggplot2::diamonds, so their
# integer codes already follow the level order (Fair = 1 ... Ideal = 5,
# D = 1 ... J = 7, I1 = 1 ... IF = 8), exactly matching the manual mapping
diamonds_train2[, `:=`(cut1 = as.integer(cut),
                       color1 = as.integer(color),
                       clarity1 = as.integer(clarity))]

diamonds_test2[, `:=`(cut1 = as.integer(cut),
                      color1 = as.integer(color),
                      clarity1 = as.integer(clarity))]

For modelling, these ordered categories are recoded as integers in both the train and test sets; for example color D (actually the best color grade) becomes 1. After this step the codes are converted to factors again.
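
As a quick sanity check (a sketch), the new codes can be cross-tabulated against the original factors; each grade should map to exactly one code:

# hypothetical check: cross-tabulate the original color grades against the codes
diamonds_train2[, table(color, color1)]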

diamonds_train2[, carat_log := log(carat)]
diamonds_test2[, carat_log := log(carat)]

diamonds_train<-diamonds_train2[,list(carat_log,cut1,color1,clarity1,depth,x,y,z,price,diamond_id)]
diamonds_test<-diamonds_test2[,list(carat_log,cut1,color1,clarity1,depth,x,y,z,price,diamond_id)]

diamonds_train$cut1<-as.factor(diamonds_train$cut1)
diamonds_train$color1<-as.factor(diamonds_train$color1)
diamonds_train$clarity1<-as.factor(diamonds_train$clarity1)

diamonds_test$cut1<-as.factor(diamonds_test$cut1)
diamonds_test$color1<-as.factor(diamonds_test$color1)
diamonds_test$clarity1<-as.factor(diamonds_test$clarity1)

Carat is also an important variable; after looking at its summary, the logarithmic version (created above) is used instead of the raw value because the distribution is strongly right-skewed.
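
A quick comparison of the raw and logged variable (a sketch, not in the original output):

# hypothetical comparison: raw carat is right-skewed, log(carat) much less so
summary(diamonds_train2$carat)
summary(diamonds_train2$carat_log)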

# keep a single category combination: color G (4), clarity IF (8), cut Ideal (5)
diamonds_train <- diamonds_train[color1 == 4 & clarity1 == 8 & cut1 == 5, ]
diamonds_test  <- diamonds_test[color1 == 4 & clarity1 == 8 & cut1 == 5, ]

Clustering can give better results in tree models, but clustering was not done in this study; instead, a small subset was selected using the existing categorical variables (Ideal cut, color G, clarity IF).

The performance measurement function is below: the root mean square percentage error (RMSPE), sqrt(mean((pred/actual - 1)^2)), computed after converting the log labels and predictions back to the price scale.

RMPSE <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  elab   <- exp(as.numeric(labels)) - 1  # back-transform labels from log(price + 1)
  epreds <- exp(as.numeric(preds)) - 1   # back-transform predictions
  err <- sqrt(mean((epreds / elab - 1)^2))
  return(list(metric = "RMPSE", value = err))
}
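
As a toy check of the metric (a sketch with made-up numbers): with actual prices 100 and 200 and predictions 110 and 180, the percentage errors are +10% and -10%, so the value should be exactly 0.10.

# hypothetical toy example; the feature matrix is a dummy, only the label matters
toy <- xgb.DMatrix(data = matrix(0, nrow = 2, ncol = 1),
                   label = log(c(100, 200) + 1))
RMPSE(preds = log(c(110, 180) + 1), dtrain = toy)  # list(metric = "RMPSE", value = 0.1)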

We hold out a small validation sample of 10 rows from the 393-row training set.

nrow(diamonds_train)
## [1] 393
h<-sample(nrow(diamonds_train),10)

Creating the xgboost matrices. We add 1 to price before taking the log so that a zero price would not map to -Inf.

# exclude price (the target) and diamond_id from the feature matrix
feature.names <- setdiff(names(diamonds_train), c("price", "diamond_id"))

dval   <- xgb.DMatrix(data  = data.matrix(diamonds_train[h, feature.names, with = FALSE]),
                      label = log(diamonds_train$price + 1)[h])
dtrain <- xgb.DMatrix(data  = data.matrix(diamonds_train[-h, feature.names, with = FALSE]),
                      label = log(diamonds_train$price + 1)[-h])
watchlist <- list(val = dval, trainset = dtrain)
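
A quick dimension check (a sketch) confirms the split: 10 validation rows and 383 training rows, each with 8 features:

# hypothetical shape check on the DMatrix objects
dim(dval)    # 10 x 8
dim(dtrain)  # 383 x 8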

Setting the parameters

param <- list(objective        = "reg:linear",
              booster          = "gbtree",
              eta              = 0.45,  # learning rate; values from 0.1 to 0.45 were tried
              max_depth        = 15,
              subsample        = 0.8,   # row subsampling per tree
              colsample_bytree = 0.8    # column subsampling per tree
)

Finding the seed number that gives the best accuracy. The function below trains a model for each seed in a range and scores it on the test set. (Note that choosing a seed by test-set performance risks overfitting to the test set.)

selectseed <- function(a, b){

  bestseed2 <- data.table(s = 201, x = 1.1, y = 1.1)  # dummy starting row

  for (s in a:b){

    set.seed(s)

    xgbmodel <- xgb.train(params                = param,
                          data                  = dtrain,
                          nrounds               = 30,
                          verbose               = 0,
                          early_stopping_rounds = 25,
                          watchlist             = watchlist,
                          maximize              = FALSE,
                          feval                 = RMPSE
    )
    testset <- data.frame(diamonds_test)

    feature.names <- names(diamonds_test)[-c(9, 10)]  # drop price and diamond_id

    forecasts <- exp(predict(xgbmodel, data.matrix(testset[, feature.names]))) - 1

    forecasts <- data.table(forecasts)

    submission <- data.frame(diamond_id = testset$diamond_id, price = forecasts)

    submission <- data.table(submission)
    testset <- data.table(testset)

    setkey(submission, diamond_id)
    setkey(testset, diamond_id)

    final <- testset[submission]

    final[, Err := abs(forecasts - price) / price]
    final2 <- final[, list(sum(price), sum(abs(forecasts - price)), sum(forecasts)), .(diamond_id)]

    setnames(final2, "V1", "TWS")
    setnames(final2, "V2", "TE")
    setnames(final2, "V3", "TF")

    x <- final2[, sum(TE) / sum(TWS)]
    y <- final[, mean(Err)]

    bestseed <- data.table(s, x, y)
    bestseed2 <- rbind(bestseed, bestseed2)

  }

  seed <- bestseed2[order(x)][1, s]
  return(seed)
}

Searching the seeds from a to b and setting the best one:

a <- 1
b <- 50
seed <- selectseed(a, b)
set.seed(seed)

Training the xgboost model

xgbmodel <- xgb.train(params                = param,
                      data                  = dtrain,
                      nrounds               = 30,
                      verbose               = 1,
                      early_stopping_rounds = 25,
                      watchlist             = watchlist,
                      maximize              = FALSE,
                      feval                 = RMPSE
)
## [1]  val-RMPSE:0.976484  trainset-RMPSE:0.974441 
## Multiple eval metrics are present. Will use trainset_RMPSE for early stopping.
## Will train until trainset_RMPSE hasn't improved in 25 rounds.
## 
## [2]  val-RMPSE:0.875599  trainset-RMPSE:0.867573 
## [3]  val-RMPSE:0.691290  trainset-RMPSE:0.678613 
## [4]  val-RMPSE:0.483656  trainset-RMPSE:0.471111 
## [5]  val-RMPSE:0.317719  trainset-RMPSE:0.303876 
## [6]  val-RMPSE:0.209534  trainset-RMPSE:0.191597 
## [7]  val-RMPSE:0.142639  trainset-RMPSE:0.120045 
## [8]  val-RMPSE:0.105805  trainset-RMPSE:0.077998 
## [9]  val-RMPSE:0.080789  trainset-RMPSE:0.052816 
## [10] val-RMPSE:0.069423  trainset-RMPSE:0.038279 
## [11] val-RMPSE:0.070516  trainset-RMPSE:0.029056 
## [12] val-RMPSE:0.069985  trainset-RMPSE:0.023453 
## [13] val-RMPSE:0.068528  trainset-RMPSE:0.018080 
## [14] val-RMPSE:0.066506  trainset-RMPSE:0.014908 
## [15] val-RMPSE:0.066628  trainset-RMPSE:0.012023 
## [16] val-RMPSE:0.065402  trainset-RMPSE:0.010428 
## [17] val-RMPSE:0.065398  trainset-RMPSE:0.008980 
## [18] val-RMPSE:0.065973  trainset-RMPSE:0.007404 
## [19] val-RMPSE:0.066313  trainset-RMPSE:0.006479 
## [20] val-RMPSE:0.066140  trainset-RMPSE:0.005651 
## [21] val-RMPSE:0.065900  trainset-RMPSE:0.004804 
## [22] val-RMPSE:0.066061  trainset-RMPSE:0.004132 
## [23] val-RMPSE:0.065737  trainset-RMPSE:0.003777 
## [24] val-RMPSE:0.065572  trainset-RMPSE:0.003134 
## [25] val-RMPSE:0.065406  trainset-RMPSE:0.002603 
## [26] val-RMPSE:0.065297  trainset-RMPSE:0.002446 
## [27] val-RMPSE:0.065111  trainset-RMPSE:0.002066 
## [28] val-RMPSE:0.065104  trainset-RMPSE:0.001725 
## [29] val-RMPSE:0.065085  trainset-RMPSE:0.001427 
## [30] val-RMPSE:0.065210  trainset-RMPSE:0.001223

Setting the feature names for the test data set (dropping the price and diamond_id columns):

testset<-data.frame(diamonds_test)
feature.names <- names(testset)[-c(9, 10)]  # all columns except price and diamond_id

Generating the forecasts with the predict function; exp() converts the predictions from the log scale back to prices (minus the 1 we added earlier).

forecasts <- exp(predict(xgbmodel, data.matrix(testset[,feature.names]))) -1
forecasts<-data.table(forecasts)
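
The back-transform is the exact inverse of the label transform, as a one-line sanity check (a sketch) shows:

# hypothetical check: exp(log(p + 1)) - 1 recovers p exactly
p <- c(326, 2401, 18823)
exp(log(p + 1)) - 1  # returns 326 2401 18823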

Creating the final table, which contains the forecasts and the actual values.

submission <- data.frame(diamond_id=testset$diamond_id, price=forecasts)

submission<-data.table(submission)
testset<-data.table(testset)

setkey(submission,diamond_id)
setkey(testset,diamond_id)

final<-testset[submission]

Measuring the performance

final[, Err := abs(forecasts - price) / price]
final2 <- final[, list(sum(price), sum(abs(forecasts - price)), sum(forecasts)), .(diamond_id)]

setnames(final2, "V1", "TWS") # total actual price
setnames(final2, "V2", "TE")  # total absolute error
setnames(final2, "V3", "TF")  # total forecasts

final2[,sum(TE)/sum(TWS)]
## [1] 0.1574091
final[,mean(Err)]
## [1] 0.1299294
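
For comparison with the metric used during training, the same RMSPE can also be computed directly on the final table (a sketch):

# hypothetical extra check: RMSPE of the forecasts against the actual prices
final[, sqrt(mean((forecasts / price - 1)^2))]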

We can see that the weighted error (total absolute error over total price) is about 15.7% and the mean percentage error per diamond is about 13%, which sounds reasonable. Let's look at it on a graph.

g <- ggplot(final, aes(x = forecasts, y = price)) +
  xlab("Forecasts") +
  ylab("Price $") +
  geom_point(size = 6, colour = "black", alpha = 0.2) +
  geom_point(size = 5, colour = "blue", alpha = 0.2) +
  geom_smooth(method = "lm", colour = "black")
g

As can be seen on the graph, high-priced diamonds are the weak side of our model. Parameter optimization or adding new variables might help reach better results.

And finally, this is the importance matrix of our model. We can see that the y variable is the most important; the logarithm of carat and the x and z variables are important too.

model <- xgb.dump(xgbmodel, with_stats = TRUE)  # text dump of the trees (not needed for the plot below)
names <- dimnames(data.matrix(testset[,-c(9,10)]))[[2]]
importance_matrix <- xgb.importance(names, model = xgbmodel)
xgb.plot.importance(importance_matrix[1:5,])