library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
library(corrplot)
setwd("/Users/yetkineser/Desktop/mef R/data")
train <- read.csv('bigMartTrain.csv')
test <- read.csv('bigMartTest.csv')
glimpse(train)
## Observations: 8,523
## Variables: 12
## $ Item_Identifier <fctr> FDA15, DRC01, FDN15, FDX07, NCD19, ...
## $ Item_Weight <dbl> 9.300, 5.920, 17.500, 19.200, 8.930,...
## $ Item_Fat_Content <fctr> Low Fat, Regular, Low Fat, Regular,...
## $ Item_Visibility <dbl> 0.016047301, 0.019278216, 0.01676007...
## $ Item_Type <fctr> Dairy, Soft Drinks, Meat, Fruits an...
## $ Item_MRP <dbl> 249.8092, 48.2692, 141.6180, 182.095...
## $ Outlet_Identifier <fctr> OUT049, OUT018, OUT049, OUT010, OUT...
## $ Outlet_Establishment_Year <int> 1999, 2009, 1999, 1998, 1987, 2009, ...
## $ Outlet_Size <fctr> Medium, Medium, Medium, , High, Med...
## $ Outlet_Location_Type <fctr> Tier 1, Tier 3, Tier 1, Tier 3, Tie...
## $ Outlet_Type <fctr> Supermarket Type1, Supermarket Type...
## $ Item_Outlet_Sales <dbl> 3735.1380, 443.4228, 2097.2700, 732....
Our dataset has 12 columns and 8523 rows Look at all columns, Firstly factor type columns
Item_Identifier : Unique Product ID
train %>%
summarise(n_distinct(Item_Identifier))
## n_distinct(Item_Identifier)
## 1 1559
train %>%
group_by(Item_Fat_Content) %>%
summarise(Count = n(),Perc=round(n()/nrow(.)*100,2)) %>%
arrange(desc(Count))
## # A tibble: 5 x 3
## Item_Fat_Content Count Perc
## <fctr> <int> <dbl>
## 1 Low Fat 5089 59.71
## 2 Regular 2889 33.90
## 3 LF 316 3.71
## 4 reg 117 1.37
## 5 low fat 112 1.31
train%>%
group_by(Item_Type) %>%
summarise(Count = n(), Perc = round(n() / nrow(.) * 100, 2)) %>%
arrange(desc(Count))
## # A tibble: 16 x 3
## Item_Type Count Perc
## <fctr> <int> <dbl>
## 1 Fruits and Vegetables 1232 14.46
## 2 Snack Foods 1200 14.08
## 3 Household 910 10.68
## 4 Frozen Foods 856 10.04
## 5 Dairy 682 8.00
## 6 Canned 649 7.61
## 7 Baking Goods 648 7.60
## 8 Health and Hygiene 520 6.10
## 9 Soft Drinks 445 5.22
## 10 Meat 425 4.99
## 11 Breads 251 2.94
## 12 Hard Drinks 214 2.51
## 13 Others 169 1.98
## 14 Starchy Foods 148 1.74
## 15 Breakfast 110 1.29
## 16 Seafood 64 0.75
train %>%
group_by(Outlet_Identifier) %>%
summarise(Count = n(), Perc = round(n() / nrow(.) * 100, 2)) %>%
arrange(desc(Count))
## # A tibble: 10 x 3
## Outlet_Identifier Count Perc
## <fctr> <int> <dbl>
## 1 OUT027 935 10.97
## 2 OUT013 932 10.94
## 3 OUT035 930 10.91
## 4 OUT046 930 10.91
## 5 OUT049 930 10.91
## 6 OUT045 929 10.90
## 7 OUT018 928 10.89
## 8 OUT017 926 10.86
## 9 OUT010 555 6.51
## 10 OUT019 528 6.20
train%>%
group_by(Outlet_Size) %>%
summarise(Count = n(),Perc = round(n() / nrow(.) * 100, 2)) %>%
arrange(desc(Count))
## # A tibble: 4 x 3
## Outlet_Size Count Perc
## <fctr> <int> <dbl>
## 1 Medium 2793 32.77
## 2 2410 28.28
## 3 Small 2388 28.02
## 4 High 932 10.94
train%>%
group_by(Outlet_Location_Type) %>%
summarise(Count = n(),Perc = round(n()/nrow(.) * 100, 2)) %>%
arrange(desc(Count))
## # A tibble: 3 x 3
## Outlet_Location_Type Count Perc
## <fctr> <int> <dbl>
## 1 Tier 3 3350 39.31
## 2 Tier 2 2785 32.68
## 3 Tier 1 2388 28.02
train%>%
group_by(Outlet_Type)%>%
summarise(Count=n(),Perc=round(n()/nrow(.)*100,2))%>%
arrange(desc(Count))
## # A tibble: 4 x 3
## Outlet_Type Count Perc
## <fctr> <int> <dbl>
## 1 Supermarket Type1 5577 65.43
## 2 Grocery Store 1083 12.71
## 3 Supermarket Type3 935 10.97
## 4 Supermarket Type2 928 10.89
train%>%
summarise(is_NULL=sum(is.na(Item_Weight)==1),
is_NOT_NULL=sum(!is.na(Item_Weight)==1)
)
## is_NULL is_NOT_NULL
## 1 1463 7060
train%>%
filter(!is.na(Item_Weight))%>%
summarise(
Max=max(Item_Weight),
Min=min(Item_Weight),
Mean=mean(Item_Weight),
Median=median(Item_Weight),
QUA1=quantile(Item_Weight,1/4),
QUA3=quantile(Item_Weight,3/4),
IQR=IQR(Item_Weight)
)
## Max Min Mean Median QUA1 QUA3 IQR
## 1 21.35 4.555 12.85765 12.6 8.77375 16.85 8.07625
train%>%
summarise(is_NULL=sum(is.na(Item_Visibility)==1),
is_NOT_NULL=sum(!is.na(Item_Visibility)==1)
)
## is_NULL is_NOT_NULL
## 1 0 8523
train%>%
filter(!is.na(Item_Visibility))%>%
summarise(
Max=max(Item_Visibility),
Min=min(Item_Visibility),
Mean=mean(Item_Visibility),
Median=median(Item_Visibility),
QUA1=quantile(Item_Visibility,1/4),
QUA3=quantile(Item_Visibility,3/4),
IQR=IQR(Item_Visibility)
)
## Max Min Mean Median QUA1 QUA3 IQR
## 1 0.3283909 0 0.06613203 0.05393093 0.02698948 0.09458529 0.06759582
train%>%
summarise(is_NULL=sum(is.na(Item_MRP)==1),
is_NOT_NULL=sum(!is.na(Item_MRP)==1)
)
## is_NULL is_NOT_NULL
## 1 0 8523
train%>%
filter(!is.na(Item_MRP))%>%
summarise(
Max=max(Item_MRP),
Min=min(Item_MRP),
Mean=mean(Item_MRP),
Median=median(Item_MRP),
QUA1=quantile(Item_MRP,1/4),
QUA3=quantile(Item_MRP,3/4),
IQR=IQR(Item_MRP)
)
## Max Min Mean Median QUA1 QUA3 IQR
## 1 266.8884 31.29 140.9928 143.0128 93.8265 185.6437 91.8172
train%>%
summarise(is_NULL=sum(is.na(Outlet_Establishment_Year)==1),
is_NOT_NULL=sum(!is.na(Outlet_Establishment_Year)==1)
)
## is_NULL is_NOT_NULL
## 1 0 8523
train%>%
filter(!is.na(Outlet_Establishment_Year))%>%
summarise(
Max=max(Outlet_Establishment_Year),
Min=min(Outlet_Establishment_Year),
Mean=mean(Outlet_Establishment_Year),
Median=median(Outlet_Establishment_Year),
QUA1=quantile(Outlet_Establishment_Year,1/4),
QUA3=quantile(Outlet_Establishment_Year,3/4),
IQR=IQR(Outlet_Establishment_Year)
)
## Max Min Mean Median QUA1 QUA3 IQR
## 1 2009 1985 1997.832 1999 1987 2004 17
train%>%
summarise(is_NULL=sum(is.na(Item_Outlet_Sales)==1),
is_NOT_NULL=sum(!is.na(Item_Outlet_Sales)==1)
)
## is_NULL is_NOT_NULL
## 1 0 8523
train%>%
filter(!is.na(Item_Outlet_Sales))%>%
summarise(
Max=max(Item_Outlet_Sales),
Min=min(Item_Outlet_Sales),
Mean=mean(Item_Outlet_Sales),
Median=median(Item_Outlet_Sales),
QUA1=quantile(Item_Outlet_Sales,1/4),
QUA3=quantile(Item_Outlet_Sales,3/4),
IQR=IQR(Item_Outlet_Sales)
)
## Max Min Mean Median QUA1 QUA3 IQR
## 1 13086.96 33.29 2181.289 1794.331 834.2474 3101.296 2267.049
summary(train)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## FDG33 : 10 Min. : 4.555 LF : 316 Min. :0.00000
## FDW13 : 10 1st Qu.: 8.774 Low Fat:5089 1st Qu.:0.02699
## DRE49 : 9 Median :12.600 Regular:2889 Median :0.05393
## DRN47 : 9 Mean :12.858 low fat: 112 Mean :0.06613
## FDD38 : 9 3rd Qu.:16.850 reg : 117 3rd Qu.:0.09459
## FDF52 : 9 Max. :21.350 Max. :0.32839
## (Other):8467 NA's :1463
## Item_Type Item_MRP Outlet_Identifier
## Fruits and Vegetables:1232 Min. : 31.29 OUT027 : 935
## Snack Foods :1200 1st Qu.: 93.83 OUT013 : 932
## Household : 910 Median :143.01 OUT035 : 930
## Frozen Foods : 856 Mean :140.99 OUT046 : 930
## Dairy : 682 3rd Qu.:185.64 OUT049 : 930
## Canned : 649 Max. :266.89 OUT045 : 929
## (Other) :2994 (Other):2937
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## Min. :1985 :2410 Tier 1:2388
## 1st Qu.:1987 High : 932 Tier 2:2785
## Median :1999 Medium:2793 Tier 3:3350
## Mean :1998 Small :2388
## 3rd Qu.:2004
## Max. :2009
##
## Outlet_Type Item_Outlet_Sales
## Grocery Store :1083 Min. : 33.29
## Supermarket Type1:5577 1st Qu.: 834.25
## Supermarket Type2: 928 Median : 1794.33
## Supermarket Type3: 935 Mean : 2181.29
## 3rd Qu.: 3101.30
## Max. :13086.97
##
test$Item_Outlet_Sales <- -999
combi <- rbind(train, test)
combi$Item_Weight[is.na(combi$Item_Weight)] <- median(combi$Item_Weight, na.rm = TRUE)
table(is.na(combi$Item_Weight))
##
## FALSE
## 14204
combi$Item_Visibility <- ifelse(combi$Item_Visibility == 0,
median(combi$Item_Visibility), combi$Item_Visibility)
combi$Item_Fat_Content <- str_replace(
str_replace(
str_replace(combi$Item_Fat_Content,"LF","Low Fat")
,"reg","Regular"),"low fat","Low Fat")
table(combi$Item_Fat_Content)
##
## Low Fat Regular
## 9185 5019
##first two and three letter.
combi <-
combi %>%
mutate(Item_Identifier_Str3 = substr(Item_Identifier,1,3), #First three letter of Item_Identifier.
Item_Identifier_Str2 = substr(Item_Identifier,1,2), #First second letter of Item_Identifier.
Item_Identifier_Num=as.numeric(substr(Item_Identifier,4,6)), # Number part of Item_Identifier column.
Outlet_Age=2013-Outlet_Establishment_Year, #Outlet Age
PK=row_number())
#table(combi$Item_Identifier_Str3)
table(combi$Item_Identifier_Str2)
##
## DR FD NC
## 1317 10201 2686
#combi %>% summarise(n_distinct(Item_Identifier_Str3))
#Split combined data into train and test data again to examine clearly.
new_train <- combi %>%
filter(Item_Outlet_Sales != -999)
new_test <- combi %>%
filter(Item_Outlet_Sales == -999)
#dim(new_train)
#dim(new_test)
# Looking at new Item_Fat_Content column.
qplot(x=Item_Fat_Content,data=new_train)
# Looking at Item Type.
qplot(x=Item_Type,data=new_train)+
geom_bar(color="green")+
theme(axis.text = element_text(angle = 0,color="purple"))+
coord_flip()
# Looking at Item Type with facet wrap according to Item_Identifier_Str
qplot(x=Item_Type,data=new_train)+
geom_bar(color="green")+
theme(axis.text = element_text(color="purple"))+
coord_flip()+
facet_wrap(~Item_Identifier_Str2,nrow=1)
# Looking at Item Type with facet wrap according to Outlet Identifier.
qplot(x = Item_Type, data = new_train) +
geom_bar(color = "green") +
theme(axis.text = element_text(angle = 0,color = "purple")) +
coord_flip() +
facet_wrap(~Outlet_Identifier, nrow = 2)
qplot(x = Item_Identifier_Str2, data = new_train) +
geom_bar(color = "green") +
theme(axis.text = element_text(angle = 0,color = "purple")) +
coord_flip() +
facet_wrap(~Outlet_Identifier, nrow = 2)
# Looking at Item_Outlet_Sales
qplot(x = Item_Outlet_Sales, data = new_train, binwidth = 250) +
geom_bar(color = "green")+
theme(axis.text = element_text(angle = 90,color = "purple")) +
scale_x_continuous(limits = c(0, 10000), breaks = seq(0, 10000, 500))
# It is better now like normal distibution with sqrt and log10
qplot(x = sqrt(Item_Outlet_Sales), data = new_train, binwidth = 1,
ylab = "Count Of Sales",
xlab = "SQRT of Outlet Sales") +
geom_bar(color = "green") +
theme(axis.text.x = element_text(angle = 90,color = "purple"),
axis.text.y = element_text(angle = 30,color = "tomato")) +
scale_x_continuous(limits = c(0,100), breaks = seq(0,70,15)) +
scale_y_continuous(limits = c(0,200), breaks = seq(0,200,100))
qplot(x = log10(Item_Outlet_Sales+1), data = new_train, binwidth = 0.01,
ylab = "Count Of Sales",
xlab = "LOG10 of Outlet Sales") +
geom_bar(color = "green") +
theme(axis.text.x = element_text(angle = 90, color = "purple"),
axis.text.y = element_text(angle = 30, color = "tomato")) +
scale_x_continuous(limits = c(1.5,4.5), breaks = seq(1.5, 4.5, 0.5)) +
scale_y_continuous(limits = c(0,200), breaks = seq(0, 200, 100))
qplot(x = Item_Outlet_Sales, data = new_train,binwidth = 250)+
geom_bar(color = "green")+
theme(axis.text.x = element_text(angle = 90,color = "purple"),
axis.text.y = element_text(angle = 30,color = "tomato")) +
scale_x_continuous(limits = c(0, 5000), breaks = seq(0, 5000, 500)) +
scale_y_continuous(limits = c(0, 250), breaks = seq(0, 200, 100)) +
facet_wrap(~Outlet_Identifier, nrow = 5)
# Looking at Item_Outlet_Sales according to Outlet_Size
qplot(x = Item_Outlet_Sales, data = new_train,binwidth = 250) +
geom_bar(color = "green") +
theme(axis.text.x = element_text(angle = 90, color = "purple"),
axis.text.y = element_text(angle = 30, color = "tomato")) +
scale_x_continuous(limits = c(0, 5000), breaks = seq(0, 5000, 500)) +
scale_y_continuous(limits = c(0, 250), breaks = seq(0, 200, 100)) +
facet_wrap(~Outlet_Size, nrow = 4)
by(new_train$Item_Outlet_Sales, new_train$Outlet_Size, summary)
## new_train$Outlet_Size:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 33.29 554.78 1443.45 1822.63 2681.51 9664.75
## --------------------------------------------------------
## new_train$Outlet_Size: High
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 73.24 1072.60 2050.66 2298.99 3166.38 10256.65
## --------------------------------------------------------
## new_train$Outlet_Size: Medium
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 69.24 1270.35 2251.07 2681.60 3691.20 13086.97
## --------------------------------------------------------
## new_train$Outlet_Size: Small
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 33.96 601.05 1544.66 1912.15 2824.32 9779.94
qplot(x = Item_Outlet_Sales, data = new_train, binwidth = 0.01,
ylab = "Count Of Sales",
xlab = "Log10 of Outlet Sales",
geom = "freqpoly",
color = Outlet_Size) +
geom_bar(color = "green") +
theme(axis.text.x = element_text(angle = 90, color = "purple"),
axis.text.y = element_text(angle = 30, color = "tomato")) +
scale_x_continuous(limits = c(1.5,4.5), breaks = seq(1.5,4.5,0.5)) +
scale_y_continuous(limits = c(0,40), breaks = seq(0,40,10)) +
scale_x_log10()
qplot(x = Outlet_Size, y = Item_Outlet_Sales,
data = new_train,
geom = "boxplot") +
theme(axis.text.x = element_text(angle = 90, color = "purple"),
axis.text.y = element_text(angle = 30, color = "tomato"))
Note : In this chapter, we used analyticsvidhya.com kernels
Before we start to create new columns, look at our data types.
new_combi <- rbind(new_train, new_test)
glimpse(new_combi)
## Observations: 14,204
## Variables: 17
## $ Item_Identifier <fctr> FDA15, DRC01, FDN15, FDX07, NCD19, ...
## $ Item_Weight <dbl> 9.300, 5.920, 17.500, 19.200, 8.930,...
## $ Item_Fat_Content <chr> "Low Fat", "Regular", "Low Fat", "Re...
## $ Item_Visibility <dbl> 0.016047301, 0.019278216, 0.01676007...
## $ Item_Type <fctr> Dairy, Soft Drinks, Meat, Fruits an...
## $ Item_MRP <dbl> 249.8092, 48.2692, 141.6180, 182.095...
## $ Outlet_Identifier <fctr> OUT049, OUT018, OUT049, OUT010, OUT...
## $ Outlet_Establishment_Year <int> 1999, 2009, 1999, 1998, 1987, 2009, ...
## $ Outlet_Size <fctr> Medium, Medium, Medium, , High, Med...
## $ Outlet_Location_Type <fctr> Tier 1, Tier 3, Tier 1, Tier 3, Tie...
## $ Outlet_Type <fctr> Supermarket Type1, Supermarket Type...
## $ Item_Outlet_Sales <dbl> 3735.1380, 443.4228, 2097.2700, 732....
## $ Item_Identifier_Str3 <chr> "FDA", "DRC", "FDN", "FDX", "NCD", "...
## $ Item_Identifier_Str2 <chr> "FD", "DR", "FD", "FD", "NC", "FD", ...
## $ Item_Identifier_Num <dbl> 15, 1, 15, 7, 19, 36, 10, 10, 17, 28...
## $ Outlet_Age <dbl> 14, 4, 14, 15, 26, 4, 26, 28, 11, 6,...
## $ PK <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...
new_combi <- rbind(new_train, new_test)
new_combi$Item_Fat_Content <- ifelse(combi$Item_Fat_Content == "Regular",1,0)
library(dummies)
new_combi <- dummy.data.frame(new_combi, names = c('Outlet_Size','Outlet_Location_Type'
,'Outlet_Type','Item_Identifier_Str2'),sep = '_')
glimpse(new_combi)
## Observations: 14,204
## Variables: 27
## $ Item_Identifier <fctr> FDA15, DRC01, FDN15, FDX07, N...
## $ Item_Weight <dbl> 9.300, 5.920, 17.500, 19.200, ...
## $ Item_Fat_Content <dbl> 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, ...
## $ Item_Visibility <dbl> 0.016047301, 0.019278216, 0.01...
## $ Item_Type <fctr> Dairy, Soft Drinks, Meat, Fru...
## $ Item_MRP <dbl> 249.8092, 48.2692, 141.6180, 1...
## $ Outlet_Identifier <fctr> OUT049, OUT018, OUT049, OUT01...
## $ Outlet_Establishment_Year <int> 1999, 2009, 1999, 1998, 1987, ...
## $ Outlet_Size_ <int> 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, ...
## $ Outlet_Size_High <int> 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, ...
## $ Outlet_Size_Medium <int> 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, ...
## $ Outlet_Size_Small <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ `Outlet_Location_Type_Tier 1` <int> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ `Outlet_Location_Type_Tier 2` <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...
## $ `Outlet_Location_Type_Tier 3` <int> 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, ...
## $ `Outlet_Type_Grocery Store` <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...
## $ `Outlet_Type_Supermarket Type1` <int> 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, ...
## $ `Outlet_Type_Supermarket Type2` <int> 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ `Outlet_Type_Supermarket Type3` <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...
## $ Item_Outlet_Sales <dbl> 3735.1380, 443.4228, 2097.2700...
## $ Item_Identifier_Str3 <chr> "FDA", "DRC", "FDN", "FDX", "N...
## $ Item_Identifier_Str2_DR <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ Item_Identifier_Str2_FD <int> 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, ...
## $ Item_Identifier_Str2_NC <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
## $ Item_Identifier_Num <dbl> 15, 1, 15, 7, 19, 36, 10, 10, ...
## $ Outlet_Age <dbl> 14, 4, 14, 15, 26, 4, 26, 28, ...
## $ PK <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,...
new_combi <- select(new_combi, -c(Item_Identifier, Outlet_Identifier, Item_Fat_Content, Outlet_Establishment_Year, Item_Type, Item_Identifier_Str3, PK))
str(new_combi)
## 'data.frame': 14204 obs. of 20 variables:
## $ Item_Weight : num 9.3 5.92 17.5 19.2 8.93 ...
## $ Item_Visibility : num 0.016 0.0193 0.0168 0.054 0.054 ...
## $ Item_MRP : num 249.8 48.3 141.6 182.1 53.9 ...
## $ Outlet_Size_ : int 0 0 0 1 0 0 0 0 1 1 ...
## $ Outlet_Size_High : int 0 0 0 0 1 0 1 0 0 0 ...
## $ Outlet_Size_Medium : int 1 1 1 0 0 1 0 1 0 0 ...
## $ Outlet_Size_Small : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Outlet_Location_Type_Tier 1 : int 1 0 1 0 0 0 0 0 0 0 ...
## $ Outlet_Location_Type_Tier 2 : int 0 0 0 0 0 0 0 0 1 1 ...
## $ Outlet_Location_Type_Tier 3 : int 0 1 0 1 1 1 1 1 0 0 ...
## $ Outlet_Type_Grocery Store : int 0 0 0 1 0 0 0 0 0 0 ...
## $ Outlet_Type_Supermarket Type1: int 1 0 1 0 1 0 1 0 1 1 ...
## $ Outlet_Type_Supermarket Type2: int 0 1 0 0 0 1 0 0 0 0 ...
## $ Outlet_Type_Supermarket Type3: int 0 0 0 0 0 0 0 1 0 0 ...
## $ Item_Outlet_Sales : num 3735 443 2097 732 995 ...
## $ Item_Identifier_Str2_DR : int 0 1 0 0 0 0 0 0 0 0 ...
## $ Item_Identifier_Str2_FD : int 1 0 1 1 0 1 1 1 1 1 ...
## $ Item_Identifier_Str2_NC : int 0 0 0 0 1 0 0 0 0 0 ...
## $ Item_Identifier_Num : num 15 1 15 7 19 36 10 10 17 28 ...
## $ Outlet_Age : num 14 4 14 15 26 4 26 28 11 6 ...
## - attr(*, "dummies")=List of 4
## ..$ Outlet_Size : int 9 10 11 12
## ..$ Outlet_Location_Type: int 13 14 15
## ..$ Outlet_Type : int 16 17 18 19
## ..$ Item_Identifier_Str2: int 22 23 24
pred_train <- new_combi %>%
filter(Item_Outlet_Sales != -999)
pred_test <- new_combi %>%
filter(Item_Outlet_Sales == -999)
#dim(pred_train)
#dim(pred_test)
linear_model <- lm(Item_Outlet_Sales ~ ., data = pred_train)
summary(linear_model)
##
## Call:
## lm(formula = Item_Outlet_Sales ~ ., data = pred_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4277.2 -681.4 -85.4 568.6 7897.8
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2348.3559 326.2603 7.198 6.64e-13 ***
## Item_Weight -0.9462 2.8983 -0.326 0.74407
## Item_Visibility -221.0750 262.8463 -0.841 0.40033
## Item_MRP 15.5755 0.1965 79.257 < 2e-16 ***
## Outlet_Size_ -142.6728 45.6008 -3.129 0.00176 **
## Outlet_Size_High 694.3313 254.9895 2.723 0.00648 **
## Outlet_Size_Medium 29.5461 56.3255 0.525 0.59990
## Outlet_Size_Small NA NA NA NA
## `Outlet_Location_Type_Tier 1` 320.5095 154.5967 2.073 0.03818 *
## `Outlet_Location_Type_Tier 2` 223.3423 100.3885 2.225 0.02612 *
## `Outlet_Location_Type_Tier 3` NA NA NA NA
## `Outlet_Type_Grocery Store` -3630.9270 177.7976 -20.422 < 2e-16 ***
## `Outlet_Type_Supermarket Type1` -2152.1612 294.1229 -7.317 2.76e-13 ***
## `Outlet_Type_Supermarket Type2` -2549.1114 256.6856 -9.931 < 2e-16 ***
## `Outlet_Type_Supermarket Type3` NA NA NA NA
## Item_Identifier_Str2_DR 14.9593 49.0122 0.305 0.76021
## Item_Identifier_Str2_FD 43.8687 31.8015 1.379 0.16779
## Item_Identifier_Str2_NC NA NA NA NA
## Item_Identifier_Num 2.9993 0.7092 4.229 2.37e-05 ***
## Outlet_Age -34.3064 10.4716 -3.276 0.00106 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1127 on 8507 degrees of freedom
## Multiple R-squared: 0.5643, Adjusted R-squared: 0.5635
## F-statistic: 734.5 on 15 and 8507 DF, p-value: < 2.2e-16
# cor(pred_train)
library("corrplot")
library(RColorBrewer)
M<-cor(pred_train)
corrplot(M, diag = FALSE, order = "FPC",
tl.pos = "td", tl.cex = 0.5, method = "circle", type = "upper")
#create a new variable in test file
test$Item_Outlet_Sales <- -999
#combine train and test data
combi <- rbind(train, test)
#impute missing value in Item_Weight
combi$Item_Weight[is.na(combi$Item_Weight)] <- median(combi$Item_Weight, na.rm = TRUE)
#impute 0 in item_visibility
combi$Item_Visibility <- ifelse(combi$Item_Visibility == 0, median(combi$Item_Visibility), combi$Item_Visibility)
#rename level in Outlet_Size
levels(combi$Outlet_Size)[1] <- "Other"
#rename levels of Item_Fat_Content
library(plyr)
combi$Item_Fat_Content <- revalue(combi$Item_Fat_Content,c("LF" = "Low Fat", "reg" = "Regular"))
combi$Item_Fat_Content <- revalue(combi$Item_Fat_Content, c("low fat" = "Low Fat"))
#create a new column 2013 - Year
combi$Year <- 2013 - combi$Outlet_Establishment_Year
#drop variables not required in modeling
library(dplyr)
combi <- select(combi, -c(Item_Identifier, Outlet_Identifier, Outlet_Establishment_Year))
#divide data set
new_train <- combi[1:nrow(train),]
new_test <- combi[-(1:nrow(train)),]
#linear regression
linear_model_simple <- lm(Item_Outlet_Sales ~ ., data = new_train)
summary(linear_model_simple)
##
## Call:
## lm(formula = Item_Outlet_Sales ~ ., data = new_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4336.2 -680.8 -89.8 568.3 7946.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -977.43100 304.34139 -3.212 0.001325 **
## Item_Weight -0.51139 2.91490 -0.175 0.860738
## Item_Fat_ContentRegular 40.50509 28.23186 1.435 0.151401
## Item_Visibility -255.77401 263.78199 -0.970 0.332253
## Item_TypeBreads 5.25222 84.04714 0.062 0.950173
## Item_TypeBreakfast 6.26997 116.59546 0.054 0.957115
## Item_TypeCanned 25.28447 62.76864 0.403 0.687091
## Item_TypeDairy -41.05177 62.22667 -0.660 0.509456
## Item_TypeFrozen Foods -28.00838 58.86986 -0.476 0.634252
## Item_TypeFruits and Vegetables 29.84893 54.95872 0.543 0.587065
## Item_TypeHard Drinks -0.07955 90.18745 -0.001 0.999296
## Item_TypeHealth and Hygiene -10.09434 68.02370 -0.148 0.882035
## Item_TypeHousehold -39.12266 59.92886 -0.653 0.513891
## Item_TypeMeat -0.35463 70.66270 -0.005 0.995996
## Item_TypeOthers -21.27266 98.62482 -0.216 0.829232
## Item_TypeSeafood 184.55373 148.00848 1.247 0.212464
## Item_TypeSnack Foods -11.48660 55.25066 -0.208 0.835312
## Item_TypeSoft Drinks -27.46544 70.16973 -0.391 0.695501
## Item_TypeStarchy Foods 21.86315 103.04028 0.212 0.831971
## Item_MRP 15.56477 0.19769 78.733 < 2e-16 ***
## Outlet_SizeHigh 849.13529 256.29910 3.313 0.000927 ***
## Outlet_SizeMedium 170.93952 71.08120 2.405 0.016200 *
## Outlet_SizeSmall 144.05363 45.67140 3.154 0.001615 **
## Outlet_Location_TypeTier 2 -100.36636 90.23496 -1.112 0.266050
## Outlet_Location_TypeTier 3 -326.31974 154.89294 -2.107 0.035169 *
## Outlet_TypeSupermarket Type1 1473.06659 140.73922 10.467 < 2e-16 ***
## Outlet_TypeSupermarket Type2 1076.77568 136.08116 7.913 2.83e-15 ***
## Outlet_TypeSupermarket Type3 3639.29114 178.13923 20.429 < 2e-16 ***
## Year -34.82155 10.49271 -3.319 0.000908 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1129 on 8494 degrees of freedom
## Multiple R-squared: 0.5637, Adjusted R-squared: 0.5623
## F-statistic: 392 on 28 and 8494 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(linear_model_simple)
linear_model_log <- lm(log(Item_Outlet_Sales) ~ ., data = new_train)
summary(linear_model)
##
## Call:
## lm(formula = Item_Outlet_Sales ~ ., data = pred_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4277.2 -681.4 -85.4 568.6 7897.8
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2348.3559 326.2603 7.198 6.64e-13 ***
## Item_Weight -0.9462 2.8983 -0.326 0.74407
## Item_Visibility -221.0750 262.8463 -0.841 0.40033
## Item_MRP 15.5755 0.1965 79.257 < 2e-16 ***
## Outlet_Size_ -142.6728 45.6008 -3.129 0.00176 **
## Outlet_Size_High 694.3313 254.9895 2.723 0.00648 **
## Outlet_Size_Medium 29.5461 56.3255 0.525 0.59990
## Outlet_Size_Small NA NA NA NA
## `Outlet_Location_Type_Tier 1` 320.5095 154.5967 2.073 0.03818 *
## `Outlet_Location_Type_Tier 2` 223.3423 100.3885 2.225 0.02612 *
## `Outlet_Location_Type_Tier 3` NA NA NA NA
## `Outlet_Type_Grocery Store` -3630.9270 177.7976 -20.422 < 2e-16 ***
## `Outlet_Type_Supermarket Type1` -2152.1612 294.1229 -7.317 2.76e-13 ***
## `Outlet_Type_Supermarket Type2` -2549.1114 256.6856 -9.931 < 2e-16 ***
## `Outlet_Type_Supermarket Type3` NA NA NA NA
## Item_Identifier_Str2_DR 14.9593 49.0122 0.305 0.76021
## Item_Identifier_Str2_FD 43.8687 31.8015 1.379 0.16779
## Item_Identifier_Str2_NC NA NA NA NA
## Item_Identifier_Num 2.9993 0.7092 4.229 2.37e-05 ***
## Outlet_Age -34.3064 10.4716 -3.276 0.00106 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1127 on 8507 degrees of freedom
## Multiple R-squared: 0.5643, Adjusted R-squared: 0.5635
## F-statistic: 734.5 on 15 and 8507 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(linear_model_log)
library(rpart)
library(e1071)
library(rpart.plot)
library(caret)
#setting the tree control parameters
fitControl <- trainControl(method = "cv", number = 5)
cartGrid <- expand.grid(.cp=(1:50)*0.01)
#decision tree
tree_model <- train(Item_Outlet_Sales ~ ., data = new_train, method = "rpart", trControl = fitControl, tuneGrid = cartGrid)
#print(tree_model)
main_tree <- rpart(Item_Outlet_Sales ~ ., data = new_train, control = rpart.control(cp=0.01))
prp(main_tree)
library(Metrics)
rmse(new_train$Item_Outlet_Sales, linear_model$fitted.values)
## [1] 1126.362
rmse(new_train$Item_Outlet_Sales, linear_model_simple$fitted.values)
## [1] 1127.081
rmse(new_train$Item_Outlet_Sales, exp(linear_model_log$fitted.values))
## [1] 1140.004
pre_score <- predict(main_tree, type = "vector")
rmse(new_train$Item_Outlet_Sales, pre_score)
## [1] 1102.774
statistical-modeling-in-r-part-1
lm_model_1 <- lm(Item_Outlet_Sales ~ ., data = new_train)
rpart_model_1 <- rpart(Item_Outlet_Sales ~ ., data = new_train)
lm_model_1_output <- predict(lm_model_1, newdata = new_train)
rpart_model_1_output <- predict(rpart_model_1, newdata = new_train)
# Find the case by case differences
lm_model_1_diff <- with(new_train, Item_Outlet_Sales - lm_model_1_output)
rpart_model_1_diff <- with(new_train, Item_Outlet_Sales - rpart_model_1_output)
# Calculate the mean square errors
mean(lm_model_1_diff)
## [1] 6.105024e-11
mean(rpart_model_1_diff)
## [1] 1.151509e-12
(formula <- as.formula(Item_Outlet_Sales ~ Item_Weight + Item_Fat_Content + Item_Visibility + Item_Type + Item_MRP + Outlet_Size +
Outlet_Location_Type + Outlet_Type + Year))
## Item_Outlet_Sales ~ Item_Weight + Item_Fat_Content + Item_Visibility +
## Item_Type + Item_MRP + Outlet_Size + Outlet_Location_Type +
## Outlet_Type + Year
new_train$training_cases <- rnorm(nrow(new_train)) > 0
# summary(new_train$training_cases)
lm_model_1 <- lm(formula, data = subset(new_train, training_cases))
rpart_model_1 <- rpart(formula, data = subset(new_train, training_cases))
lm_preds <- predict(lm_model_1, newdata = subset(new_train, !training_cases))
rpart_preds <- predict(rpart_model_1, newdata = subset(new_train, !training_cases))
# Find the mean of case by case differences
with(subset(new_train, !training_cases), mean((Item_Outlet_Sales - lm_preds) ^ 2))
## [1] 1294078
with(subset(new_train, !training_cases), mean((Item_Outlet_Sales - rpart_preds) ^ 2))
## [1] 1250418