library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
library(corrplot)
library(dummies)
setwd("/Users/yetkineser/Desktop/mef R/data")
train <- read.csv('bigMartTrain.csv')
test <- read.csv('bigMartTest.csv')
glimpse(train)
## Observations: 8,523
## Variables: 12
## $ Item_Identifier <fctr> FDA15, DRC01, FDN15, FDX07, NCD19, ...
## $ Item_Weight <dbl> 9.300, 5.920, 17.500, 19.200, 8.930,...
## $ Item_Fat_Content <fctr> Low Fat, Regular, Low Fat, Regular,...
## $ Item_Visibility <dbl> 0.016047301, 0.019278216, 0.01676007...
## $ Item_Type <fctr> Dairy, Soft Drinks, Meat, Fruits an...
## $ Item_MRP <dbl> 249.8092, 48.2692, 141.6180, 182.095...
## $ Outlet_Identifier <fctr> OUT049, OUT018, OUT049, OUT010, OUT...
## $ Outlet_Establishment_Year <int> 1999, 2009, 1999, 1998, 1987, 2009, ...
## $ Outlet_Size <fctr> Medium, Medium, Medium, , High, Med...
## $ Outlet_Location_Type <fctr> Tier 1, Tier 3, Tier 1, Tier 3, Tie...
## $ Outlet_Type <fctr> Supermarket Type1, Supermarket Type...
## $ Item_Outlet_Sales <dbl> 3735.1380, 443.4228, 2097.2700, 732....
train %>%
summarise(n_distinct(Item_Identifier))
## n_distinct(Item_Identifier)
## 1 1559
The Item_Identifier column has 1559 unique values, so we will be examining 1559 distinct items across the stores.
train %>%
group_by(Item_Fat_Content) %>%
summarise(Count = n(),Perc=round(n()/nrow(.)*100,2)) %>%
arrange(desc(Count))
## # A tibble: 5 x 3
## Item_Fat_Content Count Perc
## <fctr> <int> <dbl>
## 1 Low Fat 5089 59.71
## 2 Regular 2889 33.90
## 3 LF 316 3.71
## 4 reg 117 1.37
## 5 low fat 112 1.31
The distribution of Item_Fat_Content is summarised above. Low-fat items account for the largest share, so most products on the shelves are low fat. The string values are inconsistent: “Low Fat”, “low fat” and “LF” all denote the same category, as do “Regular” and “reg”, so this column must be standardised in the cleaning step.
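As an illustration only (a sketch; the actual cleanup below uses nested str_replace calls on the combined data), the variant labels could also be collapsed in a single dplyr::recode call:
train %>%
  mutate(Item_Fat_Content = recode(as.character(Item_Fat_Content),
                                   "LF" = "Low Fat",
                                   "low fat" = "Low Fat",
                                   "reg" = "Regular")) %>%
  count(Item_Fat_Content)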
train%>%
group_by(Item_Type) %>%
summarise(Count = n(), Perc = round(n() / nrow(.) * 100, 2)) %>%
arrange(desc(Count))
## # A tibble: 16 x 3
## Item_Type Count Perc
## <fctr> <int> <dbl>
## 1 Fruits and Vegetables 1232 14.46
## 2 Snack Foods 1200 14.08
## 3 Household 910 10.68
## 4 Frozen Foods 856 10.04
## 5 Dairy 682 8.00
## 6 Canned 649 7.61
## 7 Baking Goods 648 7.60
## 8 Health and Hygiene 520 6.10
## 9 Soft Drinks 445 5.22
## 10 Meat 425 4.99
## 11 Breads 251 2.94
## 12 Hard Drinks 214 2.51
## 13 Others 169 1.98
## 14 Starchy Foods 148 1.74
## 15 Breakfast 110 1.29
## 16 Seafood 64 0.75
The train dataset has 16 different Item_Type values, and “Fruits and Vegetables” is the most common category on the shelves.
train %>%
group_by(Outlet_Identifier) %>%
summarise(Count = n(), Perc = round(n() / nrow(.) * 100, 2)) %>%
arrange(desc(Count))
## # A tibble: 10 x 3
## Outlet_Identifier Count Perc
## <fctr> <int> <dbl>
## 1 OUT027 935 10.97
## 2 OUT013 932 10.94
## 3 OUT035 930 10.91
## 4 OUT046 930 10.91
## 5 OUT049 930 10.91
## 6 OUT045 929 10.90
## 7 OUT018 928 10.89
## 8 OUT017 926 10.86
## 9 OUT010 555 6.51
## 10 OUT019 528 6.20
The Outlet_Identifier column shows that the train dataset contains item information from 10 different stores.
train%>%
group_by(Outlet_Size) %>%
summarise(Count = n(),Perc = round(n() / nrow(.) * 100, 2)) %>%
arrange(desc(Count))
## # A tibble: 4 x 3
## Outlet_Size Count Perc
## <fctr> <int> <dbl>
## 1 Medium 2793 32.77
## 2 2410 28.28
## 3 Small 2388 28.02
## 4 High 932 10.94
As we can see, Outlet_Size has a sizeable number of blank (missing) values, so this column also needs attention in the cleaning step.
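The blanks are empty strings rather than NA values, so is.na() will not detect them. A quick check (a sketch; the blank level is later renamed to “Other”):
train %>%
  summarise(blank_outlet_size = sum(Outlet_Size == ""),
            na_outlet_size = sum(is.na(Outlet_Size)))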
train%>%
group_by(Outlet_Location_Type) %>%
summarise(Count = n(),Perc = round(n()/nrow(.) * 100, 2)) %>%
arrange(desc(Count))
## # A tibble: 3 x 3
## Outlet_Location_Type Count Perc
## <fctr> <int> <dbl>
## 1 Tier 3 3350 39.31
## 2 Tier 2 2785 32.68
## 3 Tier 1 2388 28.02
The Outlet_Location_Type column is fairly evenly distributed across the three tiers.
train%>%
group_by(Outlet_Type)%>%
summarise(Count=n(),Perc=round(n()/nrow(.)*100,2))%>%
arrange(desc(Count))
## # A tibble: 4 x 3
## Outlet_Type Count Perc
## <fctr> <int> <dbl>
## 1 Supermarket Type1 5577 65.43
## 2 Grocery Store 1083 12.71
## 3 Supermarket Type3 935 10.97
## 4 Supermarket Type2 928 10.89
This result shows that “Supermarket Type1” outlets account for the largest share of records (65.43%), so most of the data comes from that outlet type.
train%>%
summarise(is_NULL=sum(is.na(Item_Weight)==1),
is_NOT_NULL=sum(!is.na(Item_Weight)==1)
)
## is_NULL is_NOT_NULL
## 1 1463 7060
train%>%
filter(!is.na(Item_Weight))%>%
summarise(
Max=max(Item_Weight),
Min=min(Item_Weight),
Mean=mean(Item_Weight),
Median=median(Item_Weight),
QUA1=quantile(Item_Weight,1/4),
QUA3=quantile(Item_Weight,3/4),
IQR=IQR(Item_Weight)
)
## Max Min Mean Median QUA1 QUA3 IQR
## 1 21.35 4.555 12.85765 12.6 8.77375 16.85 8.07625
The result shows that the Item_Weight column has 1463 missing values, so it also needs to be handled in the cleaning step. Excluding the missing values, the heaviest item weighs 21.35 and the lightest 4.555.
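To see at a glance which columns contain NA values, the counts can be computed for the whole data frame (a sketch using base R):
sapply(train, function(x) sum(is.na(x)))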
train%>%
summarise(is_NULL=sum(is.na(Item_Visibility)==1),
is_NOT_NULL=sum(!is.na(Item_Visibility)==1)
)
## is_NULL is_NOT_NULL
## 1 0 8523
train%>%
filter(!is.na(Item_Visibility))%>%
summarise(
Max=max(Item_Visibility),
Min=min(Item_Visibility),
Mean=mean(Item_Visibility),
Median=median(Item_Visibility),
QUA1=quantile(Item_Visibility,1/4),
QUA3=quantile(Item_Visibility,3/4),
IQR=IQR(Item_Visibility)
)
## Max Min Mean Median QUA1 QUA3 IQR
## 1 0.3283909 0 0.06613203 0.05393093 0.02698948 0.09458529 0.06759582
There are no missing values in the Item_Visibility column, but the minimum of 0 shows that some items have a recorded visibility of zero, which is implausible; these zeros are replaced with the median visibility during cleaning.
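Because these zeros are not NA, they need a separate count (a sketch):
train %>%
  summarise(zero_visibility = sum(Item_Visibility == 0),
            zero_perc = round(mean(Item_Visibility == 0) * 100, 2))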
train%>%
summarise(is_NULL=sum(is.na(Item_MRP)==1),
is_NOT_NULL=sum(!is.na(Item_MRP)==1)
)
## is_NULL is_NOT_NULL
## 1 0 8523
train%>%
filter(!is.na(Item_MRP))%>%
summarise(
Max=max(Item_MRP),
Min=min(Item_MRP),
Mean=mean(Item_MRP),
Median=median(Item_MRP),
QUA1=quantile(Item_MRP,1/4),
QUA3=quantile(Item_MRP,3/4),
IQR=IQR(Item_MRP)
)
## Max Min Mean Median QUA1 QUA3 IQR
## 1 266.8884 31.29 140.9928 143.0128 93.8265 185.6437 91.8172
Good news: there are no missing or zero values in the Item_MRP column. The maximum price is 266.8884 and the minimum is 31.29.
train%>%
summarise(is_NULL=sum(is.na(Outlet_Establishment_Year)==1),
is_NOT_NULL=sum(!is.na(Outlet_Establishment_Year)==1)
)
## is_NULL is_NOT_NULL
## 1 0 8523
train%>%
filter(!is.na(Outlet_Establishment_Year))%>%
summarise(
Max=max(Outlet_Establishment_Year),
Min=min(Outlet_Establishment_Year),
Mean=mean(Outlet_Establishment_Year),
Median=median(Outlet_Establishment_Year),
QUA1=quantile(Outlet_Establishment_Year,1/4),
QUA3=quantile(Outlet_Establishment_Year,3/4),
IQR=IQR(Outlet_Establishment_Year)
)
## Max Min Mean Median QUA1 QUA3 IQR
## 1 2009 1985 1997.832 1999 1987 2004 17
The oldest store opened in 1985 and the newest in 2009. Looking at the mean and median, the stores in the dataset are generally old, more than 17-18 years by 2013, which is used later as the reference year for Outlet_Age.
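The establishment years can also be tabulated directly to see how the outlets are spread over time (a sketch):
table(train$Outlet_Establishment_Year)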
train%>%
summarise(is_NULL=sum(is.na(Item_Outlet_Sales)==1),
is_NOT_NULL=sum(!is.na(Item_Outlet_Sales)==1)
)
## is_NULL is_NOT_NULL
## 1 0 8523
train%>%
filter(!is.na(Item_Outlet_Sales))%>%
summarise(
Max=max(Item_Outlet_Sales),
Min=min(Item_Outlet_Sales),
Mean=mean(Item_Outlet_Sales),
Median=median(Item_Outlet_Sales),
QUA1=quantile(Item_Outlet_Sales,1/4),
QUA3=quantile(Item_Outlet_Sales,3/4),
IQR=IQR(Item_Outlet_Sales)
)
## Max Min Mean Median QUA1 QUA3 IQR
## 1 13086.96 33.29 2181.289 1794.331 834.2474 3101.296 2267.049
The Item_Outlet_Sales column has no missing values, but the summary (median 1794 versus a maximum of 13086) shows the distribution is strongly right-skewed, which motivates the sqrt and log transformations used below.
summary(train)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## FDG33 : 10 Min. : 4.555 LF : 316 Min. :0.00000
## FDW13 : 10 1st Qu.: 8.774 Low Fat:5089 1st Qu.:0.02699
## DRE49 : 9 Median :12.600 Regular:2889 Median :0.05393
## DRN47 : 9 Mean :12.858 low fat: 112 Mean :0.06613
## FDD38 : 9 3rd Qu.:16.850 reg : 117 3rd Qu.:0.09459
## FDF52 : 9 Max. :21.350 Max. :0.32839
## (Other):8467 NA's :1463
## Item_Type Item_MRP Outlet_Identifier
## Fruits and Vegetables:1232 Min. : 31.29 OUT027 : 935
## Snack Foods :1200 1st Qu.: 93.83 OUT013 : 932
## Household : 910 Median :143.01 OUT035 : 930
## Frozen Foods : 856 Mean :140.99 OUT046 : 930
## Dairy : 682 3rd Qu.:185.64 OUT049 : 930
## Canned : 649 Max. :266.89 OUT045 : 929
## (Other) :2994 (Other):2937
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## Min. :1985 :2410 Tier 1:2388
## 1st Qu.:1987 High : 932 Tier 2:2785
## Median :1999 Medium:2793 Tier 3:3350
## Mean :1998 Small :2388
## 3rd Qu.:2004
## Max. :2009
##
## Outlet_Type Item_Outlet_Sales
## Grocery Store :1083 Min. : 33.29
## Supermarket Type1:5577 1st Qu.: 834.25
## Supermarket Type2: 928 Median : 1794.33
## Supermarket Type3: 935 Mean : 2181.29
## 3rd Qu.: 3101.30
## Max. :13086.97
##
test$Item_Outlet_Sales <- -999
combi <- rbind(train, test)
combi$Item_Weight[is.na(combi$Item_Weight)] <- median(combi$Item_Weight, na.rm = TRUE)
table(is.na(combi$Item_Weight))
##
## FALSE
## 14204
combi$Item_Visibility <- ifelse(combi$Item_Visibility == 0,
median(combi$Item_Visibility), combi$Item_Visibility)
combi$Item_Fat_Content <- str_replace(
str_replace(
str_replace(combi$Item_Fat_Content,"LF","Low Fat")
,"reg","Regular"),"low fat","Low Fat")
table(combi$Item_Fat_Content)
##
## Low Fat Regular
## 9185 5019
# Derive new features from the first two and three letters of Item_Identifier.
combi <-
combi %>%
mutate(Item_Identifier_Str3 = substr(Item_Identifier,1,3), # First three letters of Item_Identifier.
Item_Identifier_Str2 = substr(Item_Identifier,1,2), # First two letters of Item_Identifier.
Item_Identifier_Num=as.numeric(substr(Item_Identifier,4,6)), # Number part of Item_Identifier column.
Outlet_Age=2013-Outlet_Establishment_Year, #Outlet Age
PK=row_number())
#table(combi$Item_Identifier_Str3)
table(combi$Item_Identifier_Str2)
##
## DR FD NC
## 1317 10201 2686
#combi %>% summarise(n_distinct(Item_Identifier_Str3))
# Split the combined data back into train and test sets for further examination.
new_train <- combi %>%
filter(Item_Outlet_Sales != -999)
new_test <- combi %>%
filter(Item_Outlet_Sales == -999)
#dim(new_train)
#dim(new_test)
# Looking at new Item_Fat_Content column.
qplot(x=Item_Fat_Content,data=new_train) +
geom_bar(fill="tomato") +
theme_minimal()
# Looking at Item_Type, faceted by Item_Identifier_Str2.
qplot(x=Item_Type,data=new_train)+
geom_bar(fill="tomato")+
theme(axis.text = element_text(color="tomato"))+
coord_flip() +
theme_minimal() +
facet_wrap(~Item_Identifier_Str2,nrow=1)
# Looking at Item_Identifier_Str2, faceted by Outlet_Identifier.
qplot(x = Item_Identifier_Str2, data = new_train) +
geom_bar(fill = "tomato") +
theme(axis.text = element_text(angle = 0)) +
coord_flip() +
theme_minimal() +
facet_wrap(~Outlet_Identifier, nrow = 2)
library(gridExtra)
# Looking at Item_Outlet_Sales
p0 <- qplot(x = Item_Outlet_Sales, data = new_train, binwidth = 250,fill=I("lightblue")) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
theme_minimal() +
scale_x_continuous(limits = c(0, 10000), breaks = seq(0, 10000, 1000))
# The distribution is closer to normal after sqrt and log10 transformations.
p1 <- qplot(x = sqrt(Item_Outlet_Sales), data = new_train, binwidth = 1,
ylab = "Count Of Sales",
xlab = "SQRT of Outlet Sales",fill=I("tomato")) +
theme(axis.text.x = element_text(angle = 90),
axis.text.y = element_text(angle = 30)) +
theme_minimal() +
scale_x_continuous(limits = c(0,100), breaks = seq(0,70,15)) +
scale_y_continuous(limits = c(0,200), breaks = seq(0,200,100))
p2 <- qplot(x = log10(Item_Outlet_Sales+1), data = new_train, binwidth = 0.01,
ylab = "Count Of Sales",
xlab = "LOG10 of Outlet Sales",fill=I("lightblue")) +
theme(axis.text.x = element_text(angle = 90),
axis.text.y = element_text(angle = 30)) +
theme_minimal() +
scale_x_continuous(limits = c(1.5,4.5), breaks = seq(1.5, 4.5, 0.5)) +
scale_y_continuous(limits = c(0,200), breaks = seq(0, 200, 100))
grid.arrange(p0, p1, p2, ncol=1)
# "Sales Distribution trough the Item Fat Content"
p0 <- qplot(x = sqrt(Item_Outlet_Sales), data = new_train, binwidth = 1,
ylab = "",
xlab = "SQRT of Outlet Sales",
fill=I("tomato")) +
theme(axis.text.x = element_text(angle = 90),
axis.text.y = element_text(angle = 30)) +
scale_x_continuous(limits = c(0,100), breaks = seq(0,120,15)) +
scale_y_continuous(limits = c(0,200), breaks = seq(0,200,100)) +
theme_minimal() +
facet_wrap(~Item_Fat_Content)
#boxplot
p1 <- qplot(x = Item_Fat_Content, y = sqrt(Item_Outlet_Sales),
ylab = "SQRT of Outlet Sales",
data = new_train,
geom = "boxplot",
fill=I("tomato")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 0),
axis.text.y = element_text(angle = 30))
grid.arrange(p0, p1, ncol=1)
# Sales distribution (sqrt scale) by Item_Type
p0 <- qplot(x = sqrt(Item_Outlet_Sales), data = new_train, binwidth = 1,
ylab = "Sales Distribution",
xlab = "SQRT of Outlet Sales",
fill=I("tomato")) +
theme(axis.text.x = element_text(angle = 90),
axis.text.y = element_text(angle = 30)) +
theme_minimal() +
scale_x_continuous(limits = c(0,100), breaks = seq(0,120,15)) +
scale_y_continuous(limits = c(0,40), breaks = seq(0,40,10)) +
facet_wrap(~Item_Type)
p1 <- qplot(x = Item_Type, y = sqrt(Item_Outlet_Sales),
ylab = "SQRT of Outlet Sales",
data = new_train,
geom = "boxplot",
fill=I("tomato")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90),
axis.text.y = element_text(angle = 30))
p0
p1
# Sales distribution (sqrt scale) by Outlet_Identifier
p0 <- qplot(x = sqrt(Item_Outlet_Sales), data = new_train, binwidth = 1,
ylab = "Sales Distribution",
xlab = "SQRT of Outlet Sales",
fill=I("tomato")) +
theme(axis.text.x = element_text(angle = 90),
axis.text.y = element_text(angle = 30)) +
theme_minimal() +
scale_x_continuous(limits = c(0,100), breaks = seq(0,120,15)) +
scale_y_continuous(limits = c(0,40), breaks = seq(0,40,10)) +
facet_wrap(~Outlet_Identifier,2)
p1 <- qplot(x = Outlet_Identifier, y = sqrt(Item_Outlet_Sales),
ylab = "SQRT of Outlet Sales",
data = new_train,
geom = "boxplot",
fill=I("tomato")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90),
axis.text.y = element_text(angle = 30))
p0
p1
# Sales distribution (sqrt scale) by Outlet_Size
p0 <- qplot(x = sqrt(Item_Outlet_Sales), data = new_train, binwidth = 0.5,
ylab = "Sales Distribution",
xlab = "SQRT of Outlet Sales",
fill=I("tomato")) +
theme(axis.text.x = element_text(angle = 90),
axis.text.y = element_text(angle = 30)) +
theme_minimal() +
scale_x_continuous(limits = c(0,100), breaks = seq(0,120,15)) +
scale_y_continuous(limits = c(0,40), breaks = seq(0,40,10)) +
facet_wrap(~Outlet_Size)
p1 <- qplot(x = Outlet_Size, y = sqrt(Item_Outlet_Sales),
ylab = "SQRT of Outlet Sales",
data = new_train,
geom = "boxplot",
fill=I("tomato")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90),
axis.text.y = element_text(angle = 30))
p2 <- qplot(x = Item_Outlet_Sales, data = new_train, binwidth = 0.02,
ylab = "Count Of Sales",
xlab = "Log10 of Outlet Sales",
geom = "freqpoly",
color = Outlet_Size) +
theme(axis.text.x = element_text(angle = 90, color = "tomato"),
axis.text.y = element_text(angle = 30, color = "tomato")) +
theme_minimal() +
scale_x_continuous(limits = c(1.5,4.5), breaks = seq(1.5,4.5,0.5)) +
scale_y_continuous(limits = c(0,100), breaks = seq(0,100,10)) +
scale_x_log10()
grid.arrange(p0, p1, ncol=1)
p2
# The relationship between Outlet_Location_Type and Item_Outlet_Sales (Berkay)
p0 <- qplot(x = sqrt(Item_Outlet_Sales), data = new_train, binwidth = 0.5,
ylab = "Sales Distribution",
xlab = "SQRT of Outlet Sales",
fill=I("tomato")) +
theme(axis.text.x = element_text(angle = 90),
axis.text.y = element_text(angle = 30)) +
theme_minimal() +
scale_x_continuous(limits = c(0,100), breaks = seq(0,120,15)) +
scale_y_continuous(limits = c(0,40), breaks = seq(0,40,10)) +
facet_wrap(~Outlet_Location_Type)
p1 <- qplot(x = Outlet_Location_Type, y = sqrt(Item_Outlet_Sales),
ylab = "SQRT of Outlet Sales",
data = new_train,
geom = "boxplot",
fill=I("tomato")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 0),
axis.text.y = element_text(angle = 30))
grid.arrange(p0, p1, ncol=1)
# Sales distribution (sqrt scale) by Outlet_Type
p0 <- qplot(x = sqrt(Item_Outlet_Sales), data = new_train, binwidth = 0.3,
ylab = "Sales Distribution",
xlab = "SQRT of Outlet Sales",
fill=I("tomato")) +
theme(axis.text.x = element_text(angle = 90),
axis.text.y = element_text(angle = 30)) +
theme_minimal() +
scale_x_continuous(limits = c(0,100), breaks = seq(0,120,15)) +
scale_y_continuous(limits = c(0,40), breaks = seq(0,40,10)) +
facet_wrap(~Outlet_Type)
p1 <- qplot(x = Outlet_Type, y = sqrt(Item_Outlet_Sales),
ylab = "SQRT of Outlet Sales",
data = new_train,
geom = "boxplot",
fill=I("tomato")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90),
axis.text.y = element_text(angle = 30))
p0
p1
#grid.arrange(p0, p1, ncol=1)
# Sales distribution (sqrt scale) by Item_Identifier_Str2
p0 <- qplot(x = sqrt(Item_Outlet_Sales), data = new_train, binwidth = 0.1,
ylab = "Sales Distribution",
xlab = "SQRT of Outlet Sales",
fill=I("tomato")) +
theme(axis.text.x = element_text(angle = 90),
axis.text.y = element_text(angle = 30)) +
theme_minimal() +
scale_x_continuous(limits = c(0,100), breaks = seq(0,120,15)) +
scale_y_continuous(limits = c(0,20), breaks = seq(0,20,5)) +
facet_wrap(~Item_Identifier_Str2)
p1 <- qplot(x = Item_Identifier_Str2, y = sqrt(Item_Outlet_Sales),
ylab = "SQRT of Outlet Sales",
data = new_train,
geom = "boxplot",
fill=I("tomato")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90),
axis.text.y = element_text(angle = 30))
grid.arrange(p0, p1, ncol=1)
The combined dataset now looks like this:
new_combi <- rbind(new_train, new_test)
glimpse(new_combi)
## Observations: 14,204
## Variables: 17
## $ Item_Identifier <fctr> FDA15, DRC01, FDN15, FDX07, NCD19, ...
## $ Item_Weight <dbl> 9.300, 5.920, 17.500, 19.200, 8.930,...
## $ Item_Fat_Content <chr> "Low Fat", "Regular", "Low Fat", "Re...
## $ Item_Visibility <dbl> 0.016047301, 0.019278216, 0.01676007...
## $ Item_Type <fctr> Dairy, Soft Drinks, Meat, Fruits an...
## $ Item_MRP <dbl> 249.8092, 48.2692, 141.6180, 182.095...
## $ Outlet_Identifier <fctr> OUT049, OUT018, OUT049, OUT010, OUT...
## $ Outlet_Establishment_Year <int> 1999, 2009, 1999, 1998, 1987, 2009, ...
## $ Outlet_Size <fctr> Medium, Medium, Medium, , High, Med...
## $ Outlet_Location_Type <fctr> Tier 1, Tier 3, Tier 1, Tier 3, Tie...
## $ Outlet_Type <fctr> Supermarket Type1, Supermarket Type...
## $ Item_Outlet_Sales <dbl> 3735.1380, 443.4228, 2097.2700, 732....
## $ Item_Identifier_Str3 <chr> "FDA", "DRC", "FDN", "FDX", "NCD", "...
## $ Item_Identifier_Str2 <chr> "FD", "DR", "FD", "FD", "NC", "FD", ...
## $ Item_Identifier_Num <dbl> 15, 1, 15, 7, 19, 36, 10, 10, 17, 28...
## $ Outlet_Age <dbl> 14, 4, 14, 15, 26, 4, 26, 28, 11, 6,...
## $ PK <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...
new_combi <- rbind(new_train, new_test)
new_combi$Item_Fat_Content <- ifelse(new_combi$Item_Fat_Content == "Regular", 1, 0) # binary encode: Regular = 1, Low Fat = 0
library(dummies)
new_combi <- dummy.data.frame(new_combi, names = c('Outlet_Size','Outlet_Location_Type'
,'Outlet_Type','Item_Identifier_Str2'),sep = '_')
glimpse(new_combi)
## Observations: 14,204
## Variables: 27
## $ Item_Identifier <fctr> FDA15, DRC01, FDN15, FDX07, N...
## $ Item_Weight <dbl> 9.300, 5.920, 17.500, 19.200, ...
## $ Item_Fat_Content <dbl> 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, ...
## $ Item_Visibility <dbl> 0.016047301, 0.019278216, 0.01...
## $ Item_Type <fctr> Dairy, Soft Drinks, Meat, Fru...
## $ Item_MRP <dbl> 249.8092, 48.2692, 141.6180, 1...
## $ Outlet_Identifier <fctr> OUT049, OUT018, OUT049, OUT01...
## $ Outlet_Establishment_Year <int> 1999, 2009, 1999, 1998, 1987, ...
## $ Outlet_Size_ <int> 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, ...
## $ Outlet_Size_High <int> 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, ...
## $ Outlet_Size_Medium <int> 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, ...
## $ Outlet_Size_Small <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ `Outlet_Location_Type_Tier 1` <int> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ `Outlet_Location_Type_Tier 2` <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...
## $ `Outlet_Location_Type_Tier 3` <int> 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, ...
## $ `Outlet_Type_Grocery Store` <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...
## $ `Outlet_Type_Supermarket Type1` <int> 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, ...
## $ `Outlet_Type_Supermarket Type2` <int> 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ `Outlet_Type_Supermarket Type3` <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...
## $ Item_Outlet_Sales <dbl> 3735.1380, 443.4228, 2097.2700...
## $ Item_Identifier_Str3 <chr> "FDA", "DRC", "FDN", "FDX", "N...
## $ Item_Identifier_Str2_DR <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ Item_Identifier_Str2_FD <int> 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, ...
## $ Item_Identifier_Str2_NC <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
## $ Item_Identifier_Num <dbl> 15, 1, 15, 7, 19, 36, 10, 10, ...
## $ Outlet_Age <dbl> 14, 4, 14, 15, 26, 4, 26, 28, ...
## $ PK <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,...
new_combi <- select(new_combi, -c(Item_Identifier, Outlet_Identifier, Item_Fat_Content,Outlet_Establishment_Year, Item_Type, Item_Identifier_Str3, PK))
str(new_combi)
## 'data.frame': 14204 obs. of 20 variables:
## $ Item_Weight : num 9.3 5.92 17.5 19.2 8.93 ...
## $ Item_Visibility : num 0.016 0.0193 0.0168 0.054 0.054 ...
## $ Item_MRP : num 249.8 48.3 141.6 182.1 53.9 ...
## $ Outlet_Size_ : int 0 0 0 1 0 0 0 0 1 1 ...
## $ Outlet_Size_High : int 0 0 0 0 1 0 1 0 0 0 ...
## $ Outlet_Size_Medium : int 1 1 1 0 0 1 0 1 0 0 ...
## $ Outlet_Size_Small : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Outlet_Location_Type_Tier 1 : int 1 0 1 0 0 0 0 0 0 0 ...
## $ Outlet_Location_Type_Tier 2 : int 0 0 0 0 0 0 0 0 1 1 ...
## $ Outlet_Location_Type_Tier 3 : int 0 1 0 1 1 1 1 1 0 0 ...
## $ Outlet_Type_Grocery Store : int 0 0 0 1 0 0 0 0 0 0 ...
## $ Outlet_Type_Supermarket Type1: int 1 0 1 0 1 0 1 0 1 1 ...
## $ Outlet_Type_Supermarket Type2: int 0 1 0 0 0 1 0 0 0 0 ...
## $ Outlet_Type_Supermarket Type3: int 0 0 0 0 0 0 0 1 0 0 ...
## $ Item_Outlet_Sales : num 3735 443 2097 732 995 ...
## $ Item_Identifier_Str2_DR : int 0 1 0 0 0 0 0 0 0 0 ...
## $ Item_Identifier_Str2_FD : int 1 0 1 1 0 1 1 1 1 1 ...
## $ Item_Identifier_Str2_NC : int 0 0 0 0 1 0 0 0 0 0 ...
## $ Item_Identifier_Num : num 15 1 15 7 19 36 10 10 17 28 ...
## $ Outlet_Age : num 14 4 14 15 26 4 26 28 11 6 ...
## - attr(*, "dummies")=List of 4
## ..$ Outlet_Size : int 9 10 11 12
## ..$ Outlet_Location_Type: int 13 14 15
## ..$ Outlet_Type : int 16 17 18 19
## ..$ Item_Identifier_Str2: int 22 23 24
pred_train <- new_combi %>%
filter(Item_Outlet_Sales != -999)
pred_test <- new_combi %>%
filter(Item_Outlet_Sales == -999)
#dim(pred_train)
#dim(pred_test)
p1 <- ggplot(pred_train, aes(x = Item_Weight, y = Item_Outlet_Sales)) +
geom_point() +
geom_smooth(method = 'lm', se = FALSE, color='tomato')
p2 <- ggplot(pred_train, aes(x = Item_Visibility, y = Item_Outlet_Sales)) +
geom_point() +
geom_smooth(method = 'lm', se = FALSE, color='tomato')
p3 <- ggplot(pred_train, aes(x = Item_MRP, y = Item_Outlet_Sales)) +
geom_point() +
geom_smooth(method = 'lm', se = FALSE, color='tomato')
p4 <- ggplot(pred_train, aes(x = Item_Identifier_Num, y = Item_Outlet_Sales)) +
geom_point() +
geom_smooth(method = 'lm', se = FALSE, color='tomato')
p5 <- ggplot(pred_train, aes(x = Outlet_Age, y = Item_Outlet_Sales)) +
geom_point() +
geom_smooth(method = 'lm', se = FALSE, color='tomato')
grid.arrange(p1, p2, p3, p4, p5, ncol=3)
library("corrplot")
library(RColorBrewer)
M<-cor(pred_train)
corrplot(M, diag = FALSE, order = "FPC",
tl.pos = "td", tl.cex = 0.5, method = "circle", type = "upper")
set.seed(1)
n <- nrow(pred_train)
shuffled <- pred_train[sample(n),]
# Split the shuffled training data into 70%/30% train/test subsets:
train_indices <- 1:round(0.7*n)
test_indices <- (round(0.7*n)+1):n
splitted_train <- shuffled[train_indices,]
splitted_test <- shuffled[test_indices,]
# build model with train data: (70% of actual data)
linear_model_simple <- lm(Item_Outlet_Sales ~ ., data = splitted_train)
#summary(linear_model_simple)
linear_model_log <- lm(log10(Item_Outlet_Sales) ~ ., data = splitted_train)
#summary(linear_model_log)
linear_model_sqrt <- lm(sqrt(Item_Outlet_Sales) ~ ., data = splitted_train)
#summary(linear_model_sqrt)
# Make predictions on the 30% test subset and on the 70% training subset
pred_simple_test <- predict(linear_model_simple, splitted_test)
pred_log_test <-10 ^ predict(linear_model_log, splitted_test)
pred_sqrt_test <-predict(linear_model_sqrt, splitted_test) ^ 2
pred_simple_train <- predict(linear_model_simple, splitted_train)
pred_log_train <-10 ^ predict(linear_model_log, splitted_train)
pred_sqrt_train <-predict(linear_model_sqrt, splitted_train) ^ 2
#
print("MAE Function for test")
## [1] "MAE Function for test"
(MAE <- function(actual, predicted){mean(abs(actual - predicted))})
## function(actual, predicted){mean(abs(actual - predicted))}
print("MAE of Simple for test")
## [1] "MAE of Simple for test"
MAE(splitted_test$Item_Outlet_Sales, pred_simple_test)
## [1] 840.0915
print("MAE of Log for test")
## [1] "MAE of Log for test"
(MAE(splitted_test$Item_Outlet_Sales, pred_log_test))
## [1] 798.8246
print("MAE of Sqrt for test")
## [1] "MAE of Sqrt for test"
(MAE(splitted_test$Item_Outlet_Sales, pred_sqrt_test))
## [1] 780.975
print("RMSE Function")
## [1] "RMSE Function"
(RMSE <- function(actual, predicted) {sqrt(mean((actual - predicted)^2))})
## function(actual, predicted) {sqrt(mean((actual - predicted)^2))}
print("RMSE of Simple for test dataset")
## [1] "RMSE of Simple for test dataset"
(RMSE(splitted_test$Item_Outlet_Sales, pred_simple_test))
## [1] 1138.833
print("RMSE of Simple for train dataset")
## [1] "RMSE of Simple for train dataset"
(RMSE(splitted_train$Item_Outlet_Sales, pred_simple_train))
## [1] 1121.622
print("RMSE of Simple for train/test")
## [1] "RMSE of Simple for train/test"
(RMSE(splitted_train$Item_Outlet_Sales, pred_simple_train))/(RMSE(splitted_test$Item_Outlet_Sales, pred_simple_test))
## [1] 0.9848873
print("RMSE of Log for test dataset")
## [1] "RMSE of Log for test dataset"
(RMSE(splitted_test$Item_Outlet_Sales, pred_log_test))
## [1] 1164.727
print("RMSE of Log for train dataset")
## [1] "RMSE of Log for train dataset"
(RMSE(splitted_train$Item_Outlet_Sales, pred_log_train))
## [1] 1127.386
print("RMSE of Log for train/test")
## [1] "RMSE of Log for train/test"
(RMSE(splitted_train$Item_Outlet_Sales, pred_log_train))/(RMSE(splitted_test$Item_Outlet_Sales, pred_log_test))
## [1] 0.9679399
print("RMSE of Sqrt for test dataset")
## [1] "RMSE of Sqrt for test dataset"
(RMSE(splitted_test$Item_Outlet_Sales, pred_sqrt_test))
## [1] 1116.636
print("RMSE of Sqrt for train dataset")
## [1] "RMSE of Sqrt for train dataset"
(RMSE(splitted_train$Item_Outlet_Sales, pred_sqrt_train))
## [1] 1089.782
print("RMSE of Sqrt for train/test")
## [1] "RMSE of Sqrt for train/test"
(RMSE(splitted_train$Item_Outlet_Sales, pred_sqrt_train))/(RMSE(splitted_test$Item_Outlet_Sales, pred_sqrt_test))
## [1] 0.9759507
summary(linear_model_simple)$coefficients[,4] < 0.05
## (Intercept) Item_Weight
## TRUE FALSE
## Item_Visibility Item_MRP
## FALSE TRUE
## Outlet_Size_ Outlet_Size_High
## TRUE FALSE
## Outlet_Size_Medium `Outlet_Location_Type_Tier 1`
## FALSE FALSE
## `Outlet_Location_Type_Tier 2` `Outlet_Type_Grocery Store`
## FALSE TRUE
## `Outlet_Type_Supermarket Type1` `Outlet_Type_Supermarket Type2`
## TRUE TRUE
## Item_Identifier_Str2_DR Item_Identifier_Str2_FD
## FALSE FALSE
## Item_Identifier_Num Outlet_Age
## TRUE TRUE
('')
## [1] ""
summary(linear_model_log)$coefficients[,4] < 0.05
## (Intercept) Item_Weight
## TRUE FALSE
## Item_Visibility Item_MRP
## FALSE TRUE
## Outlet_Size_ Outlet_Size_High
## TRUE TRUE
## Outlet_Size_Medium `Outlet_Location_Type_Tier 1`
## FALSE TRUE
## `Outlet_Location_Type_Tier 2` `Outlet_Type_Grocery Store`
## TRUE TRUE
## `Outlet_Type_Supermarket Type1` `Outlet_Type_Supermarket Type2`
## TRUE TRUE
## Item_Identifier_Str2_DR Item_Identifier_Str2_FD
## FALSE FALSE
## Item_Identifier_Num Outlet_Age
## TRUE TRUE
('')
## [1] ""
summary(linear_model_sqrt)$coefficients[,4] < 0.05
## (Intercept) Item_Weight
## TRUE FALSE
## Item_Visibility Item_MRP
## FALSE TRUE
## Outlet_Size_ Outlet_Size_High
## TRUE FALSE
## Outlet_Size_Medium `Outlet_Location_Type_Tier 1`
## FALSE FALSE
## `Outlet_Location_Type_Tier 2` `Outlet_Type_Grocery Store`
## FALSE TRUE
## `Outlet_Type_Supermarket Type1` `Outlet_Type_Supermarket Type2`
## TRUE TRUE
## Item_Identifier_Str2_DR Item_Identifier_Str2_FD
## FALSE FALSE
## Item_Identifier_Num Outlet_Age
## TRUE TRUE
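Rather than reading the TRUE/FALSE tables by eye, the names of the significant predictors can also be extracted programmatically (a sketch, shown for the simple model; the same pattern applies to the log and sqrt models):
names(which(summary(linear_model_simple)$coefficients[, 4] < 0.05))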
set.seed(1)
(formula_simp <- as.formula(Item_Outlet_Sales ~ Item_MRP + Outlet_Size_ + `Outlet_Type_Grocery Store` + `Outlet_Type_Supermarket Type1` + `Outlet_Type_Supermarket Type2`+ Item_Identifier_Num + Outlet_Age))
## Item_Outlet_Sales ~ Item_MRP + Outlet_Size_ + `Outlet_Type_Grocery Store` +
## `Outlet_Type_Supermarket Type1` + `Outlet_Type_Supermarket Type2` +
## Item_Identifier_Num + Outlet_Age
(formula_log <- as.formula(log10(Item_Outlet_Sales) ~ Item_MRP + Outlet_Size_ + Outlet_Size_High + `Outlet_Location_Type_Tier 1` + `Outlet_Location_Type_Tier 2` + `Outlet_Type_Grocery Store` + `Outlet_Type_Supermarket Type1` + `Outlet_Type_Supermarket Type2`+ Item_Identifier_Num + Outlet_Age))
## log10(Item_Outlet_Sales) ~ Item_MRP + Outlet_Size_ + Outlet_Size_High +
## `Outlet_Location_Type_Tier 1` + `Outlet_Location_Type_Tier 2` +
## `Outlet_Type_Grocery Store` + `Outlet_Type_Supermarket Type1` +
## `Outlet_Type_Supermarket Type2` + Item_Identifier_Num + Outlet_Age
(formula_sqrt <- as.formula(sqrt(Item_Outlet_Sales) ~ Item_MRP + Outlet_Size_ + `Outlet_Type_Grocery Store` + `Outlet_Type_Supermarket Type1` + `Outlet_Type_Supermarket Type2`+ Item_Identifier_Num + Outlet_Age))
## sqrt(Item_Outlet_Sales) ~ Item_MRP + Outlet_Size_ + `Outlet_Type_Grocery Store` +
## `Outlet_Type_Supermarket Type1` + `Outlet_Type_Supermarket Type2` +
## Item_Identifier_Num + Outlet_Age
# build model with train data: (70% of actual data)
linear_model_simple <- lm(formula_simp, data = splitted_train)
summary(linear_model_simple)
##
## Call:
## lm(formula = formula_simp, data = splitted_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3815.5 -670.1 -94.0 575.8 7899.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1636.4040 112.8537 14.500 < 2e-16 ***
## Item_MRP 15.5596 0.2334 66.665 < 2e-16 ***
## Outlet_Size_ -100.1277 45.1746 -2.216 0.02670 *
## `Outlet_Type_Grocery Store` -3348.3780 63.3187 -52.881 < 2e-16 ***
## `Outlet_Type_Supermarket Type1` -1479.8923 62.1051 -23.829 < 2e-16 ***
## `Outlet_Type_Supermarket Type2` -1967.4377 102.1684 -19.257 < 2e-16 ***
## Item_Identifier_Num 3.4458 0.8406 4.099 4.2e-05 ***
## Outlet_Age -8.9377 3.3681 -2.654 0.00798 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1123 on 5958 degrees of freedom
## Multiple R-squared: 0.5625, Adjusted R-squared: 0.562
## F-statistic: 1094 on 7 and 5958 DF, p-value: < 2.2e-16
linear_model_log <- lm(formula_log, data = splitted_train)
#summary(linear_model_log)
linear_model_sqrt <- lm(formula_sqrt, data = splitted_train)
#summary(linear_model_sqrt)
# Make predictions on the 30% test subset and on the 70% training subset
pred_simple_test <- predict(linear_model_simple, splitted_test)
pred_log_test <-10 ^ predict(linear_model_log, splitted_test)
pred_sqrt_test <-predict(linear_model_sqrt, splitted_test) ^ 2
pred_simple_train <- predict(linear_model_simple, splitted_train)
pred_log_train <-10 ^ predict(linear_model_log, splitted_train)
pred_sqrt_train <-predict(linear_model_sqrt, splitted_train) ^ 2
#
print("MAE Function")
## [1] "MAE Function"
MAE <- function(actual, predicted){mean(abs(actual - predicted))}
print("MAE of Simple for test")
## [1] "MAE of Simple for test"
MAE(splitted_test$Item_Outlet_Sales, pred_simple_test)
## [1] 839.5721
print("MAE of Log for test")
## [1] "MAE of Log for test"
(MAE(splitted_test$Item_Outlet_Sales, pred_log_test))
## [1] 799.2971
print("MAE of Sqrt for Test")
## [1] "MAE of Sqrt for Test"
(MAE(splitted_test$Item_Outlet_Sales, pred_sqrt_test))
## [1] 781.7254
print("RMSE Function")
## [1] "RMSE Function"
RMSE <- function(actual, predicted) {sqrt(mean((actual - predicted)^2))}
print("RMSE of Simple for test dataset")
## [1] "RMSE of Simple for test dataset"
(RMSE(splitted_test$Item_Outlet_Sales, pred_simple_test))
## [1] 1139.64
print("RMSE of Simple for train dataset")
## [1] "RMSE of Simple for train dataset"
(RMSE(splitted_train$Item_Outlet_Sales, pred_simple_train))
## [1] 1122.395
print("RMSE of Simple for train/test")
## [1] "RMSE of Simple for train/test"
(RMSE(splitted_train$Item_Outlet_Sales, pred_simple_train))/(RMSE(splitted_test$Item_Outlet_Sales, pred_simple_test))
## [1] 0.9848678
print("RMSE of Log for test dataset")
## [1] "RMSE of Log for test dataset"
(RMSE(splitted_test$Item_Outlet_Sales, pred_log_test))
## [1] 1165.215
print("RMSE of Log for train dataset")
## [1] "RMSE of Log for train dataset"
(RMSE(splitted_train$Item_Outlet_Sales, pred_log_train))
## [1] 1127.359
print("RMSE of Log for train/test")
## [1] "RMSE of Log for train/test"
(RMSE(splitted_train$Item_Outlet_Sales, pred_log_train))/(RMSE(splitted_test$Item_Outlet_Sales, pred_log_test))
## [1] 0.9675116
print("RMSE of Sqrt for test dataset")
## [1] "RMSE of Sqrt for test dataset"
(RMSE(splitted_test$Item_Outlet_Sales, pred_sqrt_test))
## [1] 1118.607
print("RMSE of Sqrt for train dataset")
## [1] "RMSE of Sqrt for train dataset"
(RMSE(splitted_train$Item_Outlet_Sales, pred_sqrt_train))
## [1] 1090.689
print("RMSE of Sqrt for train/test")
## [1] "RMSE of Sqrt for train/test"
(RMSE(splitted_train$Item_Outlet_Sales, pred_sqrt_train))/(RMSE(splitted_test$Item_Outlet_Sales, pred_sqrt_test))
## [1] 0.9750425
qqnorm(linear_model_simple$residuals, ylab = "Residual Quantiles")
qqnorm(linear_model_log$residuals, ylab = "Residual Quantiles")
qqnorm(linear_model_sqrt$residuals, ylab = "Residual Quantiles")
plot(linear_model_simple$fitted.values, linear_model_simple$residuals)
#create a new variable in test file
test$Item_Outlet_Sales <- -999
#combine train and test data
combi <- rbind(train, test)
#impute missing value in Item_Weight
combi$Item_Weight[is.na(combi$Item_Weight)] <- median(combi$Item_Weight, na.rm = TRUE)
#impute 0 in item_visibility
combi$Item_Visibility <- ifelse(combi$Item_Visibility == 0, median(combi$Item_Visibility), combi$Item_Visibility)
#rename the blank ("") level in Outlet_Size
levels(combi$Outlet_Size)[1] <- "Other"
#rename levels of Item_Fat_Content
library(plyr)
combi$Item_Fat_Content <- revalue(combi$Item_Fat_Content,c("LF" = "Low Fat", "reg" = "Regular"))
combi$Item_Fat_Content <- revalue(combi$Item_Fat_Content, c("low fat" = "Low Fat"))
#create a new column 2013 - Year
combi$Year <- 2013 - combi$Outlet_Establishment_Year
#drop variables not required in modeling
library(dplyr)
combi <- select(combi, -c(Item_Identifier, Outlet_Identifier, Outlet_Establishment_Year))
#divide data set
new_train <- combi[1:nrow(train),]
new_test <- combi[-(1:nrow(train)),]
#linear regression
linear_model_simple <- lm(Item_Outlet_Sales ~ ., data = new_train)
summary(linear_model_simple)
##
## Call:
## lm(formula = Item_Outlet_Sales ~ ., data = new_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4336.2 -680.8 -89.8 568.3 7946.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -977.43100 304.34139 -3.212 0.001325 **
## Item_Weight -0.51139 2.91490 -0.175 0.860738
## Item_Fat_ContentRegular 40.50509 28.23186 1.435 0.151401
## Item_Visibility -255.77401 263.78199 -0.970 0.332253
## Item_TypeBreads 5.25222 84.04714 0.062 0.950173
## Item_TypeBreakfast 6.26997 116.59546 0.054 0.957115
## Item_TypeCanned 25.28447 62.76864 0.403 0.687091
## Item_TypeDairy -41.05177 62.22667 -0.660 0.509456
## Item_TypeFrozen Foods -28.00838 58.86986 -0.476 0.634252
## Item_TypeFruits and Vegetables 29.84893 54.95872 0.543 0.587065
## Item_TypeHard Drinks -0.07955 90.18745 -0.001 0.999296
## Item_TypeHealth and Hygiene -10.09434 68.02370 -0.148 0.882035
## Item_TypeHousehold -39.12266 59.92886 -0.653 0.513891
## Item_TypeMeat -0.35463 70.66270 -0.005 0.995996
## Item_TypeOthers -21.27266 98.62482 -0.216 0.829232
## Item_TypeSeafood 184.55373 148.00848 1.247 0.212464
## Item_TypeSnack Foods -11.48660 55.25066 -0.208 0.835312
## Item_TypeSoft Drinks -27.46544 70.16973 -0.391 0.695501
## Item_TypeStarchy Foods 21.86315 103.04028 0.212 0.831971
## Item_MRP 15.56477 0.19769 78.733 < 2e-16 ***
## Outlet_SizeHigh 849.13529 256.29910 3.313 0.000927 ***
## Outlet_SizeMedium 170.93952 71.08120 2.405 0.016200 *
## Outlet_SizeSmall 144.05363 45.67140 3.154 0.001615 **
## Outlet_Location_TypeTier 2 -100.36636 90.23496 -1.112 0.266050
## Outlet_Location_TypeTier 3 -326.31974 154.89294 -2.107 0.035169 *
## Outlet_TypeSupermarket Type1 1473.06659 140.73922 10.467 < 2e-16 ***
## Outlet_TypeSupermarket Type2 1076.77568 136.08116 7.913 2.83e-15 ***
## Outlet_TypeSupermarket Type3 3639.29114 178.13923 20.429 < 2e-16 ***
## Year -34.82155 10.49271 -3.319 0.000908 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1129 on 8494 degrees of freedom
## Multiple R-squared: 0.5637, Adjusted R-squared: 0.5623
## F-statistic: 392 on 28 and 8494 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(linear_model_simple)
linear_model_log <- lm(log(Item_Outlet_Sales) ~ ., data = new_train)
#summary(linear_model)
par(mfrow = c(2,2))
plot(linear_model_log)
library(rpart)
library(e1071)
library(rpart.plot)
library(caret)
#setting the tree control parameters
fitControl <- trainControl(method = "cv", number = 5)
cartGrid <- expand.grid(.cp=(1:50)*0.01)
#decision tree
tree_model <- train(Item_Outlet_Sales ~ ., data = new_train, method = "rpart", trControl = fitControl, tuneGrid = cartGrid)
#print(tree_model)
main_tree <- rpart(Item_Outlet_Sales ~ ., data = new_train, control = rpart.control(cp=0.01))
prp(main_tree)
main_tree <- rpart(Item_Outlet_Sales ~ ., data = new_train, control = rpart.control(cp=0.001))
prp(main_tree)
library(Metrics)
#rmse(new_train$Item_Outlet_Sales, linear_model$fitted.values)
rmse(new_train$Item_Outlet_Sales, linear_model_simple$fitted.values)
## [1] 1127.081
rmse(new_train$Item_Outlet_Sales, exp(linear_model_log$fitted.values))
## [1] 1140.004
pre_score <- predict(main_tree, type = "vector")
rmse(new_train$Item_Outlet_Sales, pre_score)
## [1] 1062.202
The steps below follow the approach from “Statistical Modeling in R (Part 1)”.
lm_model_1 <- lm(Item_Outlet_Sales ~ ., data = new_train)
rpart_model_1 <- rpart(Item_Outlet_Sales ~ ., data = new_train)
lm_model_1_output <- predict(lm_model_1, newdata = new_train)
rpart_model_1_output <- predict(rpart_model_1, newdata = new_train)
# Find the case by case differences
lm_model_1_diff <- with(new_train, Item_Outlet_Sales - lm_model_1_output)
rpart_model_1_diff <- with(new_train, Item_Outlet_Sales - rpart_model_1_output)
# Calculate the mean residual of each model (in-sample)
mean(lm_model_1_diff)
## [1] 6.105024e-11
mean(rpart_model_1_diff)
## [1] 1.151509e-12
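Both mean residuals are effectively zero, as expected for in-sample fits, so they say little about accuracy; the in-sample mean squared errors are more informative (a sketch):
mean(lm_model_1_diff ^ 2)
mean(rpart_model_1_diff ^ 2)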
(formula <- as.formula(Item_Outlet_Sales ~ Item_Weight + Item_Fat_Content + Item_Visibility + Item_Type + Item_MRP + Outlet_Size +
Outlet_Location_Type + Outlet_Type + Year))
## Item_Outlet_Sales ~ Item_Weight + Item_Fat_Content + Item_Visibility +
## Item_Type + Item_MRP + Outlet_Size + Outlet_Location_Type +
## Outlet_Type + Year
new_train$training_cases <- rnorm(nrow(new_train)) > 0
# summary(new_train$training_cases)
lm_model_1 <- lm(formula, data = subset(new_train, training_cases))
rpart_model_1 <- rpart(formula, data = subset(new_train, training_cases))
lm_preds <- predict(lm_model_1, newdata = subset(new_train, !training_cases))
rpart_preds <- predict(rpart_model_1, newdata = subset(new_train, !training_cases))
# Compute the out-of-sample mean squared error for each model
with(subset(new_train, !training_cases), mean((Item_Outlet_Sales - lm_preds) ^ 2))
## [1] 1241263
with(subset(new_train, !training_cases), mean((Item_Outlet_Sales - rpart_preds) ^ 2))
## [1] 1190170