DataMunglers

Team Members

Our Objective

Data Getting

About BigMart

Get Used to Big Mart Sales Data

1.Exploring The BigMart Dataset

  • Call Libraries and Read Data from File
library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
library(corrplot)

setwd("/Users/yetkineser/Desktop/mef R/data")
train <- read.csv('bigMartTrain.csv')
test  <- read.csv('bigMartTest.csv')
  • View the structere of train with glimpse function
glimpse(train)
## Observations: 8,523
## Variables: 12
## $ Item_Identifier           <fctr> FDA15, DRC01, FDN15, FDX07, NCD19, ...
## $ Item_Weight               <dbl> 9.300, 5.920, 17.500, 19.200, 8.930,...
## $ Item_Fat_Content          <fctr> Low Fat, Regular, Low Fat, Regular,...
## $ Item_Visibility           <dbl> 0.016047301, 0.019278216, 0.01676007...
## $ Item_Type                 <fctr> Dairy, Soft Drinks, Meat, Fruits an...
## $ Item_MRP                  <dbl> 249.8092, 48.2692, 141.6180, 182.095...
## $ Outlet_Identifier         <fctr> OUT049, OUT018, OUT049, OUT010, OUT...
## $ Outlet_Establishment_Year <int> 1999, 2009, 1999, 1998, 1987, 2009, ...
## $ Outlet_Size               <fctr> Medium, Medium, Medium, , High, Med...
## $ Outlet_Location_Type      <fctr> Tier 1, Tier 3, Tier 1, Tier 3, Tie...
## $ Outlet_Type               <fctr> Supermarket Type1, Supermarket Type...
## $ Item_Outlet_Sales         <dbl> 3735.1380, 443.4228, 2097.2700, 732....
  • Our dataset has 12 columns and 8523 rows Look at all columns, Firstly factor type columns

  • Item_Identifier : Unique Product ID

train %>%
  summarise(n_distinct(Item_Identifier))
##   n_distinct(Item_Identifier)
## 1                        1559
  • Item_Fat_Content : Whether the product is low fat or not
train %>%
  group_by(Item_Fat_Content) %>%
  summarise(Count = n(),Perc=round(n()/nrow(.)*100,2)) %>%
  arrange(desc(Count))
## # A tibble: 5 x 3
##   Item_Fat_Content Count  Perc
##             <fctr> <int> <dbl>
## 1          Low Fat  5089 59.71
## 2          Regular  2889 33.90
## 3               LF   316  3.71
## 4              reg   117  1.37
## 5          low fat   112  1.31
  • Item_Type : The category to which the product belongs
train%>%
  group_by(Item_Type) %>% 
  summarise(Count = n(), Perc = round(n() / nrow(.) * 100, 2)) %>%
  arrange(desc(Count))
## # A tibble: 16 x 3
##                Item_Type Count  Perc
##                   <fctr> <int> <dbl>
##  1 Fruits and Vegetables  1232 14.46
##  2           Snack Foods  1200 14.08
##  3             Household   910 10.68
##  4          Frozen Foods   856 10.04
##  5                 Dairy   682  8.00
##  6                Canned   649  7.61
##  7          Baking Goods   648  7.60
##  8    Health and Hygiene   520  6.10
##  9           Soft Drinks   445  5.22
## 10                  Meat   425  4.99
## 11                Breads   251  2.94
## 12           Hard Drinks   214  2.51
## 13                Others   169  1.98
## 14         Starchy Foods   148  1.74
## 15             Breakfast   110  1.29
## 16               Seafood    64  0.75
  • Outlet_Identifier : Unique Store ID
train %>%
  group_by(Outlet_Identifier) %>%
  summarise(Count = n(), Perc = round(n() / nrow(.) * 100, 2)) %>%
  arrange(desc(Count))
## # A tibble: 10 x 3
##    Outlet_Identifier Count  Perc
##               <fctr> <int> <dbl>
##  1            OUT027   935 10.97
##  2            OUT013   932 10.94
##  3            OUT035   930 10.91
##  4            OUT046   930 10.91
##  5            OUT049   930 10.91
##  6            OUT045   929 10.90
##  7            OUT018   928 10.89
##  8            OUT017   926 10.86
##  9            OUT010   555  6.51
## 10            OUT019   528  6.20
  • Outlet_Size : The size of the store in terms of ground area covered
train%>%
  group_by(Outlet_Size) %>%
  summarise(Count = n(),Perc = round(n() / nrow(.) * 100, 2)) %>%
  arrange(desc(Count))
## # A tibble: 4 x 3
##   Outlet_Size Count  Perc
##        <fctr> <int> <dbl>
## 1      Medium  2793 32.77
## 2              2410 28.28
## 3       Small  2388 28.02
## 4        High   932 10.94
  • Outlet_Location_Type : The type of city in which the store is located
train%>%
  group_by(Outlet_Location_Type) %>%
  summarise(Count = n(),Perc = round(n()/nrow(.) * 100, 2)) %>%
  arrange(desc(Count))
## # A tibble: 3 x 3
##   Outlet_Location_Type Count  Perc
##                 <fctr> <int> <dbl>
## 1               Tier 3  3350 39.31
## 2               Tier 2  2785 32.68
## 3               Tier 1  2388 28.02
  • Outlet_Type : Whether the outlet is just a grocery store or some sort of supermarket
train%>%
  group_by(Outlet_Type)%>%
  summarise(Count=n(),Perc=round(n()/nrow(.)*100,2))%>%
  arrange(desc(Count))
## # A tibble: 4 x 3
##         Outlet_Type Count  Perc
##              <fctr> <int> <dbl>
## 1 Supermarket Type1  5577 65.43
## 2     Grocery Store  1083 12.71
## 3 Supermarket Type3   935 10.97
## 4 Supermarket Type2   928 10.89
  • Item_Weight : Weight of product
train%>%
    summarise(is_NULL=sum(is.na(Item_Weight)==1),
              is_NOT_NULL=sum(!is.na(Item_Weight)==1)
              )
##   is_NULL is_NOT_NULL
## 1    1463        7060
train%>%
  filter(!is.na(Item_Weight))%>%
  summarise(
    Max=max(Item_Weight),
    Min=min(Item_Weight),
    Mean=mean(Item_Weight),
    Median=median(Item_Weight),
    QUA1=quantile(Item_Weight,1/4),
    QUA3=quantile(Item_Weight,3/4),
    IQR=IQR(Item_Weight)
  )
##     Max   Min     Mean Median    QUA1  QUA3     IQR
## 1 21.35 4.555 12.85765   12.6 8.77375 16.85 8.07625
  • Item_Visibility : The % of total display area of all products in a store allocated to the particular product
train%>%
    summarise(is_NULL=sum(is.na(Item_Visibility)==1),
              is_NOT_NULL=sum(!is.na(Item_Visibility)==1)
              )
##   is_NULL is_NOT_NULL
## 1       0        8523
train%>%
  filter(!is.na(Item_Visibility))%>%
  summarise(
    Max=max(Item_Visibility),
    Min=min(Item_Visibility),
    Mean=mean(Item_Visibility),
    Median=median(Item_Visibility),
    QUA1=quantile(Item_Visibility,1/4),
    QUA3=quantile(Item_Visibility,3/4),
    IQR=IQR(Item_Visibility)
  )
##         Max Min       Mean     Median       QUA1       QUA3        IQR
## 1 0.3283909   0 0.06613203 0.05393093 0.02698948 0.09458529 0.06759582
  • Item_MRP : Maximum Retail Price (list price) of the product
train%>%
    summarise(is_NULL=sum(is.na(Item_MRP)==1),
              is_NOT_NULL=sum(!is.na(Item_MRP)==1)
              )
##   is_NULL is_NOT_NULL
## 1       0        8523
train%>%
  filter(!is.na(Item_MRP))%>%
  summarise(
    Max=max(Item_MRP),
    Min=min(Item_MRP),
    Mean=mean(Item_MRP),
    Median=median(Item_MRP),
    QUA1=quantile(Item_MRP,1/4),
    QUA3=quantile(Item_MRP,3/4),
    IQR=IQR(Item_MRP)
  )
##        Max   Min     Mean   Median    QUA1     QUA3     IQR
## 1 266.8884 31.29 140.9928 143.0128 93.8265 185.6437 91.8172
  • Outlet_Establishment_Year : The year in which store was established
train%>%
    summarise(is_NULL=sum(is.na(Outlet_Establishment_Year)==1),
              is_NOT_NULL=sum(!is.na(Outlet_Establishment_Year)==1)
              )
##   is_NULL is_NOT_NULL
## 1       0        8523
train%>%
  filter(!is.na(Outlet_Establishment_Year))%>%
  summarise(
    Max=max(Outlet_Establishment_Year),
    Min=min(Outlet_Establishment_Year),
    Mean=mean(Outlet_Establishment_Year),
    Median=median(Outlet_Establishment_Year),
    QUA1=quantile(Outlet_Establishment_Year,1/4),
    QUA3=quantile(Outlet_Establishment_Year,3/4),
    IQR=IQR(Outlet_Establishment_Year)
  )
##    Max  Min     Mean Median QUA1 QUA3 IQR
## 1 2009 1985 1997.832   1999 1987 2004  17
  • Item_Outlet_Sales : Sales of the product in the particulat store. This is the outcome variable to be predicted.
train%>%
    summarise(is_NULL=sum(is.na(Item_Outlet_Sales)==1),
              is_NOT_NULL=sum(!is.na(Item_Outlet_Sales)==1)
              )
##   is_NULL is_NOT_NULL
## 1       0        8523
train%>%
  filter(!is.na(Item_Outlet_Sales))%>%
  summarise(
    Max=max(Item_Outlet_Sales),
    Min=min(Item_Outlet_Sales),
    Mean=mean(Item_Outlet_Sales),
    Median=median(Item_Outlet_Sales),
    QUA1=quantile(Item_Outlet_Sales,1/4),
    QUA3=quantile(Item_Outlet_Sales,3/4),
    IQR=IQR(Item_Outlet_Sales)
  )
##        Max   Min     Mean   Median     QUA1     QUA3      IQR
## 1 13086.96 33.29 2181.289 1794.331 834.2474 3101.296 2267.049
  • Summary =)))
summary(train)
##  Item_Identifier  Item_Weight     Item_Fat_Content Item_Visibility  
##  FDG33  :  10    Min.   : 4.555   LF     : 316     Min.   :0.00000  
##  FDW13  :  10    1st Qu.: 8.774   Low Fat:5089     1st Qu.:0.02699  
##  DRE49  :   9    Median :12.600   Regular:2889     Median :0.05393  
##  DRN47  :   9    Mean   :12.858   low fat: 112     Mean   :0.06613  
##  FDD38  :   9    3rd Qu.:16.850   reg    : 117     3rd Qu.:0.09459  
##  FDF52  :   9    Max.   :21.350                    Max.   :0.32839  
##  (Other):8467    NA's   :1463                                       
##                  Item_Type       Item_MRP      Outlet_Identifier
##  Fruits and Vegetables:1232   Min.   : 31.29   OUT027 : 935     
##  Snack Foods          :1200   1st Qu.: 93.83   OUT013 : 932     
##  Household            : 910   Median :143.01   OUT035 : 930     
##  Frozen Foods         : 856   Mean   :140.99   OUT046 : 930     
##  Dairy                : 682   3rd Qu.:185.64   OUT049 : 930     
##  Canned               : 649   Max.   :266.89   OUT045 : 929     
##  (Other)              :2994                    (Other):2937     
##  Outlet_Establishment_Year Outlet_Size   Outlet_Location_Type
##  Min.   :1985                    :2410   Tier 1:2388         
##  1st Qu.:1987              High  : 932   Tier 2:2785         
##  Median :1999              Medium:2793   Tier 3:3350         
##  Mean   :1998              Small :2388                       
##  3rd Qu.:2004                                                
##  Max.   :2009                                                
##                                                              
##             Outlet_Type   Item_Outlet_Sales 
##  Grocery Store    :1083   Min.   :   33.29  
##  Supermarket Type1:5577   1st Qu.:  834.25  
##  Supermarket Type2: 928   Median : 1794.33  
##  Supermarket Type3: 935   Mean   : 2181.29  
##                           3rd Qu.: 3101.30  
##                           Max.   :13086.97  
## 

2. Data Manipulation

  • Let?s first combine the data sets. This will save our time as we don?t need to write separate codes for train and test data sets. To combine the two data frames, we must make sure that they have equal columns, which is not the case. Test data set has one less column (response variable). Let?s first add the column. We can give this column any value. An intuitive approach would be to extract the mean value of sales from train data set and use it as placeholder for test variable Item Outlet Sales. Anyways, let?s make it simple for now. I?ve taken a value -999. Now, we?ll combine the data sets. (!!!!Change!!!!)
test$Item_Outlet_Sales <- -999
combi <- rbind(train, test)
  • Impute missing value by median. We are using median because it is known to be highly robust to outliers. Moreover, for this problem, our evaluation metric is RMSE which is also highly affected by outliers. Hence, median is better in this case. (!!!!Change!!!!)
combi$Item_Weight[is.na(combi$Item_Weight)] <- median(combi$Item_Weight, na.rm = TRUE)
table(is.na(combi$Item_Weight))
## 
## FALSE 
## 14204
  • Let?s take up Item_Visibility. In the graph above, we saw item visibility has zero value also, which is practically not feasible. Hence, we?ll consider it as a missing value and once again make the imputation using median.
combi$Item_Visibility <- ifelse(combi$Item_Visibility == 0,
                           median(combi$Item_Visibility), combi$Item_Visibility) 
  • Let?s proceed to categorical variables now. During exploration, we saw there are mis-matched levels in variables which needs to be corrected. Item fat content should be corrected.
combi$Item_Fat_Content <- str_replace(
           str_replace(
             str_replace(combi$Item_Fat_Content,"LF","Low Fat")
             ,"reg","Regular"),"low fat","Low Fat")
  
table(combi$Item_Fat_Content)
## 
## Low Fat Regular 
##    9185    5019
  • We need to mutate new columns for more meaningful data. First, we evaluate Item_Identifier column because We discovered Item_Identifier column has special codes to recognize the type of item when we tried to understand data. So, letters on Item_Identifier column means Food, Drinks and etc. and numbers can be meaningful. We observed first three letters and two letters to control which of them is more meaningful. In any case, we hold both of them as two seperate column to use them later but now, we decided to use first two letters. (DR=Drink, FD=Food,NC=Non-Consumable) Secondly, we generate new Outlet_Age column from Outlet_Establishment_Year.
##first two and three letter.
combi <-
  combi %>%
  mutate(Item_Identifier_Str3 = substr(Item_Identifier,1,3),  #First three letter of Item_Identifier. 
         Item_Identifier_Str2 = substr(Item_Identifier,1,2),  #First second letter of Item_Identifier.
         Item_Identifier_Num=as.numeric(substr(Item_Identifier,4,6)), # Number part of Item_Identifier column.
         Outlet_Age=2013-Outlet_Establishment_Year, #Outlet Age 
         PK=row_number())

#table(combi$Item_Identifier_Str3)

table(combi$Item_Identifier_Str2)
## 
##    DR    FD    NC 
##  1317 10201  2686
#combi %>% summarise(n_distinct(Item_Identifier_Str3))

3. Data Visualizing

  • We should use train data for visualizing data.
#Split combined data into train and test data again to examine clearly. 
new_train <- combi %>% 
  filter(Item_Outlet_Sales != -999)

new_test <- combi %>% 
  filter(Item_Outlet_Sales == -999)
#dim(new_train)
#dim(new_test)
  • We manipulated Item_Fat_Content column; so graph it to see new version.
# Looking at new Item_Fat_Content column.
qplot(x=Item_Fat_Content,data=new_train)

  • Looking at Item type and Item Identifier because we observed there is a correlation between these two columns and we manipulated Item_Identifier_Content column as Item_Identifier_Str2 according to Item_Type data. (For example; if Item_Type = “Soft Drinks”, then Item_Identifier_Content starts with ‘DR’ .) As you can see on second graph group; manipulated Item_Identifier_Str2 column is correct and clear now.
# Looking at Item Type.
qplot(x=Item_Type,data=new_train)+
  geom_bar(color="green")+
  theme(axis.text = element_text(angle = 0,color="purple"))+
  coord_flip()

# Looking at Item Type with facet wrap according to Item_Identifier_Str
qplot(x=Item_Type,data=new_train)+
  geom_bar(color="green")+
  theme(axis.text = element_text(color="purple"))+
  coord_flip()+
  facet_wrap(~Item_Identifier_Str2,nrow=1)

  • Observing the relationship between Item Type/Item Identifier and Outlet_Identifier column to check Item distribution is similar for each Outlet. As we can see, the distribution of items on outlets is very similar.
# Looking at Item Type with facet wrap according to Outlet Identifier.

qplot(x = Item_Type, data = new_train) +
  geom_bar(color = "green") +
  theme(axis.text = element_text(angle = 0,color = "purple")) +
  coord_flip() +
  facet_wrap(~Outlet_Identifier, nrow = 2)

qplot(x = Item_Identifier_Str2, data = new_train) +
  geom_bar(color = "green") +
  theme(axis.text = element_text(angle = 0,color = "purple")) +
  coord_flip() +
  facet_wrap(~Outlet_Identifier, nrow = 2)

  • Observing the histogram of Item Outlet Sales for looking of sales distribution.
# Looking at Item_Outlet_Sales 
qplot(x = Item_Outlet_Sales, data = new_train, binwidth = 250) +
  geom_bar(color = "green")+
  theme(axis.text = element_text(angle = 90,color = "purple")) +
  scale_x_continuous(limits = c(0, 10000), breaks = seq(0, 10000, 500))

  • Observing the histogram of Item Outlet Sales with sqrt and log function to like distribution to normal.
# It is better now like normal distibution with sqrt and log10
qplot(x = sqrt(Item_Outlet_Sales), data = new_train, binwidth = 1,
      ylab = "Count Of Sales",
      xlab = "SQRT of Outlet Sales") +
  geom_bar(color = "green") +
  theme(axis.text.x = element_text(angle = 90,color = "purple"),
        axis.text.y = element_text(angle = 30,color = "tomato")) +
  scale_x_continuous(limits = c(0,100), breaks = seq(0,70,15)) +
  scale_y_continuous(limits = c(0,200), breaks = seq(0,200,100))

qplot(x = log10(Item_Outlet_Sales+1), data = new_train, binwidth = 0.01,
      ylab = "Count Of Sales",
      xlab = "LOG10 of Outlet Sales") +
  geom_bar(color = "green") +
  theme(axis.text.x = element_text(angle = 90, color = "purple"),
        axis.text.y = element_text(angle = 30, color = "tomato")) +
  scale_x_continuous(limits = c(1.5,4.5), breaks = seq(1.5, 4.5, 0.5)) +
  scale_y_continuous(limits = c(0,200), breaks = seq(0, 200, 100))

  • Observing the histogram of Item Outlet Sales according to Outlet Identifier to understand disttribution of sales in each shop.
qplot(x = Item_Outlet_Sales, data = new_train,binwidth = 250)+
  geom_bar(color = "green")+
  theme(axis.text.x = element_text(angle = 90,color = "purple"),
        axis.text.y = element_text(angle = 30,color = "tomato")) +
  scale_x_continuous(limits = c(0, 5000), breaks = seq(0, 5000, 500)) +
  scale_y_continuous(limits = c(0, 250), breaks = seq(0, 200, 100)) +
  facet_wrap(~Outlet_Identifier, nrow = 5)

  • Observing the histogram of Item Outlet Sales according to Outlet Size to understand relationship between sales and outlet size.
# Looking at Item_Outlet_Sales according to Outlet_Size
qplot(x = Item_Outlet_Sales, data = new_train,binwidth = 250) +
  geom_bar(color = "green") +
  theme(axis.text.x = element_text(angle = 90, color = "purple"),
        axis.text.y = element_text(angle = 30, color = "tomato")) +
  scale_x_continuous(limits = c(0, 5000), breaks = seq(0, 5000, 500)) +
  scale_y_continuous(limits = c(0, 250), breaks = seq(0, 200, 100)) +
  facet_wrap(~Outlet_Size, nrow = 4)

by(new_train$Item_Outlet_Sales, new_train$Outlet_Size, summary)
## new_train$Outlet_Size: 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   33.29  554.78 1443.45 1822.63 2681.51 9664.75 
## -------------------------------------------------------- 
## new_train$Outlet_Size: High
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    73.24  1072.60  2050.66  2298.99  3166.38 10256.65 
## -------------------------------------------------------- 
## new_train$Outlet_Size: Medium
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    69.24  1270.35  2251.07  2681.60  3691.20 13086.97 
## -------------------------------------------------------- 
## new_train$Outlet_Size: Small
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   33.96  601.05 1544.66 1912.15 2824.32 9779.94
  • We can observe different ways like line plot and box plot.
qplot(x = Item_Outlet_Sales, data = new_train, binwidth = 0.01,
      ylab = "Count Of Sales",
      xlab = "Log10 of Outlet Sales",
      geom = "freqpoly",
      color = Outlet_Size) +
  geom_bar(color = "green") +
  theme(axis.text.x = element_text(angle = 90, color = "purple"),
        axis.text.y = element_text(angle = 30, color = "tomato")) +
  scale_x_continuous(limits = c(1.5,4.5), breaks = seq(1.5,4.5,0.5)) +
  scale_y_continuous(limits = c(0,40), breaks = seq(0,40,10)) +
  scale_x_log10()

qplot(x = Outlet_Size, y = Item_Outlet_Sales,
      data = new_train,
      geom = "boxplot") +
  theme(axis.text.x = element_text(angle = 90, color = "purple"),
        axis.text.y = element_text(angle = 30, color = "tomato"))

3. Prepering Data and Modeling based on kernels

new_combi <- rbind(new_train, new_test)

glimpse(new_combi)
## Observations: 14,204
## Variables: 17
## $ Item_Identifier           <fctr> FDA15, DRC01, FDN15, FDX07, NCD19, ...
## $ Item_Weight               <dbl> 9.300, 5.920, 17.500, 19.200, 8.930,...
## $ Item_Fat_Content          <chr> "Low Fat", "Regular", "Low Fat", "Re...
## $ Item_Visibility           <dbl> 0.016047301, 0.019278216, 0.01676007...
## $ Item_Type                 <fctr> Dairy, Soft Drinks, Meat, Fruits an...
## $ Item_MRP                  <dbl> 249.8092, 48.2692, 141.6180, 182.095...
## $ Outlet_Identifier         <fctr> OUT049, OUT018, OUT049, OUT010, OUT...
## $ Outlet_Establishment_Year <int> 1999, 2009, 1999, 1998, 1987, 2009, ...
## $ Outlet_Size               <fctr> Medium, Medium, Medium, , High, Med...
## $ Outlet_Location_Type      <fctr> Tier 1, Tier 3, Tier 1, Tier 3, Tie...
## $ Outlet_Type               <fctr> Supermarket Type1, Supermarket Type...
## $ Item_Outlet_Sales         <dbl> 3735.1380, 443.4228, 2097.2700, 732....
## $ Item_Identifier_Str3      <chr> "FDA", "DRC", "FDN", "FDX", "NCD", "...
## $ Item_Identifier_Str2      <chr> "FD", "DR", "FD", "FD", "NC", "FD", ...
## $ Item_Identifier_Num       <dbl> 15, 1, 15, 7, 19, 36, 10, 10, 17, 28...
## $ Outlet_Age                <dbl> 14, 4, 14, 15, 26, 4, 26, 28, 11, 6,...
## $ PK                        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...
  • First we change our character data to numerical(1,0) data by spread columns
new_combi <- rbind(new_train, new_test)

new_combi$Item_Fat_Content <- ifelse(combi$Item_Fat_Content == "Regular",1,0)

library(dummies)

new_combi <- dummy.data.frame(new_combi, names = c('Outlet_Size','Outlet_Location_Type'
                                            ,'Outlet_Type','Item_Identifier_Str2'),sep = '_')

glimpse(new_combi)
## Observations: 14,204
## Variables: 27
## $ Item_Identifier                 <fctr> FDA15, DRC01, FDN15, FDX07, N...
## $ Item_Weight                     <dbl> 9.300, 5.920, 17.500, 19.200, ...
## $ Item_Fat_Content                <dbl> 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, ...
## $ Item_Visibility                 <dbl> 0.016047301, 0.019278216, 0.01...
## $ Item_Type                       <fctr> Dairy, Soft Drinks, Meat, Fru...
## $ Item_MRP                        <dbl> 249.8092, 48.2692, 141.6180, 1...
## $ Outlet_Identifier               <fctr> OUT049, OUT018, OUT049, OUT01...
## $ Outlet_Establishment_Year       <int> 1999, 2009, 1999, 1998, 1987, ...
## $ Outlet_Size_                    <int> 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, ...
## $ Outlet_Size_High                <int> 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, ...
## $ Outlet_Size_Medium              <int> 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, ...
## $ Outlet_Size_Small               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ `Outlet_Location_Type_Tier 1`   <int> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ `Outlet_Location_Type_Tier 2`   <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...
## $ `Outlet_Location_Type_Tier 3`   <int> 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, ...
## $ `Outlet_Type_Grocery Store`     <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...
## $ `Outlet_Type_Supermarket Type1` <int> 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, ...
## $ `Outlet_Type_Supermarket Type2` <int> 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ `Outlet_Type_Supermarket Type3` <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...
## $ Item_Outlet_Sales               <dbl> 3735.1380, 443.4228, 2097.2700...
## $ Item_Identifier_Str3            <chr> "FDA", "DRC", "FDN", "FDX", "N...
## $ Item_Identifier_Str2_DR         <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ Item_Identifier_Str2_FD         <int> 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, ...
## $ Item_Identifier_Str2_NC         <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
## $ Item_Identifier_Num             <dbl> 15, 1, 15, 7, 19, 36, 10, 10, ...
## $ Outlet_Age                      <dbl> 14, 4, 14, 15, 26, 4, 26, 28, ...
## $ PK                              <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,...
  • We delete our characters columns before modeling.
new_combi <- select(new_combi, -c(Item_Identifier, Outlet_Identifier, Item_Fat_Content,                                Outlet_Establishment_Year, Item_Type, Item_Identifier_Str3, PK))

str(new_combi)
## 'data.frame':    14204 obs. of  20 variables:
##  $ Item_Weight                  : num  9.3 5.92 17.5 19.2 8.93 ...
##  $ Item_Visibility              : num  0.016 0.0193 0.0168 0.054 0.054 ...
##  $ Item_MRP                     : num  249.8 48.3 141.6 182.1 53.9 ...
##  $ Outlet_Size_                 : int  0 0 0 1 0 0 0 0 1 1 ...
##  $ Outlet_Size_High             : int  0 0 0 0 1 0 1 0 0 0 ...
##  $ Outlet_Size_Medium           : int  1 1 1 0 0 1 0 1 0 0 ...
##  $ Outlet_Size_Small            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Outlet_Location_Type_Tier 1  : int  1 0 1 0 0 0 0 0 0 0 ...
##  $ Outlet_Location_Type_Tier 2  : int  0 0 0 0 0 0 0 0 1 1 ...
##  $ Outlet_Location_Type_Tier 3  : int  0 1 0 1 1 1 1 1 0 0 ...
##  $ Outlet_Type_Grocery Store    : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ Outlet_Type_Supermarket Type1: int  1 0 1 0 1 0 1 0 1 1 ...
##  $ Outlet_Type_Supermarket Type2: int  0 1 0 0 0 1 0 0 0 0 ...
##  $ Outlet_Type_Supermarket Type3: int  0 0 0 0 0 0 0 1 0 0 ...
##  $ Item_Outlet_Sales            : num  3735 443 2097 732 995 ...
##  $ Item_Identifier_Str2_DR      : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ Item_Identifier_Str2_FD      : int  1 0 1 1 0 1 1 1 1 1 ...
##  $ Item_Identifier_Str2_NC      : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ Item_Identifier_Num          : num  15 1 15 7 19 36 10 10 17 28 ...
##  $ Outlet_Age                   : num  14 4 14 15 26 4 26 28 11 6 ...
##  - attr(*, "dummies")=List of 4
##   ..$ Outlet_Size         : int  9 10 11 12
##   ..$ Outlet_Location_Type: int  13 14 15
##   ..$ Outlet_Type         : int  16 17 18 19
##   ..$ Item_Identifier_Str2: int  22 23 24
  • We divide our test and train data for modeling.
pred_train <- new_combi %>% 
  filter(Item_Outlet_Sales != -999)

pred_test <- new_combi %>% 
  filter(Item_Outlet_Sales == -999)
#dim(pred_train)
#dim(pred_test)
  • Linear Regression
linear_model <- lm(Item_Outlet_Sales ~ ., data = pred_train)
summary(linear_model)
## 
## Call:
## lm(formula = Item_Outlet_Sales ~ ., data = pred_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4277.2  -681.4   -85.4   568.6  7897.8 
## 
## Coefficients: (4 not defined because of singularities)
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      2348.3559   326.2603   7.198 6.64e-13 ***
## Item_Weight                        -0.9462     2.8983  -0.326  0.74407    
## Item_Visibility                  -221.0750   262.8463  -0.841  0.40033    
## Item_MRP                           15.5755     0.1965  79.257  < 2e-16 ***
## Outlet_Size_                     -142.6728    45.6008  -3.129  0.00176 ** 
## Outlet_Size_High                  694.3313   254.9895   2.723  0.00648 ** 
## Outlet_Size_Medium                 29.5461    56.3255   0.525  0.59990    
## Outlet_Size_Small                       NA         NA      NA       NA    
## `Outlet_Location_Type_Tier 1`     320.5095   154.5967   2.073  0.03818 *  
## `Outlet_Location_Type_Tier 2`     223.3423   100.3885   2.225  0.02612 *  
## `Outlet_Location_Type_Tier 3`           NA         NA      NA       NA    
## `Outlet_Type_Grocery Store`     -3630.9270   177.7976 -20.422  < 2e-16 ***
## `Outlet_Type_Supermarket Type1` -2152.1612   294.1229  -7.317 2.76e-13 ***
## `Outlet_Type_Supermarket Type2` -2549.1114   256.6856  -9.931  < 2e-16 ***
## `Outlet_Type_Supermarket Type3`         NA         NA      NA       NA    
## Item_Identifier_Str2_DR            14.9593    49.0122   0.305  0.76021    
## Item_Identifier_Str2_FD            43.8687    31.8015   1.379  0.16779    
## Item_Identifier_Str2_NC                 NA         NA      NA       NA    
## Item_Identifier_Num                 2.9993     0.7092   4.229 2.37e-05 ***
## Outlet_Age                        -34.3064    10.4716  -3.276  0.00106 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1127 on 8507 degrees of freedom
## Multiple R-squared:  0.5643, Adjusted R-squared:  0.5635 
## F-statistic: 734.5 on 15 and 8507 DF,  p-value: < 2.2e-16
  • Look at correlation
# cor(pred_train)
library("corrplot")
library(RColorBrewer)

M<-cor(pred_train)

corrplot(M, diag = FALSE, order = "FPC",
         tl.pos = "td", tl.cex = 0.5, method = "circle", type = "upper")

#create a new variable in test file 
test$Item_Outlet_Sales <- -999

#combine train and test data
combi <- rbind(train, test)

#impute missing value in Item_Weight
combi$Item_Weight[is.na(combi$Item_Weight)] <- median(combi$Item_Weight, na.rm = TRUE)

#impute 0 in item_visibility
combi$Item_Visibility <- ifelse(combi$Item_Visibility == 0, median(combi$Item_Visibility),                         combi$Item_Visibility)

#rename level in Outlet_Size
levels(combi$Outlet_Size)[1] <- "Other"

#rename levels of Item_Fat_Content
library(plyr)
combi$Item_Fat_Content <- revalue(combi$Item_Fat_Content,c("LF" = "Low Fat", "reg" = "Regular"))
combi$Item_Fat_Content <- revalue(combi$Item_Fat_Content, c("low fat" = "Low Fat"))

#create a new column 2013 - Year
combi$Year <- 2013 - combi$Outlet_Establishment_Year

#drop variables not required in modeling
library(dplyr)
combi <- select(combi, -c(Item_Identifier, Outlet_Identifier, Outlet_Establishment_Year))

#divide data set
new_train <- combi[1:nrow(train),]
new_test <- combi[-(1:nrow(train)),]

#linear regression
linear_model_simple <- lm(Item_Outlet_Sales ~ ., data = new_train)
summary(linear_model_simple)
## 
## Call:
## lm(formula = Item_Outlet_Sales ~ ., data = new_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4336.2  -680.8   -89.8   568.3  7946.1 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -977.43100  304.34139  -3.212 0.001325 ** 
## Item_Weight                      -0.51139    2.91490  -0.175 0.860738    
## Item_Fat_ContentRegular          40.50509   28.23186   1.435 0.151401    
## Item_Visibility                -255.77401  263.78199  -0.970 0.332253    
## Item_TypeBreads                   5.25222   84.04714   0.062 0.950173    
## Item_TypeBreakfast                6.26997  116.59546   0.054 0.957115    
## Item_TypeCanned                  25.28447   62.76864   0.403 0.687091    
## Item_TypeDairy                  -41.05177   62.22667  -0.660 0.509456    
## Item_TypeFrozen Foods           -28.00838   58.86986  -0.476 0.634252    
## Item_TypeFruits and Vegetables   29.84893   54.95872   0.543 0.587065    
## Item_TypeHard Drinks             -0.07955   90.18745  -0.001 0.999296    
## Item_TypeHealth and Hygiene     -10.09434   68.02370  -0.148 0.882035    
## Item_TypeHousehold              -39.12266   59.92886  -0.653 0.513891    
## Item_TypeMeat                    -0.35463   70.66270  -0.005 0.995996    
## Item_TypeOthers                 -21.27266   98.62482  -0.216 0.829232    
## Item_TypeSeafood                184.55373  148.00848   1.247 0.212464    
## Item_TypeSnack Foods            -11.48660   55.25066  -0.208 0.835312    
## Item_TypeSoft Drinks            -27.46544   70.16973  -0.391 0.695501    
## Item_TypeStarchy Foods           21.86315  103.04028   0.212 0.831971    
## Item_MRP                         15.56477    0.19769  78.733  < 2e-16 ***
## Outlet_SizeHigh                 849.13529  256.29910   3.313 0.000927 ***
## Outlet_SizeMedium               170.93952   71.08120   2.405 0.016200 *  
## Outlet_SizeSmall                144.05363   45.67140   3.154 0.001615 ** 
## Outlet_Location_TypeTier 2     -100.36636   90.23496  -1.112 0.266050    
## Outlet_Location_TypeTier 3     -326.31974  154.89294  -2.107 0.035169 *  
## Outlet_TypeSupermarket Type1   1473.06659  140.73922  10.467  < 2e-16 ***
## Outlet_TypeSupermarket Type2   1076.77568  136.08116   7.913 2.83e-15 ***
## Outlet_TypeSupermarket Type3   3639.29114  178.13923  20.429  < 2e-16 ***
## Year                            -34.82155   10.49271  -3.319 0.000908 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1129 on 8494 degrees of freedom
## Multiple R-squared:  0.5637, Adjusted R-squared:  0.5623 
## F-statistic:   392 on 28 and 8494 DF,  p-value: < 2.2e-16
  • Lets Visiulize simple model
par(mfrow = c(2,2))
plot(linear_model_simple)

linear_model_log <- lm(log(Item_Outlet_Sales) ~ ., data = new_train)
summary(linear_model)
## 
## Call:
## lm(formula = Item_Outlet_Sales ~ ., data = pred_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4277.2  -681.4   -85.4   568.6  7897.8 
## 
## Coefficients: (4 not defined because of singularities)
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      2348.3559   326.2603   7.198 6.64e-13 ***
## Item_Weight                        -0.9462     2.8983  -0.326  0.74407    
## Item_Visibility                  -221.0750   262.8463  -0.841  0.40033    
## Item_MRP                           15.5755     0.1965  79.257  < 2e-16 ***
## Outlet_Size_                     -142.6728    45.6008  -3.129  0.00176 ** 
## Outlet_Size_High                  694.3313   254.9895   2.723  0.00648 ** 
## Outlet_Size_Medium                 29.5461    56.3255   0.525  0.59990    
## Outlet_Size_Small                       NA         NA      NA       NA    
## `Outlet_Location_Type_Tier 1`     320.5095   154.5967   2.073  0.03818 *  
## `Outlet_Location_Type_Tier 2`     223.3423   100.3885   2.225  0.02612 *  
## `Outlet_Location_Type_Tier 3`           NA         NA      NA       NA    
## `Outlet_Type_Grocery Store`     -3630.9270   177.7976 -20.422  < 2e-16 ***
## `Outlet_Type_Supermarket Type1` -2152.1612   294.1229  -7.317 2.76e-13 ***
## `Outlet_Type_Supermarket Type2` -2549.1114   256.6856  -9.931  < 2e-16 ***
## `Outlet_Type_Supermarket Type3`         NA         NA      NA       NA    
## Item_Identifier_Str2_DR            14.9593    49.0122   0.305  0.76021    
## Item_Identifier_Str2_FD            43.8687    31.8015   1.379  0.16779    
## Item_Identifier_Str2_NC                 NA         NA      NA       NA    
## Item_Identifier_Num                 2.9993     0.7092   4.229 2.37e-05 ***
## Outlet_Age                        -34.3064    10.4716  -3.276  0.00106 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1127 on 8507 degrees of freedom
## Multiple R-squared:  0.5643, Adjusted R-squared:  0.5635 
## F-statistic: 734.5 on 15 and 8507 DF,  p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(linear_model_log)

library(rpart)
library(e1071)
library(rpart.plot)
library(caret)

#setting the tree control parameters
fitControl <- trainControl(method = "cv", number = 5)
cartGrid <- expand.grid(.cp=(1:50)*0.01)

#decision tree
tree_model <- train(Item_Outlet_Sales ~ ., data = new_train, method = "rpart", trControl = fitControl, tuneGrid = cartGrid)
#print(tree_model)

main_tree <- rpart(Item_Outlet_Sales ~ ., data = new_train, control = rpart.control(cp=0.01))
prp(main_tree)

  • Compare Kernels Model RMSE, Best model created with Decision Tree Algorithm.
library(Metrics)

rmse(new_train$Item_Outlet_Sales, linear_model$fitted.values)
## [1] 1126.362
rmse(new_train$Item_Outlet_Sales, linear_model_simple$fitted.values)
## [1] 1127.081
rmse(new_train$Item_Outlet_Sales, exp(linear_model_log$fitted.values))
## [1] 1140.004
pre_score <- predict(main_tree, type = "vector")
rmse(new_train$Item_Outlet_Sales, pre_score)
## [1] 1102.774

4. Creating own models based on datacamp courses

statistical-modeling-in-r-part-1

  • Create one lm and one rpart model, find case by case differences between predictions and real sales value and calculate mean.
lm_model_1 <- lm(Item_Outlet_Sales ~ ., data = new_train)
rpart_model_1 <- rpart(Item_Outlet_Sales ~ ., data = new_train)

lm_model_1_output <- predict(lm_model_1, newdata = new_train)
rpart_model_1_output <- predict(rpart_model_1, newdata = new_train)

# Find the case by case differences

lm_model_1_diff <- with(new_train, Item_Outlet_Sales - lm_model_1_output)
rpart_model_1_diff <- with(new_train, Item_Outlet_Sales - rpart_model_1_output)

# Calculate the mean square errors
mean(lm_model_1_diff)
## [1] 6.105024e-11
mean(rpart_model_1_diff)
## [1] 1.151509e-12
  • Generate a random TRUE or FALSE for each case in new_train. Create one lm and one rpart model with True datasets, Make predictions with FALSE datasets and find case by case differences between predictions and real sales value and calculate mean for two models.
(formula <- as.formula(Item_Outlet_Sales ~ Item_Weight + Item_Fat_Content + Item_Visibility + Item_Type + Item_MRP + Outlet_Size + 
  Outlet_Location_Type + Outlet_Type + Year))
## Item_Outlet_Sales ~ Item_Weight + Item_Fat_Content + Item_Visibility + 
##     Item_Type + Item_MRP + Outlet_Size + Outlet_Location_Type + 
##     Outlet_Type + Year
new_train$training_cases <- rnorm(nrow(new_train)) > 0

# summary(new_train$training_cases)

lm_model_1 <- lm(formula, data = subset(new_train, training_cases))
rpart_model_1 <- rpart(formula, data = subset(new_train, training_cases))

lm_preds <- predict(lm_model_1, newdata = subset(new_train, !training_cases))
rpart_preds <- predict(rpart_model_1, newdata = subset(new_train, !training_cases))

# Find the mean of case by case differences

with(subset(new_train, !training_cases), mean((Item_Outlet_Sales - lm_preds) ^ 2)) 
## [1] 1294078
with(subset(new_train, !training_cases), mean((Item_Outlet_Sales - rpart_preds) ^ 2))
## [1] 1250418