We take our data from analyticsvidhya.com. The dataset is the Big Mart Sales Practice Problem data; the competition is currently open and its closing date is 31 Dec 2017. We downloaded the training dataset as a CSV file.
View the structure of bigMart with the str function:
# Print the compact structure (column types and leading values) of bigMart.
utils::str(object = bigMart)
## 'data.frame': 8523 obs. of 12 variables:
## $ Item_Identifier : Factor w/ 1559 levels "DRA12","DRA24",..: 157 9 663 1122 1298 759 697 739 441 991 ...
## $ Item_Weight : num 9.3 5.92 17.5 19.2 8.93 ...
## $ Item_Fat_Content : Factor w/ 5 levels "LF","low fat",..: 3 5 3 5 3 5 5 3 5 5 ...
## $ Item_Visibility : num 0.016 0.0193 0.0168 0 0 ...
## $ Item_Type : Factor w/ 16 levels "Baking Goods",..: 5 15 11 7 10 1 14 14 6 6 ...
## $ Item_MRP : num 249.8 48.3 141.6 182.1 53.9 ...
## $ Outlet_Identifier : Factor w/ 10 levels "OUT010","OUT013",..: 10 4 10 1 2 4 2 6 8 3 ...
## $ Outlet_Establishment_Year: int 1999 2009 1999 1998 1987 2009 1987 1985 2002 2007 ...
## $ Outlet_Size : Factor w/ 4 levels "","High","Medium",..: 3 3 3 1 2 3 2 3 1 1 ...
## $ Outlet_Location_Type : Factor w/ 3 levels "Tier 1","Tier 2",..: 1 3 1 3 3 3 3 3 2 2 ...
## $ Outlet_Type : Factor w/ 4 levels "Grocery Store",..: 2 3 2 1 2 3 2 4 2 2 ...
## $ Item_Outlet_Sales : num 3735 443 2097 732 995 ...
# Transposed preview of bigMart: one line per column with its type and values.
bigMart %>%
  glimpse()
## Observations: 8,523
## Variables: 12
## $ Item_Identifier <fctr> FDA15, DRC01, FDN15, FDX07, NCD19, ...
## $ Item_Weight <dbl> 9.300, 5.920, 17.500, 19.200, 8.930,...
## $ Item_Fat_Content <fctr> Low Fat, Regular, Low Fat, Regular,...
## $ Item_Visibility <dbl> 0.016047301, 0.019278216, 0.01676007...
## $ Item_Type <fctr> Dairy, Soft Drinks, Meat, Fruits an...
## $ Item_MRP <dbl> 249.8092, 48.2692, 141.6180, 182.095...
## $ Outlet_Identifier <fctr> OUT049, OUT018, OUT049, OUT010, OUT...
## $ Outlet_Establishment_Year <int> 1999, 2009, 1999, 1998, 1987, 2009, ...
## $ Outlet_Size <fctr> Medium, Medium, Medium, , High, Med...
## $ Outlet_Location_Type <fctr> Tier 1, Tier 3, Tier 1, Tier 3, Tie...
## $ Outlet_Type <fctr> Supermarket Type1, Supermarket Type...
## $ Item_Outlet_Sales <dbl> 3735.1380, 443.4228, 2097.2700, 732....
Our dataset has 12 columns and 8523 rows. Let us look at all of the columns, starting with the factor-type columns.
Item_Identifier : Unique Product ID
# Number of distinct product identifiers in bigMart.
# The summary expression is left unnamed so the result column keeps the
# auto-generated name `n_distinct(Item_Identifier)`.
summarise(bigMart, n_distinct(Item_Identifier))
## n_distinct(Item_Identifier)
## 1 1559
# Frequency table for Item_Fat_Content: row count per level and its share
# (in percent, 2 decimals) of all rows in bigMart, most frequent level first.
bigMart %>%
  group_by(Item_Fat_Content) %>%
  summarise(
    Count = n(),
    Perc = round(Count / nrow(bigMart) * 100, 2)
  ) %>%
  arrange(desc(Count))
## # A tibble: 5 x 3
## Item_Fat_Content Count Perc
## <fctr> <int> <dbl>
## 1 Low Fat 5089 59.71
## 2 Regular 2889 33.90
## 3 LF 316 3.71
## 4 reg 117 1.37
## 5 low fat 112 1.31
# Frequency table for Item_Type: row count per level and its share
# (in percent, 2 decimals) of all rows in bigMart, most frequent level first.
bigMart %>%
  group_by(Item_Type) %>%
  summarise(
    Count = n(),
    Perc = round(Count / nrow(bigMart) * 100, 2)
  ) %>%
  arrange(desc(Count))
## # A tibble: 16 x 3
## Item_Type Count Perc
## <fctr> <int> <dbl>
## 1 Fruits and Vegetables 1232 14.46
## 2 Snack Foods 1200 14.08
## 3 Household 910 10.68
## 4 Frozen Foods 856 10.04
## 5 Dairy 682 8.00
## 6 Canned 649 7.61
## 7 Baking Goods 648 7.60
## 8 Health and Hygiene 520 6.10
## 9 Soft Drinks 445 5.22
## 10 Meat 425 4.99
## 11 Breads 251 2.94
## 12 Hard Drinks 214 2.51
## 13 Others 169 1.98
## 14 Starchy Foods 148 1.74
## 15 Breakfast 110 1.29
## 16 Seafood 64 0.75
# Frequency table for Outlet_Identifier: row count per outlet and its share
# (in percent, 2 decimals) of all rows in bigMart, most frequent outlet first.
bigMart %>%
  group_by(Outlet_Identifier) %>%
  summarise(
    Count = n(),
    Perc = round(Count / nrow(bigMart) * 100, 2)
  ) %>%
  arrange(desc(Count))
## # A tibble: 10 x 3
## Outlet_Identifier Count Perc
## <fctr> <int> <dbl>
## 1 OUT027 935 10.97
## 2 OUT013 932 10.94
## 3 OUT035 930 10.91
## 4 OUT046 930 10.91
## 5 OUT049 930 10.91
## 6 OUT045 929 10.90
## 7 OUT018 928 10.89
## 8 OUT017 926 10.86
## 9 OUT010 555 6.51
## 10 OUT019 528 6.20
# Frequency table for Outlet_Size: row count per level and its share
# (in percent, 2 decimals) of all rows in bigMart, most frequent level first.
# Note: the empty-string level is counted as its own group, not as NA.
bigMart %>%
  group_by(Outlet_Size) %>%
  summarise(
    Count = n(),
    Perc = round(Count / nrow(bigMart) * 100, 2)
  ) %>%
  arrange(desc(Count))
## # A tibble: 4 x 3
## Outlet_Size Count Perc
## <fctr> <int> <dbl>
## 1 Medium 2793 32.77
## 2 2410 28.28
## 3 Small 2388 28.02
## 4 High 932 10.94
# Frequency table for Outlet_Location_Type: row count per tier and its share
# (in percent, 2 decimals) of all rows in bigMart, most frequent tier first.
bigMart %>%
  group_by(Outlet_Location_Type) %>%
  summarise(
    Count = n(),
    Perc = round(Count / nrow(bigMart) * 100, 2)
  ) %>%
  arrange(desc(Count))
## # A tibble: 3 x 3
## Outlet_Location_Type Count Perc
## <fctr> <int> <dbl>
## 1 Tier 3 3350 39.31
## 2 Tier 2 2785 32.68
## 3 Tier 1 2388 28.02
# Frequency table for Outlet_Type: row count per level and its share
# (in percent, 2 decimals) of all rows in bigMart, most frequent level first.
bigMart %>%
  group_by(Outlet_Type) %>%
  summarise(
    Count = n(),
    Perc = round(Count / nrow(bigMart) * 100, 2)
  ) %>%
  arrange(desc(Count))
## # A tibble: 4 x 3
## Outlet_Type Count Perc
## <fctr> <int> <dbl>
## 1 Supermarket Type1 5577 65.43
## 2 Grocery Store 1083 12.71
## 3 Supermarket Type3 935 10.97
## 4 Supermarket Type2 928 10.89
# Count missing (NA) versus present values of Item_Weight.
# is.na() already yields a logical vector, so summing it counts the TRUEs.
bigMart %>%
  summarise(
    is_NULL = sum(is.na(Item_Weight)),
    is_NOT_NULL = sum(!is.na(Item_Weight))
  )
## is_NULL is_NOT_NULL
## 1 1463 7060
# Descriptive statistics for Item_Weight, computed on non-missing rows only:
# extremes, mean, median, first/third quartile and interquartile range.
bigMart %>%
  filter(!is.na(Item_Weight)) %>%
  summarise(
    Max    = max(Item_Weight),
    Min    = min(Item_Weight),
    Mean   = mean(Item_Weight),
    Median = median(Item_Weight),
    QUA1   = quantile(Item_Weight, probs = 1 / 4),
    QUA3   = quantile(Item_Weight, probs = 3 / 4),
    IQR    = IQR(Item_Weight)
  )
## Max Min Mean Median QUA1 QUA3 IQR
## 1 21.35 4.555 12.85765 12.6 8.77375 16.85 8.07625
# Count missing (NA) versus present values of Item_Visibility.
# is.na() already yields a logical vector, so summing it counts the TRUEs.
bigMart %>%
  summarise(
    is_NULL = sum(is.na(Item_Visibility)),
    is_NOT_NULL = sum(!is.na(Item_Visibility))
  )
## is_NULL is_NOT_NULL
## 1 0 8523
# Descriptive statistics for Item_Visibility, computed on non-missing rows only:
# extremes, mean, median, first/third quartile and interquartile range.
bigMart %>%
  filter(!is.na(Item_Visibility)) %>%
  summarise(
    Max    = max(Item_Visibility),
    Min    = min(Item_Visibility),
    Mean   = mean(Item_Visibility),
    Median = median(Item_Visibility),
    QUA1   = quantile(Item_Visibility, probs = 1 / 4),
    QUA3   = quantile(Item_Visibility, probs = 3 / 4),
    IQR    = IQR(Item_Visibility)
  )
## Max Min Mean Median QUA1 QUA3 IQR
## 1 0.3283909 0 0.06613203 0.05393093 0.02698948 0.09458529 0.06759582
# Count missing (NA) versus present values of Item_MRP.
# is.na() already yields a logical vector, so summing it counts the TRUEs.
bigMart %>%
  summarise(
    is_NULL = sum(is.na(Item_MRP)),
    is_NOT_NULL = sum(!is.na(Item_MRP))
  )
## is_NULL is_NOT_NULL
## 1 0 8523
# Descriptive statistics for Item_MRP, computed on non-missing rows only:
# extremes, mean, median, first/third quartile and interquartile range.
bigMart %>%
  filter(!is.na(Item_MRP)) %>%
  summarise(
    Max    = max(Item_MRP),
    Min    = min(Item_MRP),
    Mean   = mean(Item_MRP),
    Median = median(Item_MRP),
    QUA1   = quantile(Item_MRP, probs = 1 / 4),
    QUA3   = quantile(Item_MRP, probs = 3 / 4),
    IQR    = IQR(Item_MRP)
  )
## Max Min Mean Median QUA1 QUA3 IQR
## 1 266.8884 31.29 140.9928 143.0128 93.8265 185.6437 91.8172
# Count missing (NA) versus present values of Outlet_Establishment_Year.
# is.na() already yields a logical vector, so summing it counts the TRUEs.
bigMart %>%
  summarise(
    is_NULL = sum(is.na(Outlet_Establishment_Year)),
    is_NOT_NULL = sum(!is.na(Outlet_Establishment_Year))
  )
## is_NULL is_NOT_NULL
## 1 0 8523
# Descriptive statistics for Outlet_Establishment_Year, computed on
# non-missing rows only: extremes, mean, median, first/third quartile and
# interquartile range.
bigMart %>%
  filter(!is.na(Outlet_Establishment_Year)) %>%
  summarise(
    Max    = max(Outlet_Establishment_Year),
    Min    = min(Outlet_Establishment_Year),
    Mean   = mean(Outlet_Establishment_Year),
    Median = median(Outlet_Establishment_Year),
    QUA1   = quantile(Outlet_Establishment_Year, probs = 1 / 4),
    QUA3   = quantile(Outlet_Establishment_Year, probs = 3 / 4),
    IQR    = IQR(Outlet_Establishment_Year)
  )
## Max Min Mean Median QUA1 QUA3 IQR
## 1 2009 1985 1997.832 1999 1987 2004 17
# Count missing (NA) versus present values of Item_Outlet_Sales.
# is.na() already yields a logical vector, so summing it counts the TRUEs.
bigMart %>%
  summarise(
    is_NULL = sum(is.na(Item_Outlet_Sales)),
    is_NOT_NULL = sum(!is.na(Item_Outlet_Sales))
  )
## is_NULL is_NOT_NULL
## 1 0 8523
# Descriptive statistics for Item_Outlet_Sales, computed on non-missing rows
# only: extremes, mean, median, first/third quartile and interquartile range.
bigMart %>%
  filter(!is.na(Item_Outlet_Sales)) %>%
  summarise(
    Max    = max(Item_Outlet_Sales),
    Min    = min(Item_Outlet_Sales),
    Mean   = mean(Item_Outlet_Sales),
    Median = median(Item_Outlet_Sales),
    QUA1   = quantile(Item_Outlet_Sales, probs = 1 / 4),
    QUA3   = quantile(Item_Outlet_Sales, probs = 3 / 4),
    IQR    = IQR(Item_Outlet_Sales)
  )
## Max Min Mean Median QUA1 QUA3 IQR
## 1 13086.96 33.29 2181.289 1794.331 834.2474 3101.296 2267.049
# Per-column summary of bigMart: level counts for factors, five-number
# summary plus mean (and NA count, where present) for numeric columns.
summary(object = bigMart)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## FDG33 : 10 Min. : 4.555 LF : 316 Min. :0.00000
## FDW13 : 10 1st Qu.: 8.774 low fat: 112 1st Qu.:0.02699
## DRE49 : 9 Median :12.600 Low Fat:5089 Median :0.05393
## DRN47 : 9 Mean :12.858 reg : 117 Mean :0.06613
## FDD38 : 9 3rd Qu.:16.850 Regular:2889 3rd Qu.:0.09459
## FDF52 : 9 Max. :21.350 Max. :0.32839
## (Other):8467 NA's :1463
## Item_Type Item_MRP Outlet_Identifier
## Fruits and Vegetables:1232 Min. : 31.29 OUT027 : 935
## Snack Foods :1200 1st Qu.: 93.83 OUT013 : 932
## Household : 910 Median :143.01 OUT035 : 930
## Frozen Foods : 856 Mean :140.99 OUT046 : 930
## Dairy : 682 3rd Qu.:185.64 OUT049 : 930
## Canned : 649 Max. :266.89 OUT045 : 929
## (Other) :2994 (Other):2937
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## Min. :1985 :2410 Tier 1:2388
## 1st Qu.:1987 High : 932 Tier 2:2785
## Median :1999 Medium:2793 Tier 3:3350
## Mean :1998 Small :2388
## 3rd Qu.:2004
## Max. :2009
##
## Outlet_Type Item_Outlet_Sales
## Grocery Store :1083 Min. : 33.29
## Supermarket Type1:5577 1st Qu.: 834.25
## Supermarket Type2: 928 Median : 1794.33
## Supermarket Type3: 935 Mean : 2181.29
## 3rd Qu.: 3101.30
## Max. :13086.97
##