# Get Used to Big Mart Sales Data

# Switch to the folder that holds the Big Mart CSV files, then load the
# training set into `bigMart`.
# NOTE(review): the absolute, user-specific setwd() path only resolves on the
# original author's machine; consider a project-relative path instead.
setwd(dir = "C:/Users/yetkinEser/Desktop/R/datas")
bigMart <- read.csv(file = "bigMartTrain.csv")


library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(ggplot2)

# Derive helper columns from the raw Big Mart data:
#   Item_Identifier_Str     - first three characters of the item identifier
#   Item_Identifier_Num     - characters 4-6 of the identifier, as a number
#   Outlet_Age              - 2013 minus the outlet's establishment year
#   Item_Fat_Fixed          - fat-content label with the "LF", "reg" and
#                             "low fat" spellings rewritten
#   Item_outlet_Sales_Count - outlet sales divided by the item's MRP
#   PK                      - running row number
bigMart2 <- bigMart %>%
  mutate(
    Item_Identifier_Str = substr(Item_Identifier, 1, 3),
    Item_Identifier_Num = as.numeric(substr(Item_Identifier, 4, 6)),
    Outlet_Age = 2013 - Outlet_Establishment_Year,
    Item_Fat_Fixed = Item_Fat_Content %>%
      str_replace("LF", "Low Fat") %>%
      str_replace("reg", "Regular") %>%
      str_replace("low fat", "Low Fat"),
    Item_outlet_Sales_Count = Item_Outlet_Sales / Item_MRP,
    PK = row_number()
  )

# List the columns available on the enriched data frame.
colnames(bigMart2)
##  [1] "Item_Identifier"           "Item_Weight"              
##  [3] "Item_Fat_Content"          "Item_Visibility"          
##  [5] "Item_Type"                 "Item_MRP"                 
##  [7] "Outlet_Identifier"         "Outlet_Establishment_Year"
##  [9] "Outlet_Size"               "Outlet_Location_Type"     
## [11] "Outlet_Type"               "Item_Outlet_Sales"        
## [13] "Item_Identifier_Str"       "Item_Identifier_Num"      
## [15] "Outlet_Age"                "Item_Fat_Fixed"           
## [17] "Item_outlet_Sales_Count"   "PK"
# Row counts per raw fat-content label.
ggplot(bigMart2, aes(x = Item_Fat_Content)) +
  geom_bar()

# Row counts per label after the spelling clean-up.
ggplot(bigMart2, aes(x = Item_Fat_Fixed)) +
  geom_bar()

# Looking at Item Type
# Each chart draws the row counts per Item_Type with exactly one geom_bar()
# layer (green outline) and flips the axes so the type names stay readable.
ggplot(bigMart2, aes(x = Item_Type)) +
  geom_bar(color = "green") +
  theme(axis.text = element_text(angle = 0, color = "purple")) +
  coord_flip()

# Looking at Item Type with facet wrap according to Outlet Identifier
ggplot(bigMart2, aes(x = Item_Type)) +
  geom_bar(color = "green") +
  theme(axis.text = element_text(angle = 0, color = "purple")) +
  coord_flip() +
  facet_wrap(~Outlet_Identifier, nrow = 2)

# Looking at Item Type with facet wrap according to Item_Identifier_Num
ggplot(bigMart2, aes(x = Item_Type)) +
  geom_bar(color = "green") +
  theme(axis.text = element_text(angle = 0, color = "purple")) +
  coord_flip() +
  facet_wrap(~Item_Identifier_Num, nrow = 2)

# Looking at Item Type with facet wrap according to Item_Identifier_Str
ggplot(bigMart2, aes(x = Item_Type)) +
  geom_bar(color = "green") +
  theme(axis.text = element_text(angle = 90, color = "purple")) +
  coord_flip() +
  facet_wrap(~Item_Identifier_Str, nrow = 2)

# Looking at Item_Outlet_Sales
# Histogram with 250-wide bins and green bar outlines. Item_Outlet_Sales is
# continuous, so it is binned with geom_histogram() only; a geom_bar() layer
# would count every distinct sales value separately.
# The x limits keep 0-10000; rows outside are dropped before binning.
ggplot(bigMart2, aes(x = Item_Outlet_Sales)) +
  geom_histogram(binwidth = 250, color = "green") +
  theme(axis.text = element_text(angle = 90, color = "purple")) +
  scale_x_continuous(limits = c(0, 10000), breaks = seq(0, 10000, 500))
## Warning: Removed 8 rows containing non-finite values (stat_bin).

# Looking at Item_Outlet_Sales according to Outlet_Identifier
# One 250-wide-bin histogram per outlet, restricted to sales of 0-5000.
# The y axis is zoomed with coord_cartesian() rather than scale limits, so
# bars taller than 250 are clipped at the panel edge instead of being removed.
ggplot(bigMart2, aes(x = Item_Outlet_Sales)) +
  geom_histogram(binwidth = 250, color = "green") +
  theme(axis.text.x = element_text(angle = 90, color = "purple"),
        axis.text.y = element_text(angle = 30, color = "tomato")) +
  scale_x_continuous(limits = c(0, 5000), breaks = seq(0, 5000, 500)) +
  scale_y_continuous(breaks = seq(0, 200, 100)) +
  coord_cartesian(ylim = c(0, 250)) +
  facet_wrap(~Outlet_Identifier, nrow = 5)
## Warning: Removed 629 rows containing non-finite values (stat_bin).

# Looking at Item_Outlet_Sales according to Outlet_Size
# One 250-wide-bin histogram per Outlet_Size level (rows with an empty-string
# size get their own unlabeled panel), restricted to sales of 0-5000.
# The y axis is zoomed with coord_cartesian() rather than scale limits, so
# bars taller than 250 are clipped at the panel edge instead of being removed.
ggplot(bigMart2, aes(x = Item_Outlet_Sales)) +
  geom_histogram(binwidth = 250, color = "green") +
  theme(axis.text.x = element_text(angle = 90, color = "purple"),
        axis.text.y = element_text(angle = 30, color = "tomato")) +
  scale_x_continuous(limits = c(0, 5000), breaks = seq(0, 5000, 500)) +
  scale_y_continuous(breaks = seq(0, 200, 100)) +
  coord_cartesian(ylim = c(0, 250)) +
  facet_wrap(~Outlet_Size, nrow = 4)
## Warning: Removed 629 rows containing non-finite values (stat_bin).

# Looking at Item_Outlet_Sales according to Outlet_Size, leaving out outlets
# whose size is missing.
# Missing sizes are stored as empty strings in this data set, so the filter
# tests for "" as well as NA; is.na() alone would keep every row.
# The y axis is zoomed with coord_cartesian() rather than scale limits, so
# bars taller than 250 are clipped at the panel edge instead of being removed.
ggplot(subset(bigMart2, !is.na(Outlet_Size) & Outlet_Size != ""),
       aes(x = Item_Outlet_Sales)) +
  geom_histogram(binwidth = 250, color = "green") +
  theme(axis.text.x = element_text(angle = 90, color = "purple"),
        axis.text.y = element_text(angle = 30, color = "tomato")) +
  scale_x_continuous(limits = c(0, 5000), breaks = seq(0, 5000, 500)) +
  scale_y_continuous(breaks = seq(0, 200, 100)) +
  coord_cartesian(ylim = c(0, 250)) +
  facet_wrap(~Outlet_Size, nrow = 4)

# Row counts per Outlet_Size level. The unlabeled first column appears to be
# the empty-string level, i.e. rows whose outlet size is not recorded.
table(bigMart2$Outlet_Size)
## 
##          High Medium  Small 
##   2410    932   2793   2388
# summary() of Item_Outlet_Sales computed separately for each Outlet_Size level.
by(bigMart2$Item_Outlet_Sales,bigMart2$Outlet_Size,summary)
## bigMart2$Outlet_Size: 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   33.29  554.78 1443.45 1822.63 2681.51 9664.75 
## -------------------------------------------------------- 
## bigMart2$Outlet_Size: High
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    73.24  1072.60  2050.66  2298.99  3166.38 10256.65 
## -------------------------------------------------------- 
## bigMart2$Outlet_Size: Medium
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    69.24  1270.35  2251.07  2681.60  3691.20 13086.97 
## -------------------------------------------------------- 
## bigMart2$Outlet_Size: Small
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   33.96  601.05 1544.66 1912.15 2824.32 9779.94
# Looking at Item_Outlet_Sales according to Outlet_Size, leaving out outlets
# whose size is missing, and giving the y axis a label.
# Missing sizes are stored as empty strings in this data set, so the filter
# tests for "" as well as NA; is.na() alone would keep every row.
# The y axis is zoomed with coord_cartesian() rather than scale limits, so
# bars taller than 250 are clipped at the panel edge instead of being removed.
ggplot(subset(bigMart2, !is.na(Outlet_Size) & Outlet_Size != ""),
       aes(x = Item_Outlet_Sales)) +
  geom_histogram(binwidth = 250, color = "green") +
  ylab("Count Of Sales") +
  theme(axis.text.x = element_text(angle = 90, color = "purple"),
        axis.text.y = element_text(angle = 30, color = "tomato")) +
  scale_x_continuous(limits = c(0, 5000), breaks = seq(0, 5000, 500)) +
  scale_y_continuous(breaks = seq(0, 200, 100)) +
  coord_cartesian(ylim = c(0, 250)) +
  facet_wrap(~Outlet_Size, nrow = 4)

# Item_Type counts as horizontal bars with a black outline and a fixed fill
# colour; the count axis is labelled "Count Of Sales".
ggplot(bigMart2, aes(x = Item_Type)) +
  geom_bar(color = "black", fill = "#5760AB") +
  ylab("Count Of Sales") +
  theme(axis.text = element_text(angle = 0, color = "purple")) +
  coord_flip()

# Look at the summary in different ways: raw values, log10(x + 1) and
# square root of Item_Outlet_Sales.

bigMart$Item_Outlet_Sales %>% summary()
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    33.29   834.25  1794.33  2181.29  3101.30 13086.97
(bigMart$Item_Outlet_Sales + 1) %>% log10() %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.535   2.922   3.254   3.169   3.492   4.117
bigMart$Item_Outlet_Sales %>% sqrt() %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.77   28.88   42.36   42.94   55.69  114.40
# Histograms of Item_Outlet_Sales on the raw, square-root and log10 scales.
# Every row of bigMart2 is plotted. (An `!is.na(Outlet_Size)` filter would not
# remove anything here: missing sizes are stored as empty strings, not NA.)
# Each plot uses a single geom_histogram() layer with green outlines, and the
# y axis is zoomed with coord_cartesian() so tall bars are clipped, not dropped.

# Raw scale, 50-wide bins, restricted to sales of 0-5000
ggplot(bigMart2, aes(x = Item_Outlet_Sales)) +
  geom_histogram(binwidth = 50, color = "green") +
  ylab("Count Of Sales") +
  theme(axis.text.x = element_text(angle = 90, color = "purple"),
        axis.text.y = element_text(angle = 30, color = "tomato")) +
  scale_x_continuous(limits = c(0, 5000), breaks = seq(0, 5000, 250)) +
  scale_y_continuous(breaks = seq(0, 200, 100)) +
  coord_cartesian(ylim = c(0, 250))
## Warning: Removed 629 rows containing non-finite values (stat_bin).

# It looks closer to a normal distribution with sqrt and log10
ggplot(bigMart2, aes(x = sqrt(Item_Outlet_Sales))) +
  geom_histogram(binwidth = 1, color = "green") +
  labs(x = "SQRT of Outlet Sales", y = "Count Of Sales") +
  theme(axis.text.x = element_text(angle = 90, color = "purple"),
        axis.text.y = element_text(angle = 30, color = "tomato")) +
  scale_x_continuous(limits = c(0, 100), breaks = seq(0, 70, 15)) +
  scale_y_continuous(breaks = seq(0, 200, 100)) +
  coord_cartesian(ylim = c(0, 200))
## Warning: Removed 8 rows containing non-finite values (stat_bin).

ggplot(bigMart2, aes(x = log10(Item_Outlet_Sales + 1))) +
  geom_histogram(binwidth = 0.01, color = "green") +
  labs(x = "LOG10 of Outlet Sales", y = "Count Of Sales") +
  theme(axis.text.x = element_text(angle = 90, color = "purple"),
        axis.text.y = element_text(angle = 30, color = "tomato")) +
  scale_x_continuous(limits = c(1.5, 4.5), breaks = seq(1.5, 4.5, 0.5)) +
  scale_y_continuous(breaks = seq(0, 200, 100)) +
  coord_cartesian(ylim = c(0, 200))

# It is better with scale_x_log10(): the raw sales values are placed on a
# log10 x axis, so `binwidth` is expressed in log10 units.
# Only one x scale is added; a plot keeps a single scale per aesthetic, so a
# scale_x_continuous() alongside scale_x_log10() would simply be replaced.
# Every row of bigMart2 is plotted (missing outlet sizes are "" rather than
# NA, so an is.na() filter on Outlet_Size would not remove anything).
ggplot(bigMart2, aes(x = Item_Outlet_Sales)) +
  geom_histogram(binwidth = 0.01, color = "green") +
  labs(x = "LOG10 of Outlet Sales", y = "Count Of Sales") +
  theme(axis.text.x = element_text(angle = 90, color = "purple"),
        axis.text.y = element_text(angle = 30, color = "tomato")) +
  scale_x_log10() +
  scale_y_continuous(breaks = seq(0, 200, 100)) +
  coord_cartesian(ylim = c(0, 200))

# Frequency polygons on a log10 x axis, one coloured line per Outlet_Size.
# Rows with a missing size are left out; missing sizes are stored as empty
# strings in this data set, so the filter tests for "" as well as NA.
# `binwidth` is in log10 units because the x scale is logarithmic, and the
# y axis is zoomed with coord_cartesian() so lines are clipped, not broken.
ggplot(subset(bigMart2, !is.na(Outlet_Size) & Outlet_Size != ""),
       aes(x = Item_Outlet_Sales, color = Outlet_Size)) +
  geom_freqpoly(binwidth = 0.01) +
  labs(x = "LOG10 of Outlet Sales", y = "Count Of Sales") +
  theme(axis.text.x = element_text(angle = 90, color = "purple"),
        axis.text.y = element_text(angle = 30, color = "tomato")) +
  scale_x_log10() +
  scale_y_continuous(breaks = seq(0, 40, 10)) +
  coord_cartesian(ylim = c(0, 40))

# First box plot: Item_Outlet_Sales per Outlet_Size, skipping rows whose
# size is the empty string.

ggplot(subset(bigMart2, !(Outlet_Size == "")),
       aes(x = Outlet_Size, y = Item_Outlet_Sales)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 90, color = "purple"),
        axis.text.y = element_text(angle = 30, color = "tomato"))

# Better: same boxes, zoomed to 0-5000 on the y axis with coord_cartesian()
ggplot(subset(bigMart2, !(Outlet_Size == "")),
       aes(x = Outlet_Size, y = Item_Outlet_Sales)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 90, color = "purple"),
        axis.text.y = element_text(angle = 30, color = "tomato")) +
  coord_cartesian(ylim = c(0, 5000))

library(knitr)

# summary() of Item_Outlet_Sales computed separately for each Outlet_Size
# level; the group with the blank label holds the rows with an empty-string size.
by(bigMart2$Item_Outlet_Sales,bigMart2$Outlet_Size,summary)
## bigMart2$Outlet_Size: 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   33.29  554.78 1443.45 1822.63 2681.51 9664.75 
## -------------------------------------------------------- 
## bigMart2$Outlet_Size: High
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    73.24  1072.60  2050.66  2298.99  3166.38 10256.65 
## -------------------------------------------------------- 
## bigMart2$Outlet_Size: Medium
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    69.24  1270.35  2251.07  2681.60  3691.20 13086.97 
## -------------------------------------------------------- 
## bigMart2$Outlet_Size: Small
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   33.96  601.05 1544.66 1912.15 2824.32 9779.94
## Udacity - Explore Two Variables

# Quick scatter plots with qplot(): weight vs visibility, then visibility vs
# outlet sales.
qplot(Item_Weight, Item_Visibility, data = bigMart2)
## Warning: Removed 1463 rows containing missing values (geom_point).

qplot(Item_Visibility, Item_Outlet_Sales, data = bigMart2)

# The same visibility/sales scatter written with the full ggplot() syntax

ggplot(bigMart2, aes(x = Item_Visibility, y = Item_Outlet_Sales)) +
  geom_point()

# Restrict both axes; points outside the limits are dropped
ggplot(bigMart2, aes(x = Item_Visibility, y = Item_Outlet_Sales)) +
  geom_point() +
  xlim(0, 0.2) +
  ylim(0, 10000)
## Warning: Removed 142 rows containing missing values (geom_point).

# Switch the y variable to sales divided by MRP
ggplot(bigMart2, aes(x = Item_Visibility, y = Item_outlet_Sales_Count)) +
  geom_point() +
  xlim(0, 0.2) +
  ylim(0, 50)
## Warning: Removed 144 rows containing missing values (geom_point).

# Jitter the points and make them translucent to reduce overplotting
ggplot(bigMart2, aes(x = Item_Visibility, y = Item_outlet_Sales_Count)) +
  geom_jitter(alpha = 1 / 8) +
  xlim(0, 0.2) +
  ylim(0, 50)
## Warning: Removed 412 rows containing missing values (geom_point).

# Replace the hard y limits with a square-root transformed y axis
ggplot(bigMart2, aes(x = Item_Visibility, y = Item_outlet_Sales_Count)) +
  geom_jitter(alpha = 1 / 8) +
  xlim(0, 0.2) +
  coord_trans(y = "sqrt")
## Warning: Removed 387 rows containing missing values (geom_point).

# Jitter horizontally only (height = 0) on the square-root y axis
ggplot(bigMart2, aes(x = Item_Visibility, y = Item_outlet_Sales_Count)) +
  geom_point(alpha = 1 / 8, position = position_jitter(height = 0)) +
  xlim(0, 0.2) +
  coord_trans(y = "sqrt")
## Warning: Removed 400 rows containing missing values (geom_point).

# Sales summary per outlet: one row for each combination of the grouping
# columns, holding the mean, median, total and row count of Item_Outlet_Sales,
# sorted from the highest median downwards.
bigMart2.group_by_Outlet <- bigMart2 %>%
  group_by(Outlet_Identifier,
           Outlet_Establishment_Year,
           Outlet_Age,
           Outlet_Location_Type,
           Outlet_Size,
           Outlet_Type) %>%
  summarise(mean_IOS = mean(Item_Outlet_Sales),
            median_IOS = median(Item_Outlet_Sales),
            sum_IOS = sum(Item_Outlet_Sales),
            count_IOS = n()) %>%
  # summarise() leaves the result grouped by the remaining keys; drop that
  # grouping so later verbs operate on the whole table.
  ungroup() %>%
  arrange(desc(median_IOS))

head(bigMart2.group_by_Outlet, 15)
## # A tibble: 10 x 10
##    Outlet_Identifier Outlet_Establishment_Year Outlet_Age
##               <fctr>                     <int>      <dbl>
##  1            OUT027                      1985         28
##  2            OUT035                      2004          9
##  3            OUT013                      1987         26
##  4            OUT017                      2007          6
##  5            OUT049                      1999         14
##  6            OUT046                      1997         16
##  7            OUT045                      2002         11
##  8            OUT018                      2009          4
##  9            OUT019                      1985         28
## 10            OUT010                      1998         15
## # ... with 7 more variables: Outlet_Location_Type <fctr>,
## #   Outlet_Size <fctr>, Outlet_Type <fctr>, mean_IOS <dbl>,
## #   median_IOS <dbl>, sum_IOS <dbl>, count_IOS <int>
#library(knitr)
#kable(cbind(bigMart2.group_by_Outlet, bigMart2.group_by_Outlet), "html") %>%
#  kable_styling() %>%
#  scroll_box(width = "500px", height = "200px")

# Total sales against outlet age, first as points ...
ggplot(bigMart2.group_by_Outlet, aes(x = Outlet_Age, y = sum_IOS)) +
  geom_point()

# ... then as a connected line.
ggplot(bigMart2.group_by_Outlet, aes(x = Outlet_Age, y = sum_IOS)) +
  geom_line()

# Mean sales against median sales per outlet, as a thick orange line.
ggplot(bigMart2.group_by_Outlet, aes(x = median_IOS, y = mean_IOS)) +
  geom_line(color = "orange", size = 1.1)

## it doesn't work in our data
# Scatter of Item_Visibility vs Item_outlet_Sales_Count (horizontal jitter
# only) overlaid with three per-x summary lines: the sum, and the 10% and 90%
# quantiles (dashed blue).
# `probs` must go through fun.args: stat_summary() forwards only fun.args to
# the summary function, so a bare `probs = ` argument is ignored with a
# warning and quantile() runs with its default probabilities.
# NOTE(review): `fun.y` matches the ggplot2 version this script was run with;
# ggplot2 >= 3.3.0 names the argument `fun` and deprecates `fun.y`.
ggplot(bigMart2, aes(x = Item_Visibility, y = Item_outlet_Sales_Count)) +
  geom_point(alpha = 1 / 8, position = position_jitter(height = 0),
             color = "orange") +
  xlim(0.0, 0.2) +
  stat_summary(geom = "line", fun.y = sum) +
  stat_summary(geom = "line", fun.y = quantile,
               fun.args = list(probs = .1),
               linetype = 2, color = "blue") +
  stat_summary(geom = "line", fun.y = quantile,
               fun.args = list(probs = .9),
               linetype = 2, color = "blue")
## Warning: Removed 134 rows containing non-finite values (stat_summary).

## Warning: Removed 134 rows containing non-finite values (stat_summary).

## Warning: Removed 134 rows containing non-finite values (stat_summary).
## Warning: Removed 387 rows containing missing values (geom_point).

# Pearson correlations between Item_Visibility and the sales/weight columns.
cor(bigMart2$Item_Visibility, bigMart2$Item_Outlet_Sales)
## [1] -0.1286246
cor(bigMart2$Item_Visibility, bigMart2$Item_outlet_Sales_Count)
## [1] -0.1610241
# Item_Weight contains missing values, and cor() returns NA as soon as either
# vector has one; use = "complete.obs" keeps only the rows where both values
# are present.
cor(bigMart2$Item_Visibility, bigMart2$Item_Weight, use = "complete.obs")
# Same visibility / sales-count correlation, with a significance test and
# confidence interval.
cor.test(bigMart2$Item_Visibility, bigMart2$Item_outlet_Sales_Count,
         method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  bigMart2$Item_Visibility and bigMart2$Item_outlet_Sales_Count
## t = -15.061, df = 8521, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1816338 -0.1402730
## sample estimates:
##        cor 
## -0.1610241
library("corrplot")
## corrplot 0.84 loaded
# Correlation matrix of the numeric item columns, drawn twice with corrplot
# (as numbers, then as circles).
# Item_Weight has missing values; with cor()'s default handling its whole
# row and column would be NA. "pairwise.complete.obs" computes each pair from
# the rows where both columns are present, so pairs without missing values
# still use every row.
M <- bigMart2 %>%
  select(Item_Weight,
         Item_Visibility,
         Item_MRP,
         Item_Outlet_Sales,
         Item_Identifier_Num,
         Item_outlet_Sales_Count) %>%
  cor(use = "pairwise.complete.obs")

corrplot(M, method = "number")

corrplot(M, method = "circle")

# Scatter of Item_MRP vs Item_Outlet_Sales (horizontal jitter only) overlaid
# with three per-x summary lines: the sum, and the 10% and 90% quantiles
# (dashed blue).
# `probs` must go through fun.args: stat_summary() forwards only fun.args to
# the summary function, so a bare `probs = ` argument is ignored with a
# warning and quantile() runs with its default probabilities.
# NOTE(review): `fun.y` matches the ggplot2 version this script was run with;
# ggplot2 >= 3.3.0 names the argument `fun` and deprecates `fun.y`.
ggplot(bigMart2, aes(x = Item_MRP, y = Item_Outlet_Sales)) +
  geom_point(alpha = 1 / 8, position = position_jitter(height = 0),
             color = "orange") +
  stat_summary(geom = "line", fun.y = sum) +
  stat_summary(geom = "line", fun.y = quantile,
               fun.args = list(probs = .1),
               linetype = 2, color = "blue") +
  stat_summary(geom = "line", fun.y = quantile,
               fun.args = list(probs = .9),
               linetype = 2, color = "blue")