Get Used to Big Mart Sales Data
- In this work we try to understand data better.
- sources:
setwd("C:/Users/yetkinEser/Desktop/R/datas")
bigMart<-read.csv('bigMartTrain.csv')
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(ggplot2)
bigMart2<-
bigMart%>%
mutate(Item_Identifier_Str=substr(Item_Identifier,1,3),
Item_Identifier_Num=as.numeric(substr(Item_Identifier,4,6)),
Outlet_Age=2013-Outlet_Establishment_Year,
Item_Fat_Fixed=str_replace(
str_replace(
str_replace(Item_Fat_Content,"LF","Low Fat")
,"reg","Regular"),"low fat","Low Fat"),
Item_outlet_Sales_Count = Item_Outlet_Sales / Item_MRP,
PK=row_number())
names(bigMart2)
## [1] "Item_Identifier" "Item_Weight"
## [3] "Item_Fat_Content" "Item_Visibility"
## [5] "Item_Type" "Item_MRP"
## [7] "Outlet_Identifier" "Outlet_Establishment_Year"
## [9] "Outlet_Size" "Outlet_Location_Type"
## [11] "Outlet_Type" "Item_Outlet_Sales"
## [13] "Item_Identifier_Str" "Item_Identifier_Num"
## [15] "Outlet_Age" "Item_Fat_Fixed"
## [17] "Item_outlet_Sales_Count" "PK"
qplot(x=Item_Fat_Content,data=bigMart2)

qplot(x=Item_Fat_Fixed,data=bigMart2)

# Looking at Item Type
qplot(x=Item_Type,data=bigMart2)+
geom_bar(color="green")+
theme(axis.text = element_text(angle = 0,color="purple"))+
coord_flip()

# Looking at Item Type with facet wrap according to Outlet Identifier
qplot(x=Item_Type,data=bigMart2)+
geom_bar(color="green")+
theme(axis.text = element_text(angle = 0,color="purple"))+
coord_flip()+
facet_wrap(~Outlet_Identifier,nrow=2)

# Looking at Item Type with facet wrap according to Item_Identifier_Num
qplot(x=Item_Type,data=bigMart2)+
geom_bar(color="green")+
theme(axis.text = element_text(angle = 0,color="purple"))+
coord_flip()+
facet_wrap(~Item_Identifier_Num,nrow=2)

# Looking at Item Type with facet wrap according to Item_Identifier_Str
qplot(x=Item_Type,data=bigMart2)+
geom_bar(color="green")+
theme(axis.text = element_text(angle = 90,color="purple"))+
coord_flip()+
facet_wrap(~Item_Identifier_Str,nrow=2)

# Looking at Item_Outlet_Sales
qplot(x=Item_Outlet_Sales, data = bigMart2,binwidth = 250)+
geom_bar(color="green")+
theme(axis.text = element_text(angle = 90,color="purple"))+
scale_x_continuous(limits=c(0, 10000),breaks=seq(0, 10000, 500))
## Warning: Removed 8 rows containing non-finite values (stat_bin).
## Warning: Removed 8 rows containing non-finite values (stat_count).

# Looking at Item_Outlet_Sales according to Outlet_Identifier
qplot(x=Item_Outlet_Sales,data = bigMart2,binwidth = 250)+
geom_bar(color="green")+
theme(axis.text.x = element_text(angle = 90,color="purple"),
axis.text.y = element_text(angle = 30,color="tomato"))+
scale_x_continuous(limits=c(0,5000),breaks=seq(0,5000,500))+
scale_y_continuous(limits=c(0,250),breaks=seq(0,200,100))+
facet_wrap(~Outlet_Identifier,nrow=5)
## Warning: Removed 629 rows containing non-finite values (stat_bin).
## Warning: Removed 629 rows containing non-finite values (stat_count).
## Warning: Removed 2 rows containing missing values (geom_bar).

# Looking at Item_Outlet_Sales according to Outlet_Size
qplot(x=Item_Outlet_Sales,data = bigMart2,binwidth = 250)+
geom_bar(color="green")+
theme(axis.text.x = element_text(angle = 90,color="purple"),
axis.text.y = element_text(angle = 30,color="tomato"))+
scale_x_continuous(limits=c(0,5000),breaks=seq(0,5000,500))+
scale_y_continuous(limits=c(0,250),breaks=seq(0,200,100))+
facet_wrap(~Outlet_Size,nrow=4)
## Warning: Removed 629 rows containing non-finite values (stat_bin).
## Warning: Removed 629 rows containing non-finite values (stat_count).
## Warning: Removed 2 rows containing missing values (geom_bar).

# Looking at Item_Outlet_Sales according to Outlet_Size with is.na function
qplot(x=Item_Outlet_Sales,data = subset(bigMart2, !is.na(Outlet_Size)),binwidth = 250)+
geom_bar(color="green")+
theme(axis.text.x = element_text(angle = 90,color="purple"),
axis.text.y = element_text(angle = 30,color="tomato"))+
scale_x_continuous(limits=c(0,5000),breaks=seq(0,5000,500))+
scale_y_continuous(limits=c(0,250),breaks=seq(0,200,100))+
facet_wrap(~Outlet_Size,nrow=4)
## Warning: Removed 629 rows containing non-finite values (stat_bin).
## Warning: Removed 629 rows containing non-finite values (stat_count).
## Warning: Removed 2 rows containing missing values (geom_bar).

table(bigMart2$Outlet_Size)
##
## High Medium Small
## 2410 932 2793 2388
by(bigMart2$Item_Outlet_Sales,bigMart2$Outlet_Size,summary)
## bigMart2$Outlet_Size:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 33.29 554.78 1443.45 1822.63 2681.51 9664.75
## --------------------------------------------------------
## bigMart2$Outlet_Size: High
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 73.24 1072.60 2050.66 2298.99 3166.38 10256.65
## --------------------------------------------------------
## bigMart2$Outlet_Size: Medium
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 69.24 1270.35 2251.07 2681.60 3691.20 13086.97
## --------------------------------------------------------
## bigMart2$Outlet_Size: Small
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 33.96 601.05 1544.66 1912.15 2824.32 9779.94
# Looking at Item_Outlet_Sales according to Outlet_Size with is.na function
# And giving name to y label
qplot(x=Item_Outlet_Sales,data = subset(bigMart2, !is.na(Outlet_Size)),binwidth = 250,
ylab="Count Of Sales")+
geom_bar(color="green")+
theme(axis.text.x = element_text(angle = 90,color="purple"),
axis.text.y = element_text(angle = 30,color="tomato"))+
scale_x_continuous(limits=c(0,5000),breaks=seq(0,5000,500))+
scale_y_continuous(limits=c(0,250),breaks=seq(0,200,100))+
facet_wrap(~Outlet_Size,nrow=4)
## Warning: Removed 629 rows containing non-finite values (stat_bin).
## Warning: Removed 629 rows containing non-finite values (stat_count).
## Warning: Removed 2 rows containing missing values (geom_bar).

qplot(x=Item_Type,data=bigMart2,
ylab="Count Of Sales",color=I("black"),fill=I("#5760AB"))+
#geom_bar(color="green")+
theme(axis.text = element_text(angle = 0,color="purple"))+
coord_flip()

# Look at summary with different wat
summary(bigMart$Item_Outlet_Sales)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 33.29 834.25 1794.33 2181.29 3101.30 13086.97
summary(log10(bigMart$Item_Outlet_Sales+1))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.535 2.922 3.254 3.169 3.492 4.117
summary(sqrt(bigMart$Item_Outlet_Sales))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.77 28.88 42.36 42.94 55.69 114.40
# histogram with sqrt and log10
qplot(x=Item_Outlet_Sales,data = subset(bigMart2, !is.na(Outlet_Size)),binwidth = 50,
ylab="Count Of Sales")+
geom_bar(color="green")+
theme(axis.text.x = element_text(angle = 90,color="purple"),
axis.text.y = element_text(angle = 30,color="tomato"))+
scale_x_continuous(limits=c(0,5000),breaks=seq(0,5000,250))+
scale_y_continuous(limits=c(0,250),breaks=seq(0,200,100))
## Warning: Removed 629 rows containing non-finite values (stat_bin).
## Warning: Removed 629 rows containing non-finite values (stat_count).

# It is better now like normal distibution with sqrt and log10
qplot(x=sqrt(Item_Outlet_Sales),data = subset(bigMart2, !is.na(Outlet_Size)),binwidth = 1,
ylab="Count Of Sales",
xlab="SQRT of Outlet Sales")+
geom_bar(color="green")+
theme(axis.text.x = element_text(angle = 90,color="purple"),
axis.text.y = element_text(angle = 30,color="tomato"))+
scale_x_continuous(limits=c(0,100),breaks=seq(0,70,15))+
scale_y_continuous(limits=c(0,200),breaks=seq(0,200,100))
## Warning: Removed 8 rows containing non-finite values (stat_bin).
## Warning: Removed 8 rows containing non-finite values (stat_count).

qplot(x=log10(Item_Outlet_Sales+1),data = subset(bigMart2, !is.na(Outlet_Size)),binwidth = 0.01,
ylab="Count Of Sales",
xlab="LOG10 of Outlet Sales")+
geom_bar(color="green")+
theme(axis.text.x = element_text(angle = 90,color="purple"),
axis.text.y = element_text(angle = 30,color="tomato"))+
scale_x_continuous(limits=c(1.5,4.5),breaks=seq(1.5,4.5,0.5))+
scale_y_continuous(limits=c(0,200),breaks=seq(0,200,100))

# it is bettew with scale_x_log10()
qplot(x=Item_Outlet_Sales,data = subset(bigMart2, !is.na(Outlet_Size)),binwidth = 0.01,
ylab="Count Of Sales",
xlab="LOG10 of Outlet Sales")+
geom_bar(color="green")+
theme(axis.text.x = element_text(angle = 90,color="purple"),
axis.text.y = element_text(angle = 30,color="tomato"))+
scale_x_continuous(limits=c(1.5,4.5),breaks=seq(1.5,4.5,0.5))+
scale_y_continuous(limits=c(0,200),breaks=seq(0,200,100))+
scale_x_log10()
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.

# it is bettew with scale_x_log10()
qplot(x=Item_Outlet_Sales,data = subset(bigMart2, !is.na(Outlet_Size)),binwidth = 0.01,
ylab="Count Of Sales",
xlab="LOG10 of Outlet Sales",
geom="freqpoly",
color=Outlet_Size)+
geom_bar(color="green")+
theme(axis.text.x = element_text(angle = 90,color="purple"),
axis.text.y = element_text(angle = 30,color="tomato"))+
scale_x_continuous(limits=c(1.5,4.5),breaks=seq(1.5,4.5,0.5))+
scale_y_continuous(limits=c(0,40),breaks=seq(0,40,10))+
scale_x_log10()
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.

# First Box-Plot
qplot(x=Outlet_Size, y=Item_Outlet_Sales,
data = subset(bigMart2, !(Outlet_Size=="")),
geom = "boxplot")+
theme(axis.text.x = element_text(angle = 90,color="purple"),
axis.text.y = element_text(angle = 30,color="tomato"))

#it is better
qplot(x=Outlet_Size, y=Item_Outlet_Sales,
data = subset(bigMart2, !(Outlet_Size=="")),
geom = "boxplot")+
theme(axis.text.x = element_text(angle = 90,color="purple"),
axis.text.y = element_text(angle = 30,color="tomato"))+
coord_cartesian(ylim=c(0,5000))

library(knitr)
by(bigMart2$Item_Outlet_Sales,bigMart2$Outlet_Size,summary)
## bigMart2$Outlet_Size:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 33.29 554.78 1443.45 1822.63 2681.51 9664.75
## --------------------------------------------------------
## bigMart2$Outlet_Size: High
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 73.24 1072.60 2050.66 2298.99 3166.38 10256.65
## --------------------------------------------------------
## bigMart2$Outlet_Size: Medium
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 69.24 1270.35 2251.07 2681.60 3691.20 13086.97
## --------------------------------------------------------
## bigMart2$Outlet_Size: Small
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 33.96 601.05 1544.66 1912.15 2824.32 9779.94
## Udacity - Explore Two Variables
qplot(x = Item_Weight, y = Item_Visibility,data = bigMart2)
## Warning: Removed 1463 rows containing missing values (geom_point).

qplot(x = Item_Visibility, y = Item_Outlet_Sales,data = bigMart2)

#More Formal GGPLOT
ggplot(aes(x = Item_Visibility, y = Item_Outlet_Sales), data = bigMart2)+
geom_point()

ggplot(aes(x = Item_Visibility, y = Item_Outlet_Sales), data = bigMart2)+
geom_point()+
xlim(0.0, 0.2)+
ylim(0,10000)
## Warning: Removed 142 rows containing missing values (geom_point).

ggplot(aes(x = Item_Visibility, y = Item_outlet_Sales_Count), data = bigMart2)+
geom_point()+
xlim(0.0, 0.2)+
ylim(0,50)
## Warning: Removed 144 rows containing missing values (geom_point).

ggplot(aes(x = Item_Visibility, y = Item_outlet_Sales_Count), data = bigMart2)+
geom_jitter(alpha=1/8)+
xlim(0.0, 0.2)+
ylim(0,50)
## Warning: Removed 412 rows containing missing values (geom_point).

ggplot(aes(x = Item_Visibility, y = Item_outlet_Sales_Count), data = bigMart2)+
geom_jitter(alpha=1/8)+
xlim(0.0, 0.2)+
coord_trans(y="sqrt")
## Warning: Removed 387 rows containing missing values (geom_point).

ggplot(aes(x = Item_Visibility, y = Item_outlet_Sales_Count), data = bigMart2)+
geom_point(alpha=1/8, position = position_jitter(h = 0))+
xlim(0.0, 0.2)+
coord_trans(y="sqrt")
## Warning: Removed 400 rows containing missing values (geom_point).

bigMart2.group_by_Outlet<-
bigMart2%>%
group_by(Outlet_Identifier,
Outlet_Establishment_Year,
Outlet_Age,
Outlet_Location_Type,
Outlet_Size,
Outlet_Type)%>%
summarise(mean_IOS=mean(Item_Outlet_Sales),
median_IOS=median(Item_Outlet_Sales),
sum_IOS=sum(Item_Outlet_Sales),
count_IOS=n())%>%
arrange(desc(median_IOS))
head(bigMart2.group_by_Outlet,15)
## # A tibble: 10 x 10
## # Groups: Outlet_Identifier, Outlet_Establishment_Year, Outlet_Age,
## # Outlet_Location_Type, Outlet_Size [10]
## Outlet_Identifier Outlet_Establishment_Year Outlet_Age
## <fctr> <int> <dbl>
## 1 OUT027 1985 28
## 2 OUT035 2004 9
## 3 OUT013 1987 26
## 4 OUT017 2007 6
## 5 OUT049 1999 14
## 6 OUT046 1997 16
## 7 OUT045 2002 11
## 8 OUT018 2009 4
## 9 OUT019 1985 28
## 10 OUT010 1998 15
## # ... with 7 more variables: Outlet_Location_Type <fctr>,
## # Outlet_Size <fctr>, Outlet_Type <fctr>, mean_IOS <dbl>,
## # median_IOS <dbl>, sum_IOS <dbl>, count_IOS <int>
#library(knitr)
#kable(cbind(bigMart2.group_by_Outlet, bigMart2.group_by_Outlet), "html") %>%
# kable_styling() %>%
# scroll_box(width = "500px", height = "200px")
ggplot(aes(x=Outlet_Age ,y=sum_IOS),data=bigMart2.group_by_Outlet)+
geom_point()

ggplot(aes(x=Outlet_Age ,y=sum_IOS),data=bigMart2.group_by_Outlet)+
geom_line()

ggplot(aes(x=median_IOS ,y=mean_IOS),data=bigMart2.group_by_Outlet)+
geom_line(color="orange",size=1.1)

##it doesn't work in our data
ggplot(aes(x = Item_Visibility, y = Item_outlet_Sales_Count), data = bigMart2)+
geom_point(alpha=1/8, position = position_jitter(h = 0), color = "orange")+
xlim(0.0, 0.2)+
geom_line(stat="summary",fun.y=sum)+
geom_line(stat="summary",fun.y=quantile, probs=.1,
linetype=2,color="blue")+
geom_line(stat="summary",fun.y=quantile, probs=.9,
linetype=2,color="blue")
## Warning: Ignoring unknown parameters: probs
## Warning: Ignoring unknown parameters: probs
## Warning: Removed 134 rows containing non-finite values (stat_summary).
## Warning: Removed 134 rows containing non-finite values (stat_summary).
## Warning: Removed 134 rows containing non-finite values (stat_summary).
## Warning: Removed 387 rows containing missing values (geom_point).

cor(bigMart2$Item_Visibility,bigMart2$Item_Outlet_Sales)
## [1] -0.1286246
cor(bigMart2$Item_Visibility,bigMart2$Item_outlet_Sales_Count)
## [1] -0.1610241
cor(bigMart2$Item_Visibility,bigMart2$Item_Weight)
## [1] NA
cor(bigMart2$Item_Visibility,bigMart2$Item_outlet_Sales_Count)
## [1] -0.1610241
cor.test(bigMart2$Item_Visibility,bigMart2$Item_outlet_Sales_Count,method = "pearson")
##
## Pearson's product-moment correlation
##
## data: bigMart2$Item_Visibility and bigMart2$Item_outlet_Sales_Count
## t = -15.061, df = 8521, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1816338 -0.1402730
## sample estimates:
## cor
## -0.1610241
library("corrplot")
## corrplot 0.84 loaded
M<-cor(
bigMart2%>%
select(Item_Weight
,Item_Visibility
,Item_MRP
,Item_Outlet_Sales
,Item_Identifier_Num
,Item_outlet_Sales_Count))
corrplot(M,method="number")

corrplot(M,method="circle")

ggplot(aes(x = Item_MRP, y = Item_Outlet_Sales), data = bigMart2)+
geom_point(alpha=1/8, position = position_jitter(h = 0), color = "orange")+
geom_line(stat="summary",fun.y=sum)+
geom_line(stat="summary",fun.y=quantile, probs=.1,
linetype=2,color="blue")+
geom_line(stat="summary",fun.y=quantile, probs=.9,
linetype=2,color="blue")
## Warning: Ignoring unknown parameters: probs
## Warning: Ignoring unknown parameters: probs
