Get Used to Big Mart Sales Data
- In this work we try to understand data better.
- sources:
setwd("C:/Users/yetkinEser/Desktop/R/datas")
bigMart<-read.csv('bigMartTrain.csv')
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(ggplot2)
bigMart2<-
bigMart%>%
mutate(Item_Identifier_Str=substr(Item_Identifier,1,3),
Item_Identifier_Num=as.numeric(substr(Item_Identifier,4,6)),
Outlet_Age=2013-Outlet_Establishment_Year,
Item_Fat_Fixed=str_replace(
str_replace(
str_replace(Item_Fat_Content,"LF","Low Fat")
,"reg","Regular"),"low fat","Low Fat"),
Item_outlet_Sales_Count = Item_Outlet_Sales / Item_MRP,
PK=row_number())
names(bigMart2)
## [1] "Item_Identifier" "Item_Weight"
## [3] "Item_Fat_Content" "Item_Visibility"
## [5] "Item_Type" "Item_MRP"
## [7] "Outlet_Identifier" "Outlet_Establishment_Year"
## [9] "Outlet_Size" "Outlet_Location_Type"
## [11] "Outlet_Type" "Item_Outlet_Sales"
## [13] "Item_Identifier_Str" "Item_Identifier_Num"
## [15] "Outlet_Age" "Item_Fat_Fixed"
## [17] "Item_outlet_Sales_Count" "PK"
qplot(x=Item_Fat_Content,data=bigMart2)
data:image/s3,"s3://crabby-images/982dc/982dc579279069aa359790ca883354b6ef61d735" alt=""
qplot(x=Item_Fat_Fixed,data=bigMart2)
data:image/s3,"s3://crabby-images/714fe/714fef3881c37b12d4f065cb4d5afce37bfa3576" alt=""
# Looking at Item Type
qplot(x=Item_Type,data=bigMart2)+
geom_bar(color="green")+
theme(axis.text = element_text(angle = 0,color="purple"))+
coord_flip()
data:image/s3,"s3://crabby-images/6a8ac/6a8ac703ac00d83759cc361c18f4e85b7d7e6b6e" alt=""
# Looking at Item Type with facet wrap according to Outlet Identifier
qplot(x=Item_Type,data=bigMart2)+
geom_bar(color="green")+
theme(axis.text = element_text(angle = 0,color="purple"))+
coord_flip()+
facet_wrap(~Outlet_Identifier,nrow=2)
data:image/s3,"s3://crabby-images/ee8a2/ee8a2e6e88de0e7e24542b27a1380e2a89f062d3" alt=""
# Looking at Item Type with facet wrap according to Item_Identifier_Num
qplot(x=Item_Type,data=bigMart2)+
geom_bar(color="green")+
theme(axis.text = element_text(angle = 0,color="purple"))+
coord_flip()+
facet_wrap(~Item_Identifier_Num,nrow=2)
data:image/s3,"s3://crabby-images/6fb65/6fb65bf81228c1b6af99f8da0a2dac5fed52a14e" alt=""
# Looking at Item Type with facet wrap according to Item_Identifier_Str
qplot(x=Item_Type,data=bigMart2)+
geom_bar(color="green")+
theme(axis.text = element_text(angle = 90,color="purple"))+
coord_flip()+
facet_wrap(~Item_Identifier_Str,nrow=2)
data:image/s3,"s3://crabby-images/0eb94/0eb942b7a0dc1dbae0727711c8558bdf2935b8d1" alt=""
# Looking at Item_Outlet_Sales
qplot(x=Item_Outlet_Sales, data = bigMart2,binwidth = 250)+
geom_bar(color="green")+
theme(axis.text = element_text(angle = 90,color="purple"))+
scale_x_continuous(limits=c(0, 10000),breaks=seq(0, 10000, 500))
## Warning: Removed 8 rows containing non-finite values (stat_bin).
## Warning: Removed 8 rows containing non-finite values (stat_count).
data:image/s3,"s3://crabby-images/72231/72231851e4ede869981fea363cd8011574995d56" alt=""
# Looking at Item_Outlet_Sales according to Outlet_Identifier
qplot(x=Item_Outlet_Sales,data = bigMart2,binwidth = 250)+
geom_bar(color="green")+
theme(axis.text.x = element_text(angle = 90,color="purple"),
axis.text.y = element_text(angle = 30,color="tomato"))+
scale_x_continuous(limits=c(0,5000),breaks=seq(0,5000,500))+
scale_y_continuous(limits=c(0,250),breaks=seq(0,200,100))+
facet_wrap(~Outlet_Identifier,nrow=5)
## Warning: Removed 629 rows containing non-finite values (stat_bin).
## Warning: Removed 629 rows containing non-finite values (stat_count).
## Warning: Removed 2 rows containing missing values (geom_bar).
data:image/s3,"s3://crabby-images/f8bb9/f8bb9d35f723009931888ec6a09d48a5e4dcc666" alt=""
# Looking at Item_Outlet_Sales according to Outlet_Size
qplot(x=Item_Outlet_Sales,data = bigMart2,binwidth = 250)+
geom_bar(color="green")+
theme(axis.text.x = element_text(angle = 90,color="purple"),
axis.text.y = element_text(angle = 30,color="tomato"))+
scale_x_continuous(limits=c(0,5000),breaks=seq(0,5000,500))+
scale_y_continuous(limits=c(0,250),breaks=seq(0,200,100))+
facet_wrap(~Outlet_Size,nrow=4)
## Warning: Removed 629 rows containing non-finite values (stat_bin).
## Warning: Removed 629 rows containing non-finite values (stat_count).
## Warning: Removed 2 rows containing missing values (geom_bar).
data:image/s3,"s3://crabby-images/77578/77578a0f7066a5ebd4df855adc585cd0ba957403" alt=""
# Looking at Item_Outlet_Sales according to Outlet_Size with is.na function
qplot(x=Item_Outlet_Sales,data = subset(bigMart2, !is.na(Outlet_Size)),binwidth = 250)+
geom_bar(color="green")+
theme(axis.text.x = element_text(angle = 90,color="purple"),
axis.text.y = element_text(angle = 30,color="tomato"))+
scale_x_continuous(limits=c(0,5000),breaks=seq(0,5000,500))+
scale_y_continuous(limits=c(0,250),breaks=seq(0,200,100))+
facet_wrap(~Outlet_Size,nrow=4)
## Warning: Removed 629 rows containing non-finite values (stat_bin).
## Warning: Removed 629 rows containing non-finite values (stat_count).
## Warning: Removed 2 rows containing missing values (geom_bar).
data:image/s3,"s3://crabby-images/77578/77578a0f7066a5ebd4df855adc585cd0ba957403" alt=""
table(bigMart2$Outlet_Size)
##
## High Medium Small
## 2410 932 2793 2388
by(bigMart2$Item_Outlet_Sales,bigMart2$Outlet_Size,summary)
## bigMart2$Outlet_Size:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 33.29 554.78 1443.45 1822.63 2681.51 9664.75
## --------------------------------------------------------
## bigMart2$Outlet_Size: High
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 73.24 1072.60 2050.66 2298.99 3166.38 10256.65
## --------------------------------------------------------
## bigMart2$Outlet_Size: Medium
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 69.24 1270.35 2251.07 2681.60 3691.20 13086.97
## --------------------------------------------------------
## bigMart2$Outlet_Size: Small
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 33.96 601.05 1544.66 1912.15 2824.32 9779.94
# Looking at Item_Outlet_Sales according to Outlet_Size with is.na function
# And giving name to y label
qplot(x=Item_Outlet_Sales,data = subset(bigMart2, !is.na(Outlet_Size)),binwidth = 250,
ylab="Count Of Sales")+
geom_bar(color="green")+
theme(axis.text.x = element_text(angle = 90,color="purple"),
axis.text.y = element_text(angle = 30,color="tomato"))+
scale_x_continuous(limits=c(0,5000),breaks=seq(0,5000,500))+
scale_y_continuous(limits=c(0,250),breaks=seq(0,200,100))+
facet_wrap(~Outlet_Size,nrow=4)
## Warning: Removed 629 rows containing non-finite values (stat_bin).
## Warning: Removed 629 rows containing non-finite values (stat_count).
## Warning: Removed 2 rows containing missing values (geom_bar).
data:image/s3,"s3://crabby-images/5cd49/5cd49b31f30f4bfbda5551aea7fae004d9fb9ae3" alt=""
qplot(x=Item_Type,data=bigMart2,
ylab="Count Of Sales",color=I("black"),fill=I("#5760AB"))+
#geom_bar(color="green")+
theme(axis.text = element_text(angle = 0,color="purple"))+
coord_flip()
data:image/s3,"s3://crabby-images/9c55e/9c55e5ba4c8720b90e123b0762c0b8744619d70c" alt=""
# Look at summary with different wat
summary(bigMart$Item_Outlet_Sales)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 33.29 834.25 1794.33 2181.29 3101.30 13086.97
summary(log10(bigMart$Item_Outlet_Sales+1))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.535 2.922 3.254 3.169 3.492 4.117
summary(sqrt(bigMart$Item_Outlet_Sales))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.77 28.88 42.36 42.94 55.69 114.40
# histogram with sqrt and log10
qplot(x=Item_Outlet_Sales,data = subset(bigMart2, !is.na(Outlet_Size)),binwidth = 50,
ylab="Count Of Sales")+
geom_bar(color="green")+
theme(axis.text.x = element_text(angle = 90,color="purple"),
axis.text.y = element_text(angle = 30,color="tomato"))+
scale_x_continuous(limits=c(0,5000),breaks=seq(0,5000,250))+
scale_y_continuous(limits=c(0,250),breaks=seq(0,200,100))
## Warning: Removed 629 rows containing non-finite values (stat_bin).
## Warning: Removed 629 rows containing non-finite values (stat_count).
data:image/s3,"s3://crabby-images/5f925/5f9250db9df759ada643b9de729fb8ff4a0229a7" alt=""
# It is better now like normal distibution with sqrt and log10
qplot(x=sqrt(Item_Outlet_Sales),data = subset(bigMart2, !is.na(Outlet_Size)),binwidth = 1,
ylab="Count Of Sales",
xlab="SQRT of Outlet Sales")+
geom_bar(color="green")+
theme(axis.text.x = element_text(angle = 90,color="purple"),
axis.text.y = element_text(angle = 30,color="tomato"))+
scale_x_continuous(limits=c(0,100),breaks=seq(0,70,15))+
scale_y_continuous(limits=c(0,200),breaks=seq(0,200,100))
## Warning: Removed 8 rows containing non-finite values (stat_bin).
## Warning: Removed 8 rows containing non-finite values (stat_count).
data:image/s3,"s3://crabby-images/556f2/556f22cfd6721acdcd5ab75165660d7ba55943a4" alt=""
qplot(x=log10(Item_Outlet_Sales+1),data = subset(bigMart2, !is.na(Outlet_Size)),binwidth = 0.01,
ylab="Count Of Sales",
xlab="LOG10 of Outlet Sales")+
geom_bar(color="green")+
theme(axis.text.x = element_text(angle = 90,color="purple"),
axis.text.y = element_text(angle = 30,color="tomato"))+
scale_x_continuous(limits=c(1.5,4.5),breaks=seq(1.5,4.5,0.5))+
scale_y_continuous(limits=c(0,200),breaks=seq(0,200,100))
data:image/s3,"s3://crabby-images/6aa72/6aa72e3f29c0ca438162bec2f7eae0c823ad0827" alt=""
# it is bettew with scale_x_log10()
qplot(x=Item_Outlet_Sales,data = subset(bigMart2, !is.na(Outlet_Size)),binwidth = 0.01,
ylab="Count Of Sales",
xlab="LOG10 of Outlet Sales")+
geom_bar(color="green")+
theme(axis.text.x = element_text(angle = 90,color="purple"),
axis.text.y = element_text(angle = 30,color="tomato"))+
scale_x_continuous(limits=c(1.5,4.5),breaks=seq(1.5,4.5,0.5))+
scale_y_continuous(limits=c(0,200),breaks=seq(0,200,100))+
scale_x_log10()
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
data:image/s3,"s3://crabby-images/82c50/82c506ffbef734e9425a473656e5d7955e9b87ec" alt=""
# it is bettew with scale_x_log10()
qplot(x=Item_Outlet_Sales,data = subset(bigMart2, !is.na(Outlet_Size)),binwidth = 0.01,
ylab="Count Of Sales",
xlab="LOG10 of Outlet Sales",
geom="freqpoly",
color=Outlet_Size)+
geom_bar(color="green")+
theme(axis.text.x = element_text(angle = 90,color="purple"),
axis.text.y = element_text(angle = 30,color="tomato"))+
scale_x_continuous(limits=c(1.5,4.5),breaks=seq(1.5,4.5,0.5))+
scale_y_continuous(limits=c(0,40),breaks=seq(0,40,10))+
scale_x_log10()
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
data:image/s3,"s3://crabby-images/428fd/428fdcab9532d8155ee2eeed8ef8918899a8a6e5" alt=""
# First Box-Plot
qplot(x=Outlet_Size, y=Item_Outlet_Sales,
data = subset(bigMart2, !(Outlet_Size=="")),
geom = "boxplot")+
theme(axis.text.x = element_text(angle = 90,color="purple"),
axis.text.y = element_text(angle = 30,color="tomato"))
data:image/s3,"s3://crabby-images/e3aba/e3aba5a6a2db311e5ed14a8f9c4076425d762827" alt=""
#it is better
qplot(x=Outlet_Size, y=Item_Outlet_Sales,
data = subset(bigMart2, !(Outlet_Size=="")),
geom = "boxplot")+
theme(axis.text.x = element_text(angle = 90,color="purple"),
axis.text.y = element_text(angle = 30,color="tomato"))+
coord_cartesian(ylim=c(0,5000))
data:image/s3,"s3://crabby-images/ff7ff/ff7ff5333133e482f1e11767709ee1cba8f528cc" alt=""
library(knitr)
by(bigMart2$Item_Outlet_Sales,bigMart2$Outlet_Size,summary)
## bigMart2$Outlet_Size:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 33.29 554.78 1443.45 1822.63 2681.51 9664.75
## --------------------------------------------------------
## bigMart2$Outlet_Size: High
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 73.24 1072.60 2050.66 2298.99 3166.38 10256.65
## --------------------------------------------------------
## bigMart2$Outlet_Size: Medium
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 69.24 1270.35 2251.07 2681.60 3691.20 13086.97
## --------------------------------------------------------
## bigMart2$Outlet_Size: Small
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 33.96 601.05 1544.66 1912.15 2824.32 9779.94
## Udacity - Explore Two Variables
qplot(x = Item_Weight, y = Item_Visibility,data = bigMart2)
## Warning: Removed 1463 rows containing missing values (geom_point).
data:image/s3,"s3://crabby-images/29c3a/29c3a413bc9f5d720ed7403f24f9c3b502f84555" alt=""
qplot(x = Item_Visibility, y = Item_Outlet_Sales,data = bigMart2)
data:image/s3,"s3://crabby-images/5ccfe/5ccfe5c530aa52915ec34b7123a60f0cfbaab888" alt=""
#More Formal GGPLOT
ggplot(aes(x = Item_Visibility, y = Item_Outlet_Sales), data = bigMart2)+
geom_point()
data:image/s3,"s3://crabby-images/5ccfe/5ccfe5c530aa52915ec34b7123a60f0cfbaab888" alt=""
ggplot(aes(x = Item_Visibility, y = Item_Outlet_Sales), data = bigMart2)+
geom_point()+
xlim(0.0, 0.2)+
ylim(0,10000)
## Warning: Removed 142 rows containing missing values (geom_point).
data:image/s3,"s3://crabby-images/23575/235752345df71aa6a62291b3686fa1aa10f1dc42" alt=""
ggplot(aes(x = Item_Visibility, y = Item_outlet_Sales_Count), data = bigMart2)+
geom_point()+
xlim(0.0, 0.2)+
ylim(0,50)
## Warning: Removed 144 rows containing missing values (geom_point).
data:image/s3,"s3://crabby-images/1d712/1d712a7be3df2eb86b5998d81f44f2c3f683e3e4" alt=""
ggplot(aes(x = Item_Visibility, y = Item_outlet_Sales_Count), data = bigMart2)+
geom_jitter(alpha=1/8)+
xlim(0.0, 0.2)+
ylim(0,50)
## Warning: Removed 412 rows containing missing values (geom_point).
data:image/s3,"s3://crabby-images/affc7/affc7a70314dffd6ef5d75696d6d90543e4ec187" alt=""
ggplot(aes(x = Item_Visibility, y = Item_outlet_Sales_Count), data = bigMart2)+
geom_jitter(alpha=1/8)+
xlim(0.0, 0.2)+
coord_trans(y="sqrt")
## Warning: Removed 387 rows containing missing values (geom_point).
data:image/s3,"s3://crabby-images/5fe9f/5fe9f569db18de96b354f1a22c4e198e2711ab94" alt=""
ggplot(aes(x = Item_Visibility, y = Item_outlet_Sales_Count), data = bigMart2)+
geom_point(alpha=1/8, position = position_jitter(h = 0))+
xlim(0.0, 0.2)+
coord_trans(y="sqrt")
## Warning: Removed 400 rows containing missing values (geom_point).
data:image/s3,"s3://crabby-images/b86fd/b86fdb10a25010159c604841fcd746d46cd31bdc" alt=""
bigMart2.group_by_Outlet<-
bigMart2%>%
group_by(Outlet_Identifier,
Outlet_Establishment_Year,
Outlet_Age,
Outlet_Location_Type,
Outlet_Size,
Outlet_Type)%>%
summarise(mean_IOS=mean(Item_Outlet_Sales),
median_IOS=median(Item_Outlet_Sales),
sum_IOS=sum(Item_Outlet_Sales),
count_IOS=n())%>%
arrange(desc(median_IOS))
head(bigMart2.group_by_Outlet,15)
## # A tibble: 10 x 10
## # Groups: Outlet_Identifier, Outlet_Establishment_Year, Outlet_Age,
## # Outlet_Location_Type, Outlet_Size [10]
## Outlet_Identifier Outlet_Establishment_Year Outlet_Age
## <fctr> <int> <dbl>
## 1 OUT027 1985 28
## 2 OUT035 2004 9
## 3 OUT013 1987 26
## 4 OUT017 2007 6
## 5 OUT049 1999 14
## 6 OUT046 1997 16
## 7 OUT045 2002 11
## 8 OUT018 2009 4
## 9 OUT019 1985 28
## 10 OUT010 1998 15
## # ... with 7 more variables: Outlet_Location_Type <fctr>,
## # Outlet_Size <fctr>, Outlet_Type <fctr>, mean_IOS <dbl>,
## # median_IOS <dbl>, sum_IOS <dbl>, count_IOS <int>
#library(knitr)
#kable(cbind(bigMart2.group_by_Outlet, bigMart2.group_by_Outlet), "html") %>%
# kable_styling() %>%
# scroll_box(width = "500px", height = "200px")
ggplot(aes(x=Outlet_Age ,y=sum_IOS),data=bigMart2.group_by_Outlet)+
geom_point()
data:image/s3,"s3://crabby-images/d6724/d672468132f3aab4d50154df2c9d7588f0af5f83" alt=""
ggplot(aes(x=Outlet_Age ,y=sum_IOS),data=bigMart2.group_by_Outlet)+
geom_line()
data:image/s3,"s3://crabby-images/cd098/cd098665937e46d5fddf5c5574b955c087343138" alt=""
ggplot(aes(x=median_IOS ,y=mean_IOS),data=bigMart2.group_by_Outlet)+
geom_line(color="orange",size=1.1)
data:image/s3,"s3://crabby-images/7d785/7d785d0877f223a28e64ff83355841072d3d8df4" alt=""
##it doesn't work in our data
ggplot(aes(x = Item_Visibility, y = Item_outlet_Sales_Count), data = bigMart2)+
geom_point(alpha=1/8, position = position_jitter(h = 0), color = "orange")+
xlim(0.0, 0.2)+
geom_line(stat="summary",fun.y=sum)+
geom_line(stat="summary",fun.y=quantile, probs=.1,
linetype=2,color="blue")+
geom_line(stat="summary",fun.y=quantile, probs=.9,
linetype=2,color="blue")
## Warning: Ignoring unknown parameters: probs
## Warning: Ignoring unknown parameters: probs
## Warning: Removed 134 rows containing non-finite values (stat_summary).
## Warning: Removed 134 rows containing non-finite values (stat_summary).
## Warning: Removed 134 rows containing non-finite values (stat_summary).
## Warning: Removed 387 rows containing missing values (geom_point).
data:image/s3,"s3://crabby-images/9aac3/9aac3c3e3cd3f5314089a2c0334848bf9e9058f9" alt=""
cor(bigMart2$Item_Visibility,bigMart2$Item_Outlet_Sales)
## [1] -0.1286246
cor(bigMart2$Item_Visibility,bigMart2$Item_outlet_Sales_Count)
## [1] -0.1610241
cor(bigMart2$Item_Visibility,bigMart2$Item_Weight)
## [1] NA
cor(bigMart2$Item_Visibility,bigMart2$Item_outlet_Sales_Count)
## [1] -0.1610241
cor.test(bigMart2$Item_Visibility,bigMart2$Item_outlet_Sales_Count,method = "pearson")
##
## Pearson's product-moment correlation
##
## data: bigMart2$Item_Visibility and bigMart2$Item_outlet_Sales_Count
## t = -15.061, df = 8521, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1816338 -0.1402730
## sample estimates:
## cor
## -0.1610241
library("corrplot")
## corrplot 0.84 loaded
M<-cor(
bigMart2%>%
select(Item_Weight
,Item_Visibility
,Item_MRP
,Item_Outlet_Sales
,Item_Identifier_Num
,Item_outlet_Sales_Count))
corrplot(M,method="number")
data:image/s3,"s3://crabby-images/57231/572318cd4ecbee9d4296ed156dc3d7d0177468bd" alt=""
corrplot(M,method="circle")
data:image/s3,"s3://crabby-images/4e681/4e68156072a08f721678a8546c78d307fef7cee5" alt=""
ggplot(aes(x = Item_MRP, y = Item_Outlet_Sales), data = bigMart2)+
geom_point(alpha=1/8, position = position_jitter(h = 0), color = "orange")+
geom_line(stat="summary",fun.y=sum)+
geom_line(stat="summary",fun.y=quantile, probs=.1,
linetype=2,color="blue")+
geom_line(stat="summary",fun.y=quantile, probs=.9,
linetype=2,color="blue")
## Warning: Ignoring unknown parameters: probs
## Warning: Ignoring unknown parameters: probs
data:image/s3,"s3://crabby-images/56b46/56b46016ae6e047902c2d07c582882ad8b8a71e8" alt=""