ODD 02/2018 Retail Sales Data Analysis

Downloading the row data

library(tidyverse)

## -- Attaching packages ---------------------------------------------------------------------------- tidyverse 1.2.1 --

## <U+221A> ggplot2 3.0.0     <U+221A> purrr   0.2.5
## <U+221A> tibble  1.4.2     <U+221A> dplyr   0.7.6
## <U+221A> tidyr   0.8.1     <U+221A> stringr 1.3.1
## <U+221A> readr   1.1.1     <U+221A> forcats 0.3.0

## -- Conflicts ------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Create a temporary file
tmp<-tempfile(fileext=".xlsx")
# Download file from repository to the temp file
download.file("https://github.com/MEF-BDA503/pj18-aturhal/blob/master/odd_retail_sales_2018_02.xlsx?raw=true",destfile=tmp, mode="wb")
# Read that excel file using readxl package's read_excel function. You might need to adjust the parameters (skip, col_names) according to your raw file's format.
raw_data<-readxl::read_excel(tmp,skip=7,col_names=FALSE)
# # Remove the temp file
file.remove(tmp)

## [1] TRUE

Civilize the data

# Remove the last two rows because they are irrelevant (total and empty rows)
raw_data <- raw_data %>% slice(-c(43,44))

# Let's see our raw data
head(raw_data)

## # A tibble: 6 x 10
##   X__1          X__2  X__3  X__4  X__5  X__6  X__7  X__8  X__9 X__10
##   <chr>        <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO      NA    14    14    NA    NA     0     0    14    14
## 2 ASTON MARTIN    NA     1     1    NA    NA     0     0     1     1
## 3 AUDI            NA  1033  1033    NA    NA     0     0  1033  1033
## 4 BENTLEY         NA     3     3    NA    NA     0     0     3     3
## 5 BMW              0  1003  1003    NA    NA     0     0  1003  1003
## 6 CHERY           NA    NA     0    NA    NA     0     0     0     0

# Use the same column names in your data.
colnames(raw_data) <- c("brand_name","auto_dom","auto_imp","auto_total","comm_dom","comm_imp","comm_total","total_dom","total_imp","total_total")
# Now we replace NA values with 0 and label the time period with year and month, so when we merge the data we won't be confused.
car_data_feb_18 <- raw_data %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year=2018,month=2)

print(car_data_feb_18,width=Inf)

## # A tibble: 43 x 12
##    brand_name   auto_dom auto_imp auto_total comm_dom comm_imp comm_total
##    <chr>           <dbl>    <dbl>      <dbl>    <dbl>    <dbl>      <dbl>
##  1 ALFA ROMEO          0       14         14        0        0          0
##  2 ASTON MARTIN        0        1          1        0        0          0
##  3 AUDI                0     1033       1033        0        0          0
##  4 BENTLEY             0        3          3        0        0          0
##  5 BMW                 0     1003       1003        0        0          0
##  6 CHERY               0        0          0        0        0          0
##  7 CITROEN             0     1128       1128        0      291        291
##  8 DACIA               0     1298       1298        0      302        302
##  9 DS                  0        0          0        0        0          0
## 10 FERRARI             0        1          1        0        0          0
##    total_dom total_imp total_total  year month
##        <dbl>     <dbl>       <dbl> <dbl> <dbl>
##  1         0        14          14  2018     2
##  2         0         1           1  2018     2
##  3         0      1033        1033  2018     2
##  4         0         3           3  2018     2
##  5         0      1003        1003  2018     2
##  6         0         0           0  2018     2
##  7         0      1419        1419  2018     2
##  8         0      1600        1600  2018     2
##  9         0         0           0  2018     2
## 10         0         1           1  2018     2
## # ... with 33 more rows

Save the data

saveRDS(car_data_feb_18,file="~/Data Science/Big Data Essentials/odd_car_sales_data_feb_18.rds")
# You can read that file by readRDS and assigning to an object 
# e.g 
#rds_data <- readRDS("~/Data Science/Big Data Essentials/odd_car_sales_data_feb_18.rds")

Analyze the data

#List of total sales of brands that have sold more automobiles than commercial vehicles ordered in decreasing total sales
car_data_feb_18 %>%
  slice(-c(43,44)) %>%
  filter(auto_total > comm_total) %>%
  select(brand_name,auto_total, comm_total) %>%
  arrange(desc(auto_total))

## # A tibble: 32 x 3
##    brand_name auto_total comm_total
##    <chr>           <dbl>      <dbl>
##  1 RENAULT          6552        793
##  2 VOLKSWAGEN       3763       1516
##  3 HYUNDAI          2920         85
##  4 SKODA            2161          0
##  5 TOYOTA           1906        267
##  6 HONDA            1720          0
##  7 PEUGEOT          1625        277
##  8 OPEL             1568          0
##  9 NISSAN           1470        154
## 10 DACIA            1298        302
## # ... with 22 more rows

#List of total automobile sales of brands and their percentages to total respectively ordered in decreasing percentages 
car_data_feb_18 %>%
  slice(-c(43,44)) %>%
  transmute(brand_name, auto_total, auto_percents = round(auto_total/sum(auto_total)*100, 2)) %>%
  arrange(desc(auto_percents))

## # A tibble: 42 x 3
##    brand_name auto_total auto_percents
##    <chr>           <dbl>         <dbl>
##  1 RENAULT          6552         18.4 
##  2 VOLKSWAGEN       3763         10.6 
##  3 HYUNDAI          2920          8.2 
##  4 FIAT             2377          6.68
##  5 SKODA            2161          6.07
##  6 TOYOTA           1906          5.35
##  7 FORD             1770          4.97
##  8 HONDA            1720          4.83
##  9 PEUGEOT          1625          4.56
## 10 OPEL             1568          4.4 
## # ... with 32 more rows

ODD 02/2018 Retail Sales Data Analysis

Ahmet Turhal

Oct 21, 2018