Week2 Assignment

Preparation

I started with downloading ODD May 2018 Retail Sales Data from its website Then I put this data to my github repository, changing its name to odd_retail_sales_2018_05.xlsx

First I installed and loaded the readxl & tidyverse package

#install.packages("readxl")
#install.packages("tidyverse")

library(readxl)
library(tidyverse)

## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --

## <U+221A> ggplot2 3.0.0     <U+221A> purrr   0.2.5
## <U+221A> tibble  1.4.2     <U+221A> dplyr   0.7.7
## <U+221A> tidyr   0.8.1     <U+221A> stringr 1.3.1
## <U+221A> readr   1.1.1     <U+221A> forcats 0.3.0

## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Download Data

Create a temporary file

tmp<-tempfile(fileext=".xlsx")

Download file from repository to the temp file

download.file("https://github.com/MEF-BDA503/pj18-EmreKemerci/blob/master/AssignmentWeek2/odd_retail_sales_2018_05.xlsx?raw=true",mode = 'wb',destfile=tmp)

Read that excel file using readxl package’s read_excel function. You might need to adjust the parameters (skip, col_names) according to your raw file’s format.

raw_data<-readxl::read_excel(tmp,skip=7,col_names=FALSE)

When I first run readxl function, it gave me error – Evaluation error: error -103 with zipfile in unzGetCurrentFileInfo then I insert mode=‘wb’ to download.file function

Remove the temp file

file.remove(tmp)

## [1] TRUE

I checked head and tail of my data

head(raw_data)

## # A tibble: 6 x 10
##   X__1          X__2  X__3  X__4  X__5  X__6  X__7  X__8  X__9 X__10
##   <chr>        <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO      NA    25    25    NA    NA     0     0    25    25
## 2 ASTON MARTIN    NA     1     1    NA    NA     0     0     1     1
## 3 AUDI            NA  1630  1630    NA    NA     0     0  1630  1630
## 4 BENTLEY         NA     1     1    NA    NA     0     0     1     1
## 5 BMW             NA  1910  1910    NA    NA     0     0  1910  1910
## 6 CITROEN         NA   980   980    NA   634   634     0  1614  1614

tail(raw_data)

## # A tibble: 6 x 10
##   X__1        X__2  X__3  X__4  X__5  X__6  X__7  X__8  X__9 X__10
##   <chr>      <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SUZUKI        NA   383   383    NA    NA     0     0   383   383
## 2 TOYOTA      2438   370  2808    NA   352   352  2438   722  3160
## 3 VOLKSWAGEN    NA  5913  5913    NA  1931  1931     0  7844  7844
## 4 VOLVO         NA   590   590    NA    NA     0     0   590   590
## 5 <NA>          NA    NA    NA    NA    NA    NA    NA    NA    NA
## 6 TOPLAM:    19008 38219 57227  8230  7298 15528 27238 45517 72755

Since the last 2 rows are blank and total, I removed these rows

raw_data <- raw_data %>% slice(-c(43,44))

## Warning: package 'bindrcpp' was built under R version 3.4.4

tail(raw_data)

## # A tibble: 6 x 10
##   X__1        X__2  X__3  X__4  X__5  X__6  X__7  X__8  X__9 X__10
##   <chr>      <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SSANGYONG     NA    19    19    NA    12    12     0    31    31
## 2 SUBARU        NA   144   144    NA    NA     0     0   144   144
## 3 SUZUKI        NA   383   383    NA    NA     0     0   383   383
## 4 TOYOTA      2438   370  2808    NA   352   352  2438   722  3160
## 5 VOLKSWAGEN    NA  5913  5913    NA  1931  1931     0  7844  7844
## 6 VOLVO         NA   590   590    NA    NA     0     0   590   590

head(raw_data)

## # A tibble: 6 x 10
##   X__1          X__2  X__3  X__4  X__5  X__6  X__7  X__8  X__9 X__10
##   <chr>        <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO      NA    25    25    NA    NA     0     0    25    25
## 2 ASTON MARTIN    NA     1     1    NA    NA     0     0     1     1
## 3 AUDI            NA  1630  1630    NA    NA     0     0  1630  1630
## 4 BENTLEY         NA     1     1    NA    NA     0     0     1     1
## 5 BMW             NA  1910  1910    NA    NA     0     0  1910  1910
## 6 CITROEN         NA   980   980    NA   634   634     0  1614  1614

Make Data Civilized

Define column names

colnames(raw_data) <- c("brand_name","auto_dom","auto_imp","auto_total","comm_dom","comm_imp","comm_total","total_dom","total_imp","total_total")

and remove NA values with 0 and label the time period with year and month in order to prevent confusion when we merge data ( mutate() adds new variables and preserves existing; transmute() drops existing variables )

car_data_may_18 <- raw_data %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year=2018,month=5)

print(car_data_may_18,width=Inf)

## # A tibble: 42 x 12
##    brand_name   auto_dom auto_imp auto_total comm_dom comm_imp comm_total
##    <chr>           <dbl>    <dbl>      <dbl>    <dbl>    <dbl>      <dbl>
##  1 ALFA ROMEO          0       25         25        0        0          0
##  2 ASTON MARTIN        0        1          1        0        0          0
##  3 AUDI                0     1630       1630        0        0          0
##  4 BENTLEY             0        1          1        0        0          0
##  5 BMW                 0     1910       1910        0        0          0
##  6 CITROEN             0      980        980        0      634        634
##  7 DACIA               0     3193       3193        0      328        328
##  8 DS                  0        9          9        0        0          0
##  9 FERRARI             0        1          1        0        0          0
## 10 FIAT             5309      178       5487     2731      362       3093
##    total_dom total_imp total_total  year month
##        <dbl>     <dbl>       <dbl> <dbl> <dbl>
##  1         0        25          25  2018     5
##  2         0         1           1  2018     5
##  3         0      1630        1630  2018     5
##  4         0         1           1  2018     5
##  5         0      1910        1910  2018     5
##  6         0      1614        1614  2018     5
##  7         0      3521        3521  2018     5
##  8         0         9           9  2018     5
##  9         0         1           1  2018     5
## 10      8040       540        8580  2018     5
## # ... with 32 more rows

Saving Civilized Data

One of the best methods is to save your data to an RDS or RData file. The difference is RDS can hold only one object but RData can hold many. Since we have only one data frame here we will go with RDS.

You can read that file by readRDS and assigning to an object rds_data <- readRDS("~/YOUR_OWN_PATH/odd_car_sales_data_sep_18.rds")

saveRDS(car_data_may_18,file="data_may_18.rds")

Time to make some analysis

car_data_may_18 %>% 
  select(brand_name,auto_dom) %>% 
  arrange(desc(auto_dom)) %>% 
  filter(auto_dom>0)

## # A tibble: 6 x 2
##   brand_name auto_dom
##   <chr>         <dbl>
## 1 RENAULT        6586
## 2 FIAT           5309
## 3 TOYOTA         2438
## 4 HYUNDAI        2142
## 5 HONDA          2132
## 6 FORD            401

There are only 6 auto brands domesticly produced, check also commercial ones

car_data_may_18 %>% 
  select(brand_name,comm_dom) %>% 
  arrange(desc(comm_dom)) %>% 
  filter(comm_dom>0)

## # A tibble: 5 x 2
##   brand_name comm_dom
##   <chr>         <dbl>
## 1 FORD           5136
## 2 FIAT           2731
## 3 ISUZU           265
## 4 KARSAN           76
## 5 MITSUBISHI       22

car_data_may_18 %>% 
  select(brand_name,auto_dom,comm_dom) %>% 
  arrange(desc(auto_dom)) %>% 
  filter(comm_dom>0) %>% filter(auto_dom>0)

## # A tibble: 2 x 3
##   brand_name auto_dom comm_dom
##   <chr>         <dbl>    <dbl>
## 1 FIAT           5309     2731
## 2 FORD            401     5136

Only Ford & Fiat are the domestic producers of both auto & commercial vehicle. What about imports?

car_data_may_18 %>% 
  select(brand_name,auto_imp,comm_imp) %>% 
  arrange(desc(auto_imp)) %>% 
  filter(comm_imp>0) %>% filter(auto_imp>0)

## # A tibble: 14 x 3
##    brand_name    auto_imp comm_imp
##    <chr>            <dbl>    <dbl>
##  1 VOLKSWAGEN        5913     1931
##  2 PEUGEOT           3819      404
##  3 DACIA             3193      328
##  4 FORD              2803      216
##  5 RENAULT           2694      969
##  6 NISSAN            2508      116
##  7 HYUNDAI           2463      199
##  8 MERCEDES-BENZ     2153     1204
##  9 CITROEN            980      634
## 10 KIA                742      221
## 11 TOYOTA             370      352
## 12 FIAT               178      362
## 13 MITSUBISHI          57      184
## 14 SSANGYONG           19       12

I want to see if there is any brand which has sales value both in domestic and import columns.

car_data_may_18 %>% 
  select(brand_name,total_dom,total_imp) %>% 
   
  filter(total_dom>0) %>% filter(total_imp>0)

## # A tibble: 8 x 3
##   brand_name total_dom total_imp
##   <chr>          <dbl>     <dbl>
## 1 FIAT            8040       540
## 2 FORD            5537      3019
## 3 HONDA           2132       588
## 4 HYUNDAI         2142      2662
## 5 ISUZU            265        33
## 6 MITSUBISHI        22       241
## 7 RENAULT         6586      3663
## 8 TOYOTA          2438       722

8 brands have both sales from domestic production and import,

Which brand’s import sale has the highest portion from its total sales?

car_data_may_18 %>% 
  select(brand_name,total_dom,total_imp) %>% 
   
  filter(total_dom>0) %>% filter(total_imp>0) %>% 
  mutate(percentage=total_imp/(total_dom+total_imp)) %>% arrange(desc(percentage))

## # A tibble: 8 x 4
##   brand_name total_dom total_imp percentage
##   <chr>          <dbl>     <dbl>      <dbl>
## 1 MITSUBISHI        22       241     0.916 
## 2 HYUNDAI         2142      2662     0.554 
## 3 RENAULT         6586      3663     0.357 
## 4 FORD            5537      3019     0.353 
## 5 TOYOTA          2438       722     0.228 
## 6 HONDA           2132       588     0.216 
## 7 ISUZU            265        33     0.111 
## 8 FIAT            8040       540     0.0629