install.packages("readxl")
## 
## The downloaded binary packages are in
##  /var/folders/qv/khc24w491dn9d18j77490j380000gn/T//RtmpPXCd6Y/downloaded_packages
install.packages("tidyverse")
## 
## The downloaded binary packages are in
##  /var/folders/qv/khc24w491dn9d18j77490j380000gn/T//RtmpPXCd6Y/downloaded_packages
library(readxl)
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.7
## ✔ tidyr   0.8.2     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

Download Data

Create a temporary file

tmp<-tempfile(fileext=".xlsx")

Download file from repository to the temp file

download.file("https://github.com/MEF-BDA503/pj18-mehmetakk/blob/master/odd_retail_sales_2016_09.xlsx?raw=true",mode = 'wb',destfile=tmp)

Read that excel file using readxl package’s read_excel function. You might need to adjust the parameters (skip, col_names) according to your raw file’s format.

raw_data<-readxl::read_excel(tmp,skip=7,col_names=FALSE)

When I first run readxl function, it gave me error – Evaluation error: error -103 with zipfile in unzGetCurrentFileInfo then I insert mode=‘wb’ to download.file function

Remove the temp file

file.remove(tmp)
## [1] TRUE

I checked head and tail of my data

head(raw_data)
## # A tibble: 6 x 10
##   X__1          X__2  X__3  X__4  X__5  X__6  X__7  X__8  X__9 X__10
##   <chr>        <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO      NA    32    32    NA    NA     0     0    32    32
## 2 ASTON MARTIN    NA     4     4    NA    NA     0     0     4     4
## 3 AUDI            NA  1541  1541    NA    NA     0     0  1541  1541
## 4 BENTLEY         NA     3     3    NA    NA     0     0     3     3
## 5 BMW             NA  1806  1806    NA    NA     0     0  1806  1806
## 6 CHERY           NA    NA     0    NA    NA     0     0     0     0
tail(raw_data)
## # A tibble: 6 x 10
##   X__1        X__2  X__3  X__4  X__5  X__6  X__7  X__8  X__9 X__10
##   <chr>      <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 TATA          NA    NA     0    NA    NA     0     0     0     0
## 2 TOYOTA      2069   858  2927    NA   543   543  2069  1401  3470
## 3 VOLKSWAGEN    NA  8028  8028    NA  2284  2284     0 10312 10312
## 4 VOLVO         NA   222   222    NA    NA     0     0   222   222
## 5 <NA>          NA    NA    NA    NA    NA    NA    NA    NA    NA
## 6 TOPLAM:    10971 40369 51340  8374  7879 16253 19345 48248 67593

Since the last 2 rows are blank and total, I removed these rows

raw_data <- raw_data %>% slice(-c(43,44))
tail(raw_data)
## # A tibble: 6 x 10
##   X__1        X__2  X__3  X__4  X__5  X__6  X__7  X__8  X__9 X__10
##   <chr>      <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 TATA          NA    NA     0    NA    NA     0     0     0     0
## 2 TOYOTA      2069   858  2927    NA   543   543  2069  1401  3470
## 3 VOLKSWAGEN    NA  8028  8028    NA  2284  2284     0 10312 10312
## 4 VOLVO         NA   222   222    NA    NA     0     0   222   222
## 5 <NA>          NA    NA    NA    NA    NA    NA    NA    NA    NA
## 6 TOPLAM:    10971 40369 51340  8374  7879 16253 19345 48248 67593
head(raw_data) 
## # A tibble: 6 x 10
##   X__1          X__2  X__3  X__4  X__5  X__6  X__7  X__8  X__9 X__10
##   <chr>        <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO      NA    32    32    NA    NA     0     0    32    32
## 2 ASTON MARTIN    NA     4     4    NA    NA     0     0     4     4
## 3 AUDI            NA  1541  1541    NA    NA     0     0  1541  1541
## 4 BENTLEY         NA     3     3    NA    NA     0     0     3     3
## 5 BMW             NA  1806  1806    NA    NA     0     0  1806  1806
## 6 CHERY           NA    NA     0    NA    NA     0     0     0     0

Make Data Civilized

Define column names

colnames(raw_data) <- c("brand_name","auto_dom","auto_imp","auto_total","comm_dom","comm_imp","comm_total","total_dom","total_imp","total_total")

and remove NA values with 0 and label the time period with year and month in order to prevent confusion when we merge data ( mutate() adds new variables and preserves existing; transmute() drops existing variables )

car_data_sep_16 <- raw_data %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year=2016,month=9)
print(car_data_sep_16,width=Inf)
## # A tibble: 48 x 12
##    brand_name   auto_dom auto_imp auto_total comm_dom comm_imp comm_total
##    <chr>           <dbl>    <dbl>      <dbl>    <dbl>    <dbl>      <dbl>
##  1 ALFA ROMEO          0       32         32        0        0          0
##  2 ASTON MARTIN        0        4          4        0        0          0
##  3 AUDI                0     1541       1541        0        0          0
##  4 BENTLEY             0        3          3        0        0          0
##  5 BMW                 0     1806       1806        0        0          0
##  6 CHERY               0        0          0        0        0          0
##  7 CITROEN             0     1519       1519       88      528        616
##  8 DACIA               0     3116       3116        0      502        502
##  9 DS                  0       32         32        0        0          0
## 10 FERRARI             0        1          1        0        0          0
##    total_dom total_imp total_total  year month
##        <dbl>     <dbl>       <dbl> <dbl> <dbl>
##  1         0        32          32  2016     9
##  2         0         4           4  2016     9
##  3         0      1541        1541  2016     9
##  4         0         3           3  2016     9
##  5         0      1806        1806  2016     9
##  6         0         0           0  2016     9
##  7        88      2047        2135  2016     9
##  8         0      3618        3618  2016     9
##  9         0        32          32  2016     9
## 10         0         1           1  2016     9
## # ... with 38 more rows

Saving Civilized Data

One of the best methods is to save your data to an RDS or RData file. The difference is RDS can hold only one object but RData can hold many. Since we have only one data frame here we will go with RDS.

You can read that file by readRDS and assigning to an object rds_data <- readRDS("~/YOUR_OWN_PATH/odd_car_sales_data_sep_18.rds")

saveRDS(car_data_sep_16,file="car_data_sep_16.rds")

Time to make some analysis

car_data_sep_16 %>% 
  select(brand_name,auto_dom) %>% 
  arrange(desc(auto_dom)) %>% 
  filter(auto_dom>0)
## # A tibble: 7 x 2
##   brand_name auto_dom
##   <chr>         <dbl>
## 1 TOPLAM:       10971
## 2 FIAT           3491
## 3 RENAULT        2806
## 4 TOYOTA         2069
## 5 HYUNDAI        1778
## 6 FORD            438
## 7 HONDA           389