Downloaded These Packages

install.packages("readxl")
## 
## The downloaded binary packages are in
##  /var/folders/qv/khc24w491dn9d18j77490j380000gn/T//RtmpyZzxEJ/downloaded_packages
install.packages("tidyverse")
## 
## The downloaded binary packages are in
##  /var/folders/qv/khc24w491dn9d18j77490j380000gn/T//RtmpyZzxEJ/downloaded_packages

Used This Libraries

library(readxl)
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.7
## ✔ tidyr   0.8.2     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

Download Data

Create a temporary file

tmp<-tempfile(fileext=".xlsx")

Download file from repository to the temp file

download.file("https://github.com/MEF-BDA503/pj18-mkaracabey/blob/master/oto_sales_analysis/2017_4_sales_oto.xlsx?raw=true",mode = 'wb',destfile=tmp)

Excel Data Read This Way

raw_data<-readxl::read_excel(tmp,skip=7,col_names=FALSE)

When I first run readxl function, it gave me error – Evaluation error: error -103 with zipfile in unzGetCurrentFileInfo then I insert mode=‘wb’ to download.file function

Remove the temp file

file.remove(tmp)
## [1] TRUE

head and tail of excel data

head(raw_data)
## # A tibble: 6 x 10
##   X__1          X__2  X__3  X__4  X__5  X__6  X__7  X__8  X__9 X__10
##   <chr>        <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO      NA    27    27    NA    NA     0     0    27    27
## 2 ASTON MARTIN    NA     2     2    NA    NA     0     0     2     2
## 3 AUDI            NA  1673  1673    NA    NA     0     0  1673  1673
## 4 BENTLEY         NA     1     1    NA    NA     0     0     1     1
## 5 BMW             NA   877   877    NA    NA     0     0   877   877
## 6 CHERY           NA     0     0    NA    NA     0     0     0     0
tail(raw_data)
## # A tibble: 6 x 10
##   X__1        X__2  X__3  X__4  X__5  X__6  X__7  X__8  X__9 X__10
##   <chr>      <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SUZUKI        NA   206   206    NA    NA     0     0   206   206
## 2 TOYOTA      1975   427  2402    NA   304   304  1975   731  2706
## 3 VOLKSWAGEN    NA  6672  6672    NA  2362  2362     0  9034  9034
## 4 VOLVO         NA   340   340    NA    NA     0     0   340   340
## 5 <NA>          NA    NA    NA    NA    NA    NA    NA    NA    NA
## 6 TOPLAM:    17486 40512 57998  9060  8930 17990 26546 49442 75988

Since the last 2 rows are blank and total, I removed these rows

raw_data <- raw_data %>% slice(-c(43,44))
tail(raw_data)
## # A tibble: 6 x 10
##   X__1       X__2  X__3  X__4  X__5  X__6  X__7  X__8  X__9 X__10
##   <chr>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SSANGYONG    NA    27    27    NA    12    12     0    39    39
## 2 SUBARU       NA   129   129    NA    NA     0     0   129   129
## 3 SUZUKI       NA   206   206    NA    NA     0     0   206   206
## 4 VOLVO        NA   340   340    NA    NA     0     0   340   340
## 5 <NA>         NA    NA    NA    NA    NA    NA    NA    NA    NA
## 6 TOPLAM:   17486 40512 57998  9060  8930 17990 26546 49442 75988
head(raw_data) 
## # A tibble: 6 x 10
##   X__1          X__2  X__3  X__4  X__5  X__6  X__7  X__8  X__9 X__10
##   <chr>        <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO      NA    27    27    NA    NA     0     0    27    27
## 2 ASTON MARTIN    NA     2     2    NA    NA     0     0     2     2
## 3 AUDI            NA  1673  1673    NA    NA     0     0  1673  1673
## 4 BENTLEY         NA     1     1    NA    NA     0     0     1     1
## 5 BMW             NA   877   877    NA    NA     0     0   877   877
## 6 CHERY           NA     0     0    NA    NA     0     0     0     0

Make Data Civilized

Define column names

colnames(raw_data) <- c("brand_name","auto_dom","auto_imp","auto_total","comm_dom","comm_imp","comm_total","total_dom","total_imp","total_total")

and remove NA values with 0 and label the time period with year and month in order to prevent confusion when we merge data ( mutate() adds new variables and preserves existing; transmute() drops existing variables )

car_data_april_17 <- raw_data %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year=2017,month=4)
print(car_data_april_17,width=Inf)
## # A tibble: 45 x 12
##    brand_name   auto_dom auto_imp auto_total comm_dom comm_imp comm_total
##    <chr>           <dbl>    <dbl>      <dbl>    <dbl>    <dbl>      <dbl>
##  1 ALFA ROMEO          0       27         27        0        0          0
##  2 ASTON MARTIN        0        2          2        0        0          0
##  3 AUDI                0     1673       1673        0        0          0
##  4 BENTLEY             0        1          1        0        0          0
##  5 BMW                 0      877        877        0        0          0
##  6 CHERY               0        0          0        0        0          0
##  7 CITROEN             0     1654       1654        0      807        807
##  8 DACIA               0     3557       3557        0      615        615
##  9 DS                  0       13         13        0        0          0
## 10 FERRARI             0        1          1        0        0          0
##    total_dom total_imp total_total  year month
##        <dbl>     <dbl>       <dbl> <dbl> <dbl>
##  1         0        27          27  2017     4
##  2         0         2           2  2017     4
##  3         0      1673        1673  2017     4
##  4         0         1           1  2017     4
##  5         0       877         877  2017     4
##  6         0         0           0  2017     4
##  7         0      2461        2461  2017     4
##  8         0      4172        4172  2017     4
##  9         0        13          13  2017     4
## 10         0         1           1  2017     4
## # ... with 35 more rows

Saving Civilized Data

One of the best methods is to save your data to an RDS or RData file. The difference is RDS can hold only one object but RData can hold many. Since we have only one data frame here we will go with RDS.

You can read that file by readRDS and assigning to an object rds_data <- readRDS("~/YOUR_OWN_PATH/odd_car_sales_data_april_17.rds")

saveRDS(car_data_april_17,file="data_april_17.rds")

Time to make some analysis

car_data_april_17 %>% 
  select(brand_name,auto_dom) %>% 
  arrange(auto_dom) %>% 
  filter(auto_dom>0)
## # A tibble: 6 x 2
##   brand_name auto_dom
##   <chr>         <dbl>
## 1 FORD           1105
## 2 HONDA          1626
## 3 HYUNDAI        2293
## 4 FIAT           4700
## 5 RENAULT        5787
## 6 TOPLAM:       17486

On above, companies of maked car sales.


On the below, Some Filter Added to show sales of car clearly in April of 2017.

car_data_april_17 %>% 
  select(brand_name,auto_dom,comm_dom,total_imp,total_dom) %>% 
  arrange(auto_dom) %>% arrange(total_imp) %>%
  filter(comm_dom>0) %>% filter(auto_dom>0) %>% filter(total_imp >0)
## # A tibble: 3 x 5
##   brand_name auto_dom comm_dom total_imp total_dom
##   <chr>         <dbl>    <dbl>     <dbl>     <dbl>
## 1 FIAT           4700     3752       787      8452
## 2 FORD           1105     4987      3114      6092
## 3 TOPLAM:       17486     9060     49442     26546