ODD Homework - Car Sales Data Import

Downloading Raw Data

# Create a temporary file
tmp<-tempfile(fileext=".xlsx")
# Download file from repository to the temp file
download.file("https://github.com/MEF-BDA503/pj18-muratorhan/blob/master/odd_retail_sales_2016_06.xlsx?raw=true", destfile=tmp, mode='wb')
# Read that excel file using readxl package's read_excel function. You might need to adjust the parameters (skip, col_names) according to your raw file's format.
raw_data<-readxl::read_excel(tmp,skip=7,col_names=FALSE)
# Remove the temp file
file.remove(tmp)
## [1] TRUE

[1] TRUE

# Remove the last two rows because they are irrelevant (total and empty rows)
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## <U+221A> ggplot2 3.0.0     <U+221A> purrr   0.2.5
## <U+221A> tibble  1.4.2     <U+221A> dplyr   0.7.6
## <U+221A> tidyr   0.8.1     <U+221A> stringr 1.3.1
## <U+221A> readr   1.1.1     <U+221A> forcats 0.3.0
## -- Conflicts ----------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
raw_data <- raw_data %>% slice(-c(49,50))

# Let's see our raw data
head(raw_data)
## # A tibble: 6 x 10
##   X__1          X__2  X__3  X__4  X__5  X__6  X__7  X__8  X__9 X__10
##   <chr>        <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO      NA    48    48    NA    NA     0     0    48    48
## 2 ASTON MARTIN    NA     3     3    NA    NA     0     0     3     3
## 3 AUDI            NA  1961  1961    NA    NA     0     0  1961  1961
## 4 BENTLEY         NA     0     0    NA    NA     0     0     0     0
## 5 BMW             NA  2640  2640    NA    NA     0     0  2640  2640
## 6 CHERY           NA    25    25    NA    NA     0     0    25    25

Making Data Civilized

# Use the same column names in your data.
colnames(raw_data) <- c("brand_name","auto_dom","auto_imp","auto_total","comm_dom","comm_imp","comm_total","total_dom","total_imp","total_total")
# Now we replace NA values with 0 and label the time period with year and month, so when we merge the data we won't be confused.
car_data_june_16 <- raw_data %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year=2016,month=6)

print(car_data_june_16,width=Inf)
## # A tibble: 48 x 12
##    brand_name   auto_dom auto_imp auto_total comm_dom comm_imp comm_total
##    <chr>           <dbl>    <dbl>      <dbl>    <dbl>    <dbl>      <dbl>
##  1 ALFA ROMEO          0       48         48        0        0          0
##  2 ASTON MARTIN        0        3          3        0        0          0
##  3 AUDI                0     1961       1961        0        0          0
##  4 BENTLEY             0        0          0        0        0          0
##  5 BMW                 0     2640       2640        0        0          0
##  6 CHERY               0       25         25        0        0          0
##  7 CITROEN             0     1613       1613      131      690        821
##  8 DACIA               0     3799       3799        0      429        429
##  9 DS                  0       14         14        0        0          0
## 10 FERRARI             0        1          1        0        0          0
##    total_dom total_imp total_total  year month
##        <dbl>     <dbl>       <dbl> <dbl> <dbl>
##  1         0        48          48  2016     6
##  2         0         3           3  2016     6
##  3         0      1961        1961  2016     6
##  4         0         0           0  2016     6
##  5         0      2640        2640  2016     6
##  6         0        25          25  2016     6
##  7       131      2303        2434  2016     6
##  8         0      4228        4228  2016     6
##  9         0        14          14  2016     6
## 10         0         1           1  2016     6
## # ... with 38 more rows

Saving Your Civilized Data

saveRDS(car_data_june_16,file="~/C:\\Users\\Murat\\Desktop\\BIG DATA YL\\BDA 503 DATA ANALYTICS(R)\\W2\\ODD/odd_car_sales_data_june_16.rds")
# You can read that file by readRDS and assigning to an object 
# e.g 
# rds_data <- readRDS("~/C:\Users\Murat\Desktop\BIG DATA YL\BDA 503 DATA ANALYTICS(R)\W2\ODD/odd_car_sales_data_june_16.rds")

Finishing With Some Analysis

car_data_june_16 %>% 
  filter(auto_total > 0 & comm_total > 0) %>%
  select(brand_name,total_total) %>%
  arrange(desc(total_total))
## # A tibble: 14 x 2
##    brand_name    total_total
##    <chr>               <dbl>
##  1 VOLKSWAGEN          12406
##  2 RENAULT             12004
##  3 FORD                11307
##  4 FIAT                 9486
##  5 MERCEDES-BENZ        4546
##  6 DACIA                4228
##  7 HYUNDAI              4201
##  8 PEUGEOT              3463
##  9 TOYOTA               3415
## 10 NISSAN               2705
## 11 CITROEN              2434
## 12 KIA                  1700
## 13 MITSUBISHI            490
## 14 SSANGYONG             108