From Raw to Civilized Data

First, July 2017 sales data on Otomotiv Distibütörleri Dernegi website was found and downloaded. The downloaded excel`s name changed as “odd_retail_sales_2017_07.xlsx”.

Download Raw Data

Our raw excel file is in our repository. We can automatically download that file and put it in a temporary file. Then we can read that excel document into R and remove the temp file.

# Create a temporary file
tmp=tempfile(fileext=".xlsx")

# Download file from repository to the temp file
download.file("https://github.com/MEF-BDA503/pj18-baysalu/blob/master/files/odd_retail_sales_2017_07.xlsx?raw=true",destfile=tmp,mode='wb')

# Read that excel file using readxl package's read_excel function. You might need to adjust the parameters (skip, col_names) according to your raw file's format.
raw_data=readxl::read_excel(tmp,skip=7,col_names=FALSE)

# Remove the temp file
file.remove(tmp)
## [1] TRUE
# Remove the last two rows because they are irrelevant (total and empty rows)
raw_data = raw_data %>% slice(-c(46,47))

# Let's see our raw data
head(raw_data)
## # A tibble: 6 x 10
##   X__1          X__2  X__3  X__4  X__5  X__6  X__7  X__8  X__9 X__10
##   <chr>        <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO      NA    18    18    NA    NA     0     0    18    18
## 2 ASTON MARTIN    NA     2     2    NA    NA     0     0     2     2
## 3 AUDI            NA  1370  1370    NA    NA     0     0  1370  1370
## 4 BENTLEY         NA     0     0    NA    NA     0     0     0     0
## 5 BMW             NA  1904  1904    NA    NA     0     0  1904  1904
## 6 CHERY           NA    NA     0    NA    NA     0     0     0     0
tail(raw_data)
## # A tibble: 6 x 10
##   X__1        X__2  X__3  X__4  X__5  X__6  X__7  X__8  X__9 X__10
##   <chr>      <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SSANGYONG     NA    32    32    NA    56    56     0    88    88
## 2 SUBARU        NA   103   103    NA    NA     0     0   103   103
## 3 SUZUKI        NA   446   446    NA    NA     0     0   446   446
## 4 TOYOTA      3647   852  4499    NA   375   375  3647  1227  4874
## 5 VOLKSWAGEN    NA  7006  7006    NA  2376  2376     0  9382  9382
## 6 VOLVO         NA   275   275    NA    NA     0     0   275   275

It’s ok but needs some work.

Make Data Civilized

In order to make the data standardized and workable, we need to define column names and remove NA values for this example.

# Add the same column names.
colnames(raw_data) = c("brand_name","auto_dom","auto_imp","auto_total","comm_dom","comm_imp","comm_total","total_dom","total_imp","total_total")
# Now we replace NA values with 0 and label the time period with year and month, so when we merge the data, we won't be confused.
car_data_jul_17 = raw_data %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year=2017,month=7)

print(car_data_jul_17,width=Inf)
## # A tibble: 45 x 12
##    brand_name   auto_dom auto_imp auto_total comm_dom comm_imp comm_total
##    <chr>           <dbl>    <dbl>      <dbl>    <dbl>    <dbl>      <dbl>
##  1 ALFA ROMEO          0       18         18        0        0          0
##  2 ASTON MARTIN        0        2          2        0        0          0
##  3 AUDI                0     1370       1370        0        0          0
##  4 BENTLEY             0        0          0        0        0          0
##  5 BMW                 0     1904       1904        0        0          0
##  6 CHERY               0        0          0        0        0          0
##  7 CITROEN             0     1339       1339        0      479        479
##  8 DACIA               0     3933       3933        0      642        642
##  9 DS                  0       38         38        0        0          0
## 10 FERRARI             0        0          0        0        0          0
##    total_dom total_imp total_total  year month
##        <dbl>     <dbl>       <dbl> <dbl> <dbl>
##  1         0        18          18  2017     7
##  2         0         2           2  2017     7
##  3         0      1370        1370  2017     7
##  4         0         0           0  2017     7
##  5         0      1904        1904  2017     7
##  6         0         0           0  2017     7
##  7         0      1818        1818  2017     7
##  8         0      4575        4575  2017     7
##  9         0        38          38  2017     7
## 10         0         0           0  2017     7
## # ... with 35 more rows

Save Your Civilized Data

One of the best methods is to save your data to an RDS or RData file. The difference is RDS can hold only one object but RData can hold many. Since we have only one data frame here we will go with RDS.

saveRDS(car_data_jul_17,file="C:/Ufuk/BDA_503_Data_Analytics_Essentials/Homeworks/Week2/odd_car_sales_data_jul_17.rds")
# You can read that file by readRDS and assigning to an object 
# e.g 
# rds_data <- readRDS("~/YOUR_OWN_PATH/odd_car_sales_data_sep_18.rds")
rds_data = readRDS("C:/Ufuk/BDA_503_Data_Analytics_Essentials/Homeworks/Week2/odd_car_sales_data_jul_17.rds")

# to check rds_data
print(rds_data,width=Inf)
## # A tibble: 45 x 12
##    brand_name   auto_dom auto_imp auto_total comm_dom comm_imp comm_total
##    <chr>           <dbl>    <dbl>      <dbl>    <dbl>    <dbl>      <dbl>
##  1 ALFA ROMEO          0       18         18        0        0          0
##  2 ASTON MARTIN        0        2          2        0        0          0
##  3 AUDI                0     1370       1370        0        0          0
##  4 BENTLEY             0        0          0        0        0          0
##  5 BMW                 0     1904       1904        0        0          0
##  6 CHERY               0        0          0        0        0          0
##  7 CITROEN             0     1339       1339        0      479        479
##  8 DACIA               0     3933       3933        0      642        642
##  9 DS                  0       38         38        0        0          0
## 10 FERRARI             0        0          0        0        0          0
##    total_dom total_imp total_total  year month
##        <dbl>     <dbl>       <dbl> <dbl> <dbl>
##  1         0        18          18  2017     7
##  2         0         2           2  2017     7
##  3         0      1370        1370  2017     7
##  4         0         0           0  2017     7
##  5         0      1904        1904  2017     7
##  6         0         0           0  2017     7
##  7         0      1818        1818  2017     7
##  8         0      4575        4575  2017     7
##  9         0        38          38  2017     7
## 10         0         0           0  2017     7
## # ... with 35 more rows
head(rds_data)
## # A tibble: 6 x 12
##   brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
##   <chr>         <dbl>    <dbl>      <dbl>    <dbl>    <dbl>      <dbl>
## 1 ALFA ROMEO        0       18         18        0        0          0
## 2 ASTON MAR~        0        2          2        0        0          0
## 3 AUDI              0     1370       1370        0        0          0
## 4 BENTLEY           0        0          0        0        0          0
## 5 BMW               0     1904       1904        0        0          0
## 6 CHERY             0        0          0        0        0          0
## # ... with 5 more variables: total_dom <dbl>, total_imp <dbl>,
## #   total_total <dbl>, year <dbl>, month <dbl>
tail(rds_data)
## # A tibble: 6 x 12
##   brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
##   <chr>         <dbl>    <dbl>      <dbl>    <dbl>    <dbl>      <dbl>
## 1 SSANGYONG         0       32         32        0       56         56
## 2 SUBARU            0      103        103        0        0          0
## 3 SUZUKI            0      446        446        0        0          0
## 4 TOYOTA         3647      852       4499        0      375        375
## 5 VOLKSWAGEN        0     7006       7006        0     2376       2376
## 6 VOLVO             0      275        275        0        0          0
## # ... with 5 more variables: total_dom <dbl>, total_imp <dbl>,
## #   total_total <dbl>, year <dbl>, month <dbl>

Finish With Some Analysis

The list of brands with no sales in July 2017:

car_data_jul_17 %>% 
  filter(auto_total == 0 & comm_total == 0) %>%
  select(-auto_dom:-month)
## # A tibble: 5 x 1
##   brand_name
##   <chr>     
## 1 BENTLEY   
## 2 CHERY     
## 3 FERRARI   
## 4 GEELY     
## 5 OTOKAR

The list of import sales of brands with both automobile and commercial vehicle sales ordered in decreasing total import sales:

car_data_jul_17 %>% 
  filter(auto_imp > 0 & comm_imp > 0) %>%
  select(-auto_dom,-auto_total,-comm_dom,-comm_total,-total_total,-total_dom,-year,-month) %>%
  transmute(brand_name,auto_imp,comm_imp,total_imp,
  more_sales = ifelse(auto_imp>comm_imp,"auto","comm")) %>%
  arrange(desc(total_imp))
## # A tibble: 14 x 5
##    brand_name    auto_imp comm_imp total_imp more_sales
##    <chr>            <dbl>    <dbl>     <dbl> <chr>     
##  1 VOLKSWAGEN        7006     2376      9382 auto      
##  2 RENAULT           3673     1499      5172 auto      
##  3 DACIA             3933      642      4575 auto      
##  4 FORD              3067      228      3295 auto      
##  5 MERCEDES-BENZ     2300      814      3114 auto      
##  6 PEUGEOT           2030      885      2915 auto      
##  7 NISSAN            2460      195      2655 auto      
##  8 HYUNDAI           1871      224      2095 auto      
##  9 CITROEN           1339      479      1818 auto      
## 10 KIA               1065      358      1423 auto      
## 11 TOYOTA             852      375      1227 auto      
## 12 FIAT               117      526       643 comm      
## 13 MITSUBISHI          19      281       300 comm      
## 14 SSANGYONG           32       56        88 comm