## Load tidyverse
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## <U+221A> ggplot2 3.1.0     <U+221A> purrr   0.2.5
## <U+221A> tibble  1.4.2     <U+221A> dplyr   0.7.7
## <U+221A> tidyr   0.8.2     <U+221A> stringr 1.3.1
## <U+221A> readr   1.1.1     <U+221A> forcats 0.3.0
## -- Conflicts ------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
library(tidyverse)
library(ggplot2)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor

Getting the .rds file

Below the rds file is downloaded from github to a local data frame (raw_data) and refined for analysis

#download from github 
raw_data <- readRDS("C:/Users/Murat/Desktop/BIG DATA YL/BDA 503 DATA ANALYTICS(R)/W4/car_data_aggregate.rds")

head(raw_data)
## # A tibble: 6 x 12
##   brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
##   <chr>         <dbl>    <dbl>      <dbl>    <dbl>    <dbl>      <dbl>
## 1 ALFA ROMEO        0       13         13        0        0          0
## 2 ASTON MAR~        0        2          2        0        0          0
## 3 AUDI              0      350        350        0        0          0
## 4 BENTLEY           0        0          0        0        0          0
## 5 BMW               0      158        158        0        0          0
## 6 CITROEN           0      134        134        0      197        197
## # ... with 5 more variables: total_dom <dbl>, total_imp <dbl>,
## #   total_total <dbl>, year <dbl>, month <dbl>
tail(raw_data)
## # A tibble: 6 x 12
##   brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
##   <chr>         <dbl>    <dbl>      <dbl>    <dbl>    <dbl>      <dbl>
## 1 SSANGYONG         0       19         19        0        3          3
## 2 TATA              0        0          0        0        9          9
## 3 TOYOTA         1298      149       1447        0       34         34
## 4 VOLKSWAGEN        0     2792       2792        0     1736       1736
## 5 VOLVO             0      187        187        0        0          0
## 6 TOPLAM:        7375    15983      23358     4815     4540       9355
## # ... with 5 more variables: total_dom <dbl>, total_imp <dbl>,
## #   total_total <dbl>, year <dbl>, month <dbl>

Making some correction on data

raw_data$brand_name = str_replace(raw_data$brand_name, "MARTIN", "MARTIN")

dim(raw_data) # find out the index of latest row number
## [1] 1490   12

Deleting the last row because of total value

raw_data <- raw_data %>% slice(-c(1490))
tail(raw_data)
## # A tibble: 6 x 12
##   brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
##   <chr>         <dbl>    <dbl>      <dbl>    <dbl>    <dbl>      <dbl>
## 1 SMART             0        0          0        0        0          0
## 2 SSANGYONG         0       19         19        0        3          3
## 3 TATA              0        0          0        0        9          9
## 4 TOYOTA         1298      149       1447        0       34         34
## 5 VOLKSWAGEN        0     2792       2792        0     1736       1736
## 6 VOLVO             0      187        187        0        0          0
## # ... with 5 more variables: total_dom <dbl>, total_imp <dbl>,
## #   total_total <dbl>, year <dbl>, month <dbl>

Removing dublicated brand names

raw_data %>%  
filter(total_total>0 & comm_total>0) %>%
distinct(sort(brand_name))
## # A tibble: 19 x 1
##    `sort(brand_name)`
##    <chr>             
##  1 CITROEN           
##  2 DACIA             
##  3 FIAT              
##  4 FORD              
##  5 HYUNDAI           
##  6 ISUZU             
##  7 IVECO             
##  8 KARSAN            
##  9 KIA               
## 10 MERCEDES-BENZ     
## 11 MITSUBISHI        
## 12 NISSAN            
## 13 PEUGEOT           
## 14 RENAULT           
## 15 SSANGYONG         
## 16 TATA              
## 17 TOPLAM:           
## 18 TOYOTA            
## 19 VOLKSWAGEN

Showing annual total sales amount of cars sorted by the most sold brand

raw_data %>% group_by(brand_name,year) %>% summarise(totalsales = sum(total_total)) %>% arrange(desc(totalsales))
## # A tibble: 141 x 3
## # Groups:   brand_name [51]
##    brand_name  year totalsales
##    <chr>      <dbl>      <dbl>
##  1 TOPLAM:     2017     268085
##  2 TOPLAM:     2016     242081
##  3 TOPLAM:     2018     209467
##  4 VOLKSWAGEN  2016     134535
##  5 RENAULT     2017     130276
##  6 RENAULT     2016     121707
##  7 FIAT        2017     120049
##  8 FORD        2017     111064
##  9 FORD        2016     109604
## 10 FIAT        2016     106106
## # ... with 131 more rows

Let’s get some visualization

Comparing total annual automobile sales of BMW and AUDI;

raw_data %>%
  group_by(brand_name, year) %>% 
  filter(brand_name %in% c("BMW", "AUDI")) %>%
  summarize(yearly_auto_total = sum(auto_total))%>%
  ggplot(data=., aes(x=year, y=yearly_auto_total, fill=brand_name)) +
  geom_bar(stat="identity", position=position_dodge())+
  labs(x="Years", y = "Number Of Sales", fill="Brands") +
  theme_minimal()

Top 4 Car Sales per Brand in 2016

raw_data %>% 
    filter(auto_total > 0 & comm_total > 0) %>%
    select(brand_name,auto_total,comm_total,total_total,year) %>%
    arrange(desc(year),desc(total_total)) %>%
    filter(year == "2016") %>%  
    group_by(brand_name) %>%
    summarize(year_total = sum(total_total)) %>%
    arrange(desc(year_total)) %>%
    top_n(4) %>% 
    mutate(prop=percent(year_total / sum(year_total))) %>%
    ggplot(data = ., aes(x="", y=prop, fill=brand_name))+ labs(x = "", y = "", title = "Top 4 Car Sales per Brand in 2016") + 
    geom_bar(width = 1, stat = "identity") + coord_polar("y", start=0) +
    theme(plot.title=element_text(hjust=0.5,face='bold',size=18)) +  
    theme_classic() + theme(axis.line = element_blank(), axis.text = element_blank(), axis.ticks = element_blank()) +       geom_text(aes(label=prop),position=position_stack(vjust = 0.5)) + guides(fill=guide_legend(title="Brand Name"))
## Selecting by year_total