ODD ASS P2

## Load tidyverse
library(tidyverse)

## -- Attaching packages ---------------------------------------------------------------------------------------- tidyverse 1.2.1 --

## <U+221A> ggplot2 3.1.0     <U+221A> purrr   0.2.5
## <U+221A> tibble  1.4.2     <U+221A> dplyr   0.7.7
## <U+221A> tidyr   0.8.2     <U+221A> stringr 1.3.1
## <U+221A> readr   1.1.1     <U+221A> forcats 0.3.0

## -- Conflicts ------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(dplyr)
library(tidyverse)
library(ggplot2)
library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:readr':
## 
##     col_factor

Getting the .rds file

Below the rds file is downloaded from github to a local data frame (raw_data) and refined for analysis

#download from github 
raw_data <- readRDS("C:/Users/Murat/Desktop/BIG DATA YL/BDA 503 DATA ANALYTICS(R)/W4/car_data_aggregate.rds")

head(raw_data)

## # A tibble: 6 x 12
##   brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
##   <chr>         <dbl>    <dbl>      <dbl>    <dbl>    <dbl>      <dbl>
## 1 ALFA ROMEO        0       13         13        0        0          0
## 2 ASTON MAR~        0        2          2        0        0          0
## 3 AUDI              0      350        350        0        0          0
## 4 BENTLEY           0        0          0        0        0          0
## 5 BMW               0      158        158        0        0          0
## 6 CITROEN           0      134        134        0      197        197
## # ... with 5 more variables: total_dom <dbl>, total_imp <dbl>,
## #   total_total <dbl>, year <dbl>, month <dbl>

tail(raw_data)

## # A tibble: 6 x 12
##   brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
##   <chr>         <dbl>    <dbl>      <dbl>    <dbl>    <dbl>      <dbl>
## 1 SSANGYONG         0       19         19        0        3          3
## 2 TATA              0        0          0        0        9          9
## 3 TOYOTA         1298      149       1447        0       34         34
## 4 VOLKSWAGEN        0     2792       2792        0     1736       1736
## 5 VOLVO             0      187        187        0        0          0
## 6 TOPLAM:        7375    15983      23358     4815     4540       9355
## # ... with 5 more variables: total_dom <dbl>, total_imp <dbl>,
## #   total_total <dbl>, year <dbl>, month <dbl>

Making some correction on data

raw_data$brand_name = str_replace(raw_data$brand_name, "MARTIN", "MARTIN")

dim(raw_data) # find out the index of latest row number

## [1] 1490   12

Deleting the last row because of total value

raw_data <- raw_data %>% slice(-c(1490))
tail(raw_data)

## # A tibble: 6 x 12
##   brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
##   <chr>         <dbl>    <dbl>      <dbl>    <dbl>    <dbl>      <dbl>
## 1 SMART             0        0          0        0        0          0
## 2 SSANGYONG         0       19         19        0        3          3
## 3 TATA              0        0          0        0        9          9
## 4 TOYOTA         1298      149       1447        0       34         34
## 5 VOLKSWAGEN        0     2792       2792        0     1736       1736
## 6 VOLVO             0      187        187        0        0          0
## # ... with 5 more variables: total_dom <dbl>, total_imp <dbl>,
## #   total_total <dbl>, year <dbl>, month <dbl>

Removing dublicated brand names

raw_data %>%  
filter(total_total>0 & comm_total>0) %>%
distinct(sort(brand_name))

## # A tibble: 19 x 1
##    `sort(brand_name)`
##    <chr>             
##  1 CITROEN           
##  2 DACIA             
##  3 FIAT              
##  4 FORD              
##  5 HYUNDAI           
##  6 ISUZU             
##  7 IVECO             
##  8 KARSAN            
##  9 KIA               
## 10 MERCEDES-BENZ     
## 11 MITSUBISHI        
## 12 NISSAN            
## 13 PEUGEOT           
## 14 RENAULT           
## 15 SSANGYONG         
## 16 TATA              
## 17 TOPLAM:           
## 18 TOYOTA            
## 19 VOLKSWAGEN

Showing annual total sales amount of cars sorted by the most sold brand

raw_data %>% group_by(brand_name,year) %>% summarise(totalsales = sum(total_total)) %>% arrange(desc(totalsales))

## # A tibble: 141 x 3
## # Groups:   brand_name [51]
##    brand_name  year totalsales
##    <chr>      <dbl>      <dbl>
##  1 TOPLAM:     2017     268085
##  2 TOPLAM:     2016     242081
##  3 TOPLAM:     2018     209467
##  4 VOLKSWAGEN  2016     134535
##  5 RENAULT     2017     130276
##  6 RENAULT     2016     121707
##  7 FIAT        2017     120049
##  8 FORD        2017     111064
##  9 FORD        2016     109604
## 10 FIAT        2016     106106
## # ... with 131 more rows

Let’s get some visualization

Comparing total annual automobile sales of BMW and AUDI;

raw_data %>%
  group_by(brand_name, year) %>% 
  filter(brand_name %in% c("BMW", "AUDI")) %>%
  summarize(yearly_auto_total = sum(auto_total))%>%
  ggplot(data=., aes(x=year, y=yearly_auto_total, fill=brand_name)) +
  geom_bar(stat="identity", position=position_dodge())+
  labs(x="Years", y = "Number Of Sales", fill="Brands") +
  theme_minimal()

Top 4 Car Sales per Brand in 2016

raw_data %>% 
    filter(auto_total > 0 & comm_total > 0) %>%
    select(brand_name,auto_total,comm_total,total_total,year) %>%
    arrange(desc(year),desc(total_total)) %>%
    filter(year == "2016") %>%  
    group_by(brand_name) %>%
    summarize(year_total = sum(total_total)) %>%
    arrange(desc(year_total)) %>%
    top_n(4) %>% 
    mutate(prop=percent(year_total / sum(year_total))) %>%
    ggplot(data = ., aes(x="", y=prop, fill=brand_name))+ labs(x = "", y = "", title = "Top 4 Car Sales per Brand in 2016") + 
    geom_bar(width = 1, stat = "identity") + coord_polar("y", start=0) +
    theme(plot.title=element_text(hjust=0.5,face='bold',size=18)) +  
    theme_classic() + theme(axis.line = element_blank(), axis.text = element_blank(), axis.ticks = element_blank()) +       geom_text(aes(label=prop),position=position_stack(vjust = 0.5)) + guides(fill=guide_legend(title="Brand Name"))

## Selecting by year_total

ODD ASS P2

Murat Orhan

25 Kasım 2018

Getting the .rds file

Making some correction on data

Deleting the last row because of total value

Removing dublicated brand names

Showing annual total sales amount of cars sorted by the most sold brand

Let’s get some visualization