## Load tidyverse
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## <U+221A> ggplot2 3.1.0 <U+221A> purrr 0.2.5
## <U+221A> tibble 1.4.2 <U+221A> dplyr 0.7.7
## <U+221A> tidyr 0.8.2 <U+221A> stringr 1.3.1
## <U+221A> readr 1.1.1 <U+221A> forcats 0.3.0
## -- Conflicts ------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(tidyverse)
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
Below the rds file is downloaded from github to a local data frame (raw_data) and refined for analysis
#download from github
raw_data <- readRDS("C:/Users/Murat/Desktop/BIG DATA YL/BDA 503 DATA ANALYTICS(R)/W4/car_data_aggregate.rds")
head(raw_data)
## # A tibble: 6 x 12
## brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO 0 13 13 0 0 0
## 2 ASTON MAR~ 0 2 2 0 0 0
## 3 AUDI 0 350 350 0 0 0
## 4 BENTLEY 0 0 0 0 0 0
## 5 BMW 0 158 158 0 0 0
## 6 CITROEN 0 134 134 0 197 197
## # ... with 5 more variables: total_dom <dbl>, total_imp <dbl>,
## # total_total <dbl>, year <dbl>, month <dbl>
tail(raw_data)
## # A tibble: 6 x 12
## brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SSANGYONG 0 19 19 0 3 3
## 2 TATA 0 0 0 0 9 9
## 3 TOYOTA 1298 149 1447 0 34 34
## 4 VOLKSWAGEN 0 2792 2792 0 1736 1736
## 5 VOLVO 0 187 187 0 0 0
## 6 TOPLAM: 7375 15983 23358 4815 4540 9355
## # ... with 5 more variables: total_dom <dbl>, total_imp <dbl>,
## # total_total <dbl>, year <dbl>, month <dbl>
raw_data$brand_name = str_replace(raw_data$brand_name, "MARTIN", "MARTIN")
dim(raw_data) # find out the index of latest row number
## [1] 1490 12
raw_data <- raw_data %>% slice(-c(1490))
tail(raw_data)
## # A tibble: 6 x 12
## brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SMART 0 0 0 0 0 0
## 2 SSANGYONG 0 19 19 0 3 3
## 3 TATA 0 0 0 0 9 9
## 4 TOYOTA 1298 149 1447 0 34 34
## 5 VOLKSWAGEN 0 2792 2792 0 1736 1736
## 6 VOLVO 0 187 187 0 0 0
## # ... with 5 more variables: total_dom <dbl>, total_imp <dbl>,
## # total_total <dbl>, year <dbl>, month <dbl>
raw_data %>%
filter(total_total>0 & comm_total>0) %>%
distinct(sort(brand_name))
## # A tibble: 19 x 1
## `sort(brand_name)`
## <chr>
## 1 CITROEN
## 2 DACIA
## 3 FIAT
## 4 FORD
## 5 HYUNDAI
## 6 ISUZU
## 7 IVECO
## 8 KARSAN
## 9 KIA
## 10 MERCEDES-BENZ
## 11 MITSUBISHI
## 12 NISSAN
## 13 PEUGEOT
## 14 RENAULT
## 15 SSANGYONG
## 16 TATA
## 17 TOPLAM:
## 18 TOYOTA
## 19 VOLKSWAGEN
raw_data %>% group_by(brand_name,year) %>% summarise(totalsales = sum(total_total)) %>% arrange(desc(totalsales))
## # A tibble: 141 x 3
## # Groups: brand_name [51]
## brand_name year totalsales
## <chr> <dbl> <dbl>
## 1 TOPLAM: 2017 268085
## 2 TOPLAM: 2016 242081
## 3 TOPLAM: 2018 209467
## 4 VOLKSWAGEN 2016 134535
## 5 RENAULT 2017 130276
## 6 RENAULT 2016 121707
## 7 FIAT 2017 120049
## 8 FORD 2017 111064
## 9 FORD 2016 109604
## 10 FIAT 2016 106106
## # ... with 131 more rows
Comparing total annual automobile sales of BMW and AUDI;
raw_data %>%
group_by(brand_name, year) %>%
filter(brand_name %in% c("BMW", "AUDI")) %>%
summarize(yearly_auto_total = sum(auto_total))%>%
ggplot(data=., aes(x=year, y=yearly_auto_total, fill=brand_name)) +
geom_bar(stat="identity", position=position_dodge())+
labs(x="Years", y = "Number Of Sales", fill="Brands") +
theme_minimal()
Top 4 Car Sales per Brand in 2016
raw_data %>%
filter(auto_total > 0 & comm_total > 0) %>%
select(brand_name,auto_total,comm_total,total_total,year) %>%
arrange(desc(year),desc(total_total)) %>%
filter(year == "2016") %>%
group_by(brand_name) %>%
summarize(year_total = sum(total_total)) %>%
arrange(desc(year_total)) %>%
top_n(4) %>%
mutate(prop=percent(year_total / sum(year_total))) %>%
ggplot(data = ., aes(x="", y=prop, fill=brand_name))+ labs(x = "", y = "", title = "Top 4 Car Sales per Brand in 2016") +
geom_bar(width = 1, stat = "identity") + coord_polar("y", start=0) +
theme(plot.title=element_text(hjust=0.5,face='bold',size=18)) +
theme_classic() + theme(axis.line = element_blank(), axis.text = element_blank(), axis.ticks = element_blank()) + geom_text(aes(label=prop),position=position_stack(vjust = 0.5)) + guides(fill=guide_legend(title="Brand Name"))
## Selecting by year_total