In week 2 ODD data have been studied. Now we combine all data in order to have detailed analyses with collected data. Here, We need to install required packages.
install.packages("tidyverse", repos = "https://cran.r-project.org")
## Installing package into 'C:/Users/USER/Documents/R/win-library/3.5'
## (as 'lib' is unspecified)
## package 'tidyverse' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\USER\AppData\Local\Temp\RtmpM1JPym\downloaded_packages
install.packages("formattable", repos = "https://cran.r-project.org")
## Installing package into 'C:/Users/USER/Documents/R/win-library/3.5'
## (as 'lib' is unspecified)
## package 'formattable' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\USER\AppData\Local\Temp\RtmpM1JPym\downloaded_packages
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## <U+221A> ggplot2 3.1.0 <U+221A> purrr 0.2.5
## <U+221A> tibble 1.4.2 <U+221A> dplyr 0.7.7
## <U+221A> tidyr 0.8.1 <U+221A> stringr 1.3.1
## <U+221A> readr 1.1.1 <U+221A> forcats 0.3.0
## -- Conflicts ------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(formattable)
library(ggplot2)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
Here we get data from github repository with reading and assigning it to a dataframe.
#download from github
github_URL <- ("https://github.com/MEF-BDA503/pj18-elmasriomer/blob/master/car_data_aggregate.rds?raw=true")
all_data<- readRDS(url(github_URL))
#check rows,columns, also head and tail
glimpse(all_data)
## Observations: 1,490
## Variables: 12
## $ brand_name <chr> "ALFA ROMEO", "ASTON MARTIN", "AUDI", "BENTLEY", "...
## $ auto_dom <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 632, 91, 1471, 460, 0, ...
## $ auto_imp <dbl> 13, 2, 350, 0, 158, 134, 1141, 9, 3, 57, 705, 502,...
## $ auto_total <dbl> 13, 2, 350, 0, 158, 134, 1141, 9, 3, 689, 796, 197...
## $ comm_dom <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 789, 1421, 0, 0, 0, 59,...
## $ comm_imp <dbl> 0, 0, 0, 0, 0, 197, 319, 0, 0, 199, 139, 0, 39, 0,...
## $ comm_total <dbl> 0, 0, 0, 0, 0, 197, 319, 0, 0, 988, 1560, 0, 39, 0...
## $ total_dom <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1421, 1512, 1471, 460, ...
## $ total_imp <dbl> 13, 2, 350, 0, 158, 331, 1460, 9, 3, 256, 844, 502...
## $ total_total <dbl> 13, 2, 350, 0, 158, 331, 1460, 9, 3, 1677, 2356, 1...
## $ year <dbl> 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 20...
## $ month <dbl> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,...
head(all_data)
## # A tibble: 6 x 12
## brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO 0 13 13 0 0 0
## 2 ASTON MAR~ 0 2 2 0 0 0
## 3 AUDI 0 350 350 0 0 0
## 4 BENTLEY 0 0 0 0 0 0
## 5 BMW 0 158 158 0 0 0
## 6 CITROEN 0 134 134 0 197 197
## # ... with 5 more variables: total_dom <dbl>, total_imp <dbl>,
## # total_total <dbl>, year <dbl>, month <dbl>
tail(all_data)
## # A tibble: 6 x 12
## brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SSANGYONG 0 19 19 0 3 3
## 2 TATA 0 0 0 0 9 9
## 3 TOYOTA 1298 149 1447 0 34 34
## 4 VOLKSWAGEN 0 2792 2792 0 1736 1736
## 5 VOLVO 0 187 187 0 0 0
## 6 TOPLAM: 7375 15983 23358 4815 4540 9355
## # ... with 5 more variables: total_dom <dbl>, total_imp <dbl>,
## # total_total <dbl>, year <dbl>, month <dbl>
control_data <- all_data%>%
group_by (brand_name)%>%
summarize(count=n())
#View(control_data) #contol_data is used to see overall
#When we had a quick view to the aggreated data, there are some phrases in brandnames including "ODD.." and "TOPLAM".
cleaned_data <- all_data %>% filter(! (startsWith(brand_name, "ODD") | startsWith(brand_name, "TOPLAM")))%>%
group_by (brand_name)%>%
summarize(count=n())
#View(cleaned_data) #clean_data is used to see the result of filtering
#Also there are two brand name rows referred to unique brandname but spellled different that is needed to be consolidated (ASTON MARTIN, ASTON MARTİN).
all_data$brand_name <- str_replace(all_data$brand_name,"ASTON MARTİN","ASTON MARTIN")
control_data_2 <- all_data%>%filter(! (startsWith(brand_name, "ODD") | startsWith(brand_name, "TOPLAM")))%>%
group_by (brand_name)%>%
summarize(count=n())
#View(control_data_2) #control_data_2 is used to see the result of both of str_replace and filtering
final_data <- all_data %>% filter(! (startsWith(brand_name, "ODD") | startsWith(brand_name, "TOPLAM"))) #filtering is applied to actual data, re_named as final_data. We will use this.
#View(final_data)
#I would like to see the brand with highest car sales overall where both of automobile and commercial vehicle sales over 0. Then I would like to see the trend of commercial cars sales as domestic and import; also commercial car sales as domestic and import.
top_three <- final_data %>%
filter(auto_total > 0 & comm_total > 0) %>%
select(brand_name, auto_dom, auto_imp, comm_dom, comm_imp, month, year, total_total) %>%
group_by (brand_name)%>%
summarize(total_by_brand = sum(total_total))%>%
arrange(desc(total_by_brand))%>%
slice(1:3)
#View(top_three)
#The highest car sales comes from Renault. Let's see the trends commercial cars sales as domestic and import; also commercial car sales as domestic and import.
#View(final_data)
sales_monthly <- final_data %>%
group_by(year, month) %>%
summarise(total_sales=sum(total_total), total_import=sum(total_imp), total_domestic=sum(total_dom),total_automobil=sum(auto_total), total_commercial=sum(comm_total)) %>%
arrange(year, month) %>% slice(1:12)
sales_monthly$Date <- zoo::as.yearmon(paste(sales_monthly$year,sales_monthly$month), "%Y %m")
sales_monthly$DateFormated <- format(sales_monthly$Date,"%Y-%m")
#View(sales_monthly)
sales_monthly_Renault <- final_data %>%
filter(brand_name=="RENAULT") %>%
group_by(year, month) %>%
summarise(total_sales=sum(total_total), total_import=sum(total_imp), total_domestic=sum(total_dom),total_automobil=sum(auto_total), total_commercial=sum(comm_total), brand_name) %>%
arrange(year, month) %>% slice(1:12)
sales_monthly_Renault$Date <- zoo::as.yearmon(paste(sales_monthly$year,sales_monthly$month), "%Y %m")
sales_monthly_Renault$DateFormated <- format(sales_monthly$Date,"%Y-%m")
View(sales_monthly_Renault)
sales_monthly_Renault %>%
ggplot(aes(x = DateFormated, y = total_sales, group=1)) +
#ggtitle("Total Sales trend of Renault") +
geom_line() +
geom_point() +
theme(legend.position = "none", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.0, size = 10)) +
scale_linetype_discrete()+
ggtitle("Total Sales Trend of Renault")
#New solutions will be added