ODD_aggregate

Install Package

In week 2 ODD data have been studied. Now we combine all data in order to have detailed analyses with collected data. Here, We need to install required packages.

install.packages("tidyverse", repos = "https://cran.r-project.org")

## Installing package into 'C:/Users/USER/Documents/R/win-library/3.5'
## (as 'lib' is unspecified)

## package 'tidyverse' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\USER\AppData\Local\Temp\RtmpM1JPym\downloaded_packages

install.packages("formattable", repos = "https://cran.r-project.org")

## Installing package into 'C:/Users/USER/Documents/R/win-library/3.5'
## (as 'lib' is unspecified)

## package 'formattable' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\USER\AppData\Local\Temp\RtmpM1JPym\downloaded_packages

library(tidyverse)

## -- Attaching packages ---------------------------------------------------------------------------------------------- tidyverse 1.2.1 --

## <U+221A> ggplot2 3.1.0     <U+221A> purrr   0.2.5
## <U+221A> tibble  1.4.2     <U+221A> dplyr   0.7.7
## <U+221A> tidyr   0.8.1     <U+221A> stringr 1.3.1
## <U+221A> readr   1.1.1     <U+221A> forcats 0.3.0

## -- Conflicts ------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(dplyr)
library(formattable)
library(ggplot2)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following object is masked from 'package:base':
## 
##     date

Download Raw Data

Here we get data from github repository with reading and assigning it to a dataframe.

#download from github 
github_URL <- ("https://github.com/MEF-BDA503/pj18-elmasriomer/blob/master/car_data_aggregate.rds?raw=true")
all_data<- readRDS(url(github_URL))
#check rows,columns, also head and tail
glimpse(all_data)

## Observations: 1,490
## Variables: 12
## $ brand_name  <chr> "ALFA ROMEO", "ASTON MARTIN", "AUDI", "BENTLEY", "...
## $ auto_dom    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 632, 91, 1471, 460, 0, ...
## $ auto_imp    <dbl> 13, 2, 350, 0, 158, 134, 1141, 9, 3, 57, 705, 502,...
## $ auto_total  <dbl> 13, 2, 350, 0, 158, 134, 1141, 9, 3, 689, 796, 197...
## $ comm_dom    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 789, 1421, 0, 0, 0, 59,...
## $ comm_imp    <dbl> 0, 0, 0, 0, 0, 197, 319, 0, 0, 199, 139, 0, 39, 0,...
## $ comm_total  <dbl> 0, 0, 0, 0, 0, 197, 319, 0, 0, 988, 1560, 0, 39, 0...
## $ total_dom   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1421, 1512, 1471, 460, ...
## $ total_imp   <dbl> 13, 2, 350, 0, 158, 331, 1460, 9, 3, 256, 844, 502...
## $ total_total <dbl> 13, 2, 350, 0, 158, 331, 1460, 9, 3, 1677, 2356, 1...
## $ year        <dbl> 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 20...
## $ month       <dbl> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,...

head(all_data)

## # A tibble: 6 x 12
##   brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
##   <chr>         <dbl>    <dbl>      <dbl>    <dbl>    <dbl>      <dbl>
## 1 ALFA ROMEO        0       13         13        0        0          0
## 2 ASTON MAR~        0        2          2        0        0          0
## 3 AUDI              0      350        350        0        0          0
## 4 BENTLEY           0        0          0        0        0          0
## 5 BMW               0      158        158        0        0          0
## 6 CITROEN           0      134        134        0      197        197
## # ... with 5 more variables: total_dom <dbl>, total_imp <dbl>,
## #   total_total <dbl>, year <dbl>, month <dbl>

tail(all_data)

## # A tibble: 6 x 12
##   brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
##   <chr>         <dbl>    <dbl>      <dbl>    <dbl>    <dbl>      <dbl>
## 1 SSANGYONG         0       19         19        0        3          3
## 2 TATA              0        0          0        0        9          9
## 3 TOYOTA         1298      149       1447        0       34         34
## 4 VOLKSWAGEN        0     2792       2792        0     1736       1736
## 5 VOLVO             0      187        187        0        0          0
## 6 TOPLAM:        7375    15983      23358     4815     4540       9355
## # ... with 5 more variables: total_dom <dbl>, total_imp <dbl>,
## #   total_total <dbl>, year <dbl>, month <dbl>

Analyzing Overall Dataset and Cleaning Data

control_data <- all_data%>%
  group_by (brand_name)%>%
  summarize(count=n())
#View(control_data) #contol_data is used to see overall

#When we had a quick view to the aggreated data, there are some phrases in brandnames including "ODD.." and "TOPLAM". 

cleaned_data <- all_data %>% filter(! (startsWith(brand_name, "ODD") | startsWith(brand_name, "TOPLAM")))%>%
  group_by (brand_name)%>%
  summarize(count=n())
#View(cleaned_data) #clean_data is used to see the result of filtering

#Also there are two brand name rows referred to unique brandname  but spellled different that is needed to be consolidated (ASTON MARTIN, ASTON MARTİN).
all_data$brand_name <- str_replace(all_data$brand_name,"ASTON MARTİN","ASTON MARTIN")

control_data_2 <- all_data%>%filter(! (startsWith(brand_name, "ODD") | startsWith(brand_name, "TOPLAM")))%>%
  group_by (brand_name)%>%
  summarize(count=n())
#View(control_data_2) #control_data_2 is used to see the result of both of str_replace and filtering

final_data <- all_data %>% filter(! (startsWith(brand_name, "ODD") | startsWith(brand_name, "TOPLAM"))) #filtering is applied to actual data, re_named as final_data. We will use this. 
#View(final_data)

Data Analyze

#I would like to see the brand with highest car sales overall where both of automobile and commercial vehicle sales over 0. Then I would like to see the trend of commercial cars sales as domestic and import; also commercial car sales as domestic and import.

top_three <- final_data %>% 
  filter(auto_total > 0 & comm_total > 0) %>%
  select(brand_name, auto_dom, auto_imp, comm_dom, comm_imp, month, year, total_total) %>%
  group_by (brand_name)%>%
  summarize(total_by_brand = sum(total_total))%>%
  arrange(desc(total_by_brand))%>%
  slice(1:3)
#View(top_three)
#The highest car sales comes from Renault. Let's see the trends commercial cars sales as domestic and import; also commercial car sales as domestic and import.

#View(final_data)
sales_monthly <- final_data %>% 
              group_by(year, month)   %>% 
              summarise(total_sales=sum(total_total), total_import=sum(total_imp), total_domestic=sum(total_dom),total_automobil=sum(auto_total), total_commercial=sum(comm_total)) %>%
              arrange(year, month)   %>% slice(1:12)

sales_monthly$Date <- zoo::as.yearmon(paste(sales_monthly$year,sales_monthly$month), "%Y %m")
sales_monthly$DateFormated <- format(sales_monthly$Date,"%Y-%m")
#View(sales_monthly)

sales_monthly_Renault <- final_data %>% 
              filter(brand_name=="RENAULT") %>% 
              group_by(year, month)   %>% 
              summarise(total_sales=sum(total_total), total_import=sum(total_imp), total_domestic=sum(total_dom),total_automobil=sum(auto_total), total_commercial=sum(comm_total), brand_name) %>%
              arrange(year, month)   %>% slice(1:12)

sales_monthly_Renault$Date <- zoo::as.yearmon(paste(sales_monthly$year,sales_monthly$month), "%Y %m")
sales_monthly_Renault$DateFormated <- format(sales_monthly$Date,"%Y-%m")
View(sales_monthly_Renault)

sales_monthly_Renault %>% 
                  ggplot(aes(x = DateFormated, y = total_sales, group=1)) + 
                  #ggtitle("Total Sales trend of Renault") + 
                  geom_line() + 
                  geom_point() +
                  theme(legend.position = "none", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.0, size = 10)) +
                  scale_linetype_discrete()+
                  ggtitle("Total Sales Trend of Renault")

#New solutions will be added

ODD_aggregate

Omer Elmasri

25 Kasım 2018

Install Package

Download Raw Data

Analyzing Overall Dataset and Cleaning Data

Data Analyze