library(tidyverse)
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## <U+221A> ggplot2 3.0.0 <U+221A> purrr 0.2.5
## <U+221A> tibble 1.4.2 <U+221A> dplyr 0.7.7
## <U+221A> tidyr 0.8.1 <U+221A> stringr 1.3.1
## <U+221A> readr 1.1.1 <U+221A> forcats 0.3.0
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
To read the aggregated ODD sales data for the monthly periods between Jan 2016 - Sep 2018.
data<-readRDS(file = "C:/Users/emrek/Google Drive/BDA/503-EssentialsOfDataAnalytics/GitHub/pj18-EmreKemerci/AssignmentWeek4/car_data_aggregate.rds")
first, take a glance at variables
data %>% glimpse()
## Observations: 1,490
## Variables: 12
## $ brand_name <chr> "ALFA ROMEO", "ASTON MARTIN", "AUDI", "BENTLEY", "...
## $ auto_dom <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 632, 91, 1471, 460, 0, ...
## $ auto_imp <dbl> 13, 2, 350, 0, 158, 134, 1141, 9, 3, 57, 705, 502,...
## $ auto_total <dbl> 13, 2, 350, 0, 158, 134, 1141, 9, 3, 689, 796, 197...
## $ comm_dom <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 789, 1421, 0, 0, 0, 59,...
## $ comm_imp <dbl> 0, 0, 0, 0, 0, 197, 319, 0, 0, 199, 139, 0, 39, 0,...
## $ comm_total <dbl> 0, 0, 0, 0, 0, 197, 319, 0, 0, 988, 1560, 0, 39, 0...
## $ total_dom <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1421, 1512, 1471, 460, ...
## $ total_imp <dbl> 13, 2, 350, 0, 158, 331, 1460, 9, 3, 256, 844, 502...
## $ total_total <dbl> 13, 2, 350, 0, 158, 331, 1460, 9, 3, 1677, 2356, 1...
## $ year <dbl> 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 20...
## $ month <dbl> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,...
data %>% distinct(brand_name)
## # A tibble: 51 x 1
## brand_name
## <chr>
## 1 ALFA ROMEO
## 2 ASTON MARTIN
## 3 AUDI
## 4 BENTLEY
## 5 BMW
## 6 CITROEN
## 7 DACIA
## 8 DS
## 9 FERRARI
## 10 FIAT
## # ... with 41 more rows
data %>% distinct(year)
## # A tibble: 3 x 1
## year
## <dbl>
## 1 2018
## 2 2017
## 3 2016
data %>% distinct(month)
## # A tibble: 12 x 1
## month
## <dbl>
## 1 9
## 2 8
## 3 7
## 4 6
## 5 5
## 6 4
## 7 3
## 8 2
## 9 1
## 10 12
## 11 11
## 12 10
Month and Year variables have appropriate values but brand_names variable needs cleaning since there are “TOPLAM:” and “ODD, verilerinde…” values. Also, brand_name has values “Aston MARTİN and Aston MARTIN”, we can replace “Aston MARTİN”" with “Aston MARTIN”.
data <- data %>% filter(!grepl("ODD", brand_name) & brand_name!="TOPLAM:")
data <- data %>%
mutate(brand_name=replace(brand_name,brand_name=="ASTON MARTİN","ASTON MARTIN"))
I want to analyse market shares over sales between 2016-2018/9 of each brand for auto sales, commercial vehicle sales and total sales.
market_shares <- data %>%
group_by(brand_name) %>%
summarize(auto_grand_total=sum(auto_total), comm_grand_total=sum(comm_total), total_grand_total=sum(total_total)) %>%
mutate(auto_market_share=round((auto_grand_total/sum(auto_grand_total)),4)) %>%
mutate (comm_market_share=round((comm_grand_total/sum(comm_grand_total)),4)) %>%
mutate (total_market_share=round((total_grand_total/sum(total_grand_total)),4))
market_shares
## # A tibble: 48 x 7
## brand_name auto_grand_total comm_grand_total total_grand_tot~
## <chr> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO 1058 0 1058
## 2 ASTON MAR~ 64 0 64
## 3 AUDI 53038 0 53038
## 4 BENTLEY 31 0 31
## 5 BMW 56434 0 56434
## 6 CHERY 144 0 144
## 7 CITROEN 38016 19941 57957
## 8 DACIA 103206 14772 117978
## 9 DS 721 0 721
## 10 FERRARI 50 0 50
## # ... with 38 more rows, and 3 more variables: auto_market_share <dbl>,
## # comm_market_share <dbl>, total_market_share <dbl>
market_shares_summary <- market_shares %>% select(brand_name, auto_market_share, comm_market_share)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
melt_summary <- melt(market_shares_summary, id=c("brand_name"))
melt_summary
## brand_name variable value
## 1 ALFA ROMEO auto_market_share 0.0006
## 2 ASTON MARTIN auto_market_share 0.0000
## 3 AUDI auto_market_share 0.0295
## 4 BENTLEY auto_market_share 0.0000
## 5 BMW auto_market_share 0.0314
## 6 CHERY auto_market_share 0.0001
## 7 CITROEN auto_market_share 0.0212
## 8 DACIA auto_market_share 0.0575
## 9 DS auto_market_share 0.0004
## 10 FERRARI auto_market_share 0.0000
## 11 FIAT auto_market_share 0.0791
## 12 FORD auto_market_share 0.0558
## 13 GEELY auto_market_share 0.0000
## 14 HONDA auto_market_share 0.0375
## 15 HYUNDAI auto_market_share 0.0699
## 16 INFINITI auto_market_share 0.0001
## 17 ISUZU auto_market_share 0.0000
## 18 IVECO auto_market_share 0.0000
## 19 JAGUAR auto_market_share 0.0005
## 20 JEEP auto_market_share 0.0034
## 21 KARSAN auto_market_share 0.0000
## 22 KIA auto_market_share 0.0173
## 23 LAMBORGHINI auto_market_share 0.0000
## 24 LANCIA auto_market_share 0.0000
## 25 LAND ROVER auto_market_share 0.0027
## 26 LEXUS auto_market_share 0.0001
## 27 MASERATI auto_market_share 0.0001
## 28 MAZDA auto_market_share 0.0020
## 29 MERCEDES-BENZ auto_market_share 0.0393
## 30 MINI auto_market_share 0.0022
## 31 MITSUBISHI auto_market_share 0.0007
## 32 NISSAN auto_market_share 0.0431
## 33 OPEL auto_market_share 0.0652
## 34 OTOKAR auto_market_share 0.0000
## 35 PEUGEOT auto_market_share 0.0412
## 36 PORSCHE auto_market_share 0.0011
## 37 PROTON auto_market_share 0.0000
## 38 RENAULT auto_market_share 0.1558
## 39 SEAT auto_market_share 0.0252
## 40 SKODA auto_market_share 0.0386
## 41 SMART auto_market_share 0.0002
## 42 SSANGYONG auto_market_share 0.0006
## 43 SUBARU auto_market_share 0.0022
## 44 SUZUKI auto_market_share 0.0050
## 45 TATA auto_market_share 0.0000
## 46 TOYOTA auto_market_share 0.0532
## 47 VOLKSWAGEN auto_market_share 0.1105
## 48 VOLVO auto_market_share 0.0066
## 49 ALFA ROMEO comm_market_share 0.0000
## 50 ASTON MARTIN comm_market_share 0.0000
## 51 AUDI comm_market_share 0.0000
## 52 BENTLEY comm_market_share 0.0000
## 53 BMW comm_market_share 0.0000
## 54 CHERY comm_market_share 0.0000
## 55 CITROEN comm_market_share 0.0363
## 56 DACIA comm_market_share 0.0269
## 57 DS comm_market_share 0.0000
## 58 FERRARI comm_market_share 0.0000
## 59 FIAT comm_market_share 0.2434
## 60 FORD comm_market_share 0.3108
## 61 GEELY comm_market_share 0.0000
## 62 HONDA comm_market_share 0.0000
## 63 HYUNDAI comm_market_share 0.0113
## 64 INFINITI comm_market_share 0.0000
## 65 ISUZU comm_market_share 0.0139
## 66 IVECO comm_market_share 0.0107
## 67 JAGUAR comm_market_share 0.0000
## 68 JEEP comm_market_share 0.0000
## 69 KARSAN comm_market_share 0.0072
## 70 KIA comm_market_share 0.0163
## 71 LAMBORGHINI comm_market_share 0.0000
## 72 LANCIA comm_market_share 0.0000
## 73 LAND ROVER comm_market_share 0.0000
## 74 LEXUS comm_market_share 0.0000
## 75 MASERATI comm_market_share 0.0000
## 76 MAZDA comm_market_share 0.0000
## 77 MERCEDES-BENZ comm_market_share 0.0425
## 78 MINI comm_market_share 0.0000
## 79 MITSUBISHI comm_market_share 0.0196
## 80 NISSAN comm_market_share 0.0097
## 81 OPEL comm_market_share 0.0000
## 82 OTOKAR comm_market_share 0.0000
## 83 PEUGEOT comm_market_share 0.0436
## 84 PORSCHE comm_market_share 0.0000
## 85 PROTON comm_market_share 0.0000
## 86 RENAULT comm_market_share 0.0703
## 87 SEAT comm_market_share 0.0000
## 88 SKODA comm_market_share 0.0000
## 89 SMART comm_market_share 0.0000
## 90 SSANGYONG comm_market_share 0.0012
## 91 SUBARU comm_market_share 0.0000
## 92 SUZUKI comm_market_share 0.0000
## 93 TATA comm_market_share 0.0000
## 94 TOYOTA comm_market_share 0.0207
## 95 VOLKSWAGEN comm_market_share 0.1157
## 96 VOLVO comm_market_share 0.0000
lets visualize
library(ggplot2)
ggplot(melt_summary, aes(x=brand_name, y=value*100, fill=variable)) +
geom_bar(stat='identity', position= "dodge") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
theme(legend.position = c(0.7, 0.75)) +
scale_y_continuous(sec.axis = sec_axis(~.,"Market Share (%)")) +
labs(x="Brand Name", y="Market Share (%)", title="Market share of each brand according to sales between Jan 2016 - Sep 2018", fill=guide_legend(title="Vehicle Type")) +
scale_color_manual(name="Mileage", labels = c("Above Average", "Below Average")) +
theme(legend.background = element_rect(fill="lightblue", size=0.5, linetype="solid"))
gr<-market_shares %>%
select(brand_name, total_market_share)
gr$share_z <- round((gr$total_market_share - mean(gr$total_market_share))/sd(gr$total_market_share), 2)
gr$marketshare_type <- ifelse(gr$share_z < 0, "below", "above")
gr <- gr[order(gr$share_z), ]
gr$brand_name <- factor(gr$brand_name, levels = gr$brand_name)
theme_set(theme_bw())
ggplot(gr, aes(x=brand_name, y=share_z, label=share_z)) +
geom_point(stat='identity', aes(col=marketshare_type), size=6) +
scale_color_manual(name="Market Share",
labels = c("Above Average", "Below Average"),
values = c("above"="#00ba38", "below"="#f8766d")) +
geom_text(color="white", size=2) +
labs(title="Diverging Dot Plot",
subtitle="Normalized total market shares") +
ylim(-2.5, 2.5) +
coord_flip()
## Warning: Removed 4 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing missing values (geom_text).