library(tidyverse)
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## <U+221A> ggplot2 3.0.0     <U+221A> purrr   0.2.5
## <U+221A> tibble  1.4.2     <U+221A> dplyr   0.7.7
## <U+221A> tidyr   0.8.1     <U+221A> stringr 1.3.1
## <U+221A> readr   1.1.1     <U+221A> forcats 0.3.0
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

To read the aggregated ODD sales data for the monthly periods between Jan 2016 - Sep 2018.

data<-readRDS(file = "C:/Users/emrek/Google Drive/BDA/503-EssentialsOfDataAnalytics/GitHub/pj18-EmreKemerci/AssignmentWeek4/car_data_aggregate.rds")

first, take a glance at variables

data %>% glimpse()
## Observations: 1,490
## Variables: 12
## $ brand_name  <chr> "ALFA ROMEO", "ASTON MARTIN", "AUDI", "BENTLEY", "...
## $ auto_dom    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 632, 91, 1471, 460, 0, ...
## $ auto_imp    <dbl> 13, 2, 350, 0, 158, 134, 1141, 9, 3, 57, 705, 502,...
## $ auto_total  <dbl> 13, 2, 350, 0, 158, 134, 1141, 9, 3, 689, 796, 197...
## $ comm_dom    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 789, 1421, 0, 0, 0, 59,...
## $ comm_imp    <dbl> 0, 0, 0, 0, 0, 197, 319, 0, 0, 199, 139, 0, 39, 0,...
## $ comm_total  <dbl> 0, 0, 0, 0, 0, 197, 319, 0, 0, 988, 1560, 0, 39, 0...
## $ total_dom   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1421, 1512, 1471, 460, ...
## $ total_imp   <dbl> 13, 2, 350, 0, 158, 331, 1460, 9, 3, 256, 844, 502...
## $ total_total <dbl> 13, 2, 350, 0, 158, 331, 1460, 9, 3, 1677, 2356, 1...
## $ year        <dbl> 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 20...
## $ month       <dbl> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,...
data %>% distinct(brand_name)
## # A tibble: 51 x 1
##    brand_name  
##    <chr>       
##  1 ALFA ROMEO  
##  2 ASTON MARTIN
##  3 AUDI        
##  4 BENTLEY     
##  5 BMW         
##  6 CITROEN     
##  7 DACIA       
##  8 DS          
##  9 FERRARI     
## 10 FIAT        
## # ... with 41 more rows
data %>% distinct(year)
## # A tibble: 3 x 1
##    year
##   <dbl>
## 1  2018
## 2  2017
## 3  2016
data %>% distinct(month)
## # A tibble: 12 x 1
##    month
##    <dbl>
##  1     9
##  2     8
##  3     7
##  4     6
##  5     5
##  6     4
##  7     3
##  8     2
##  9     1
## 10    12
## 11    11
## 12    10

Month and Year variables have appropriate values but brand_names variable needs cleaning since there are “TOPLAM:” and “ODD, verilerinde…” values. Also, brand_name has values “Aston MARTİN and Aston MARTIN”, we can replace “Aston MARTİN”" with “Aston MARTIN”.

data <- data %>% filter(!grepl("ODD", brand_name) & brand_name!="TOPLAM:")

data <- data %>%
  mutate(brand_name=replace(brand_name,brand_name=="ASTON MARTİN","ASTON MARTIN"))

I want to analyse market shares over sales between 2016-2018/9 of each brand for auto sales, commercial vehicle sales and total sales.

market_shares <- data %>%
  group_by(brand_name) %>%
  summarize(auto_grand_total=sum(auto_total), comm_grand_total=sum(comm_total), total_grand_total=sum(total_total)) %>%
  mutate(auto_market_share=round((auto_grand_total/sum(auto_grand_total)),4)) %>% 
  mutate (comm_market_share=round((comm_grand_total/sum(comm_grand_total)),4)) %>%
  mutate (total_market_share=round((total_grand_total/sum(total_grand_total)),4))
market_shares
## # A tibble: 48 x 7
##    brand_name auto_grand_total comm_grand_total total_grand_tot~
##    <chr>                 <dbl>            <dbl>            <dbl>
##  1 ALFA ROMEO             1058                0             1058
##  2 ASTON MAR~               64                0               64
##  3 AUDI                  53038                0            53038
##  4 BENTLEY                  31                0               31
##  5 BMW                   56434                0            56434
##  6 CHERY                   144                0              144
##  7 CITROEN               38016            19941            57957
##  8 DACIA                103206            14772           117978
##  9 DS                      721                0              721
## 10 FERRARI                  50                0               50
## # ... with 38 more rows, and 3 more variables: auto_market_share <dbl>,
## #   comm_market_share <dbl>, total_market_share <dbl>
market_shares_summary <- market_shares %>% select(brand_name, auto_market_share, comm_market_share)


library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
  melt_summary <- melt(market_shares_summary, id=c("brand_name"))

melt_summary
##       brand_name          variable  value
## 1     ALFA ROMEO auto_market_share 0.0006
## 2   ASTON MARTIN auto_market_share 0.0000
## 3           AUDI auto_market_share 0.0295
## 4        BENTLEY auto_market_share 0.0000
## 5            BMW auto_market_share 0.0314
## 6          CHERY auto_market_share 0.0001
## 7        CITROEN auto_market_share 0.0212
## 8          DACIA auto_market_share 0.0575
## 9             DS auto_market_share 0.0004
## 10       FERRARI auto_market_share 0.0000
## 11          FIAT auto_market_share 0.0791
## 12          FORD auto_market_share 0.0558
## 13         GEELY auto_market_share 0.0000
## 14         HONDA auto_market_share 0.0375
## 15       HYUNDAI auto_market_share 0.0699
## 16      INFINITI auto_market_share 0.0001
## 17         ISUZU auto_market_share 0.0000
## 18         IVECO auto_market_share 0.0000
## 19        JAGUAR auto_market_share 0.0005
## 20          JEEP auto_market_share 0.0034
## 21        KARSAN auto_market_share 0.0000
## 22           KIA auto_market_share 0.0173
## 23   LAMBORGHINI auto_market_share 0.0000
## 24        LANCIA auto_market_share 0.0000
## 25    LAND ROVER auto_market_share 0.0027
## 26         LEXUS auto_market_share 0.0001
## 27      MASERATI auto_market_share 0.0001
## 28         MAZDA auto_market_share 0.0020
## 29 MERCEDES-BENZ auto_market_share 0.0393
## 30          MINI auto_market_share 0.0022
## 31    MITSUBISHI auto_market_share 0.0007
## 32        NISSAN auto_market_share 0.0431
## 33          OPEL auto_market_share 0.0652
## 34        OTOKAR auto_market_share 0.0000
## 35       PEUGEOT auto_market_share 0.0412
## 36       PORSCHE auto_market_share 0.0011
## 37        PROTON auto_market_share 0.0000
## 38       RENAULT auto_market_share 0.1558
## 39          SEAT auto_market_share 0.0252
## 40         SKODA auto_market_share 0.0386
## 41         SMART auto_market_share 0.0002
## 42     SSANGYONG auto_market_share 0.0006
## 43        SUBARU auto_market_share 0.0022
## 44        SUZUKI auto_market_share 0.0050
## 45          TATA auto_market_share 0.0000
## 46        TOYOTA auto_market_share 0.0532
## 47    VOLKSWAGEN auto_market_share 0.1105
## 48         VOLVO auto_market_share 0.0066
## 49    ALFA ROMEO comm_market_share 0.0000
## 50  ASTON MARTIN comm_market_share 0.0000
## 51          AUDI comm_market_share 0.0000
## 52       BENTLEY comm_market_share 0.0000
## 53           BMW comm_market_share 0.0000
## 54         CHERY comm_market_share 0.0000
## 55       CITROEN comm_market_share 0.0363
## 56         DACIA comm_market_share 0.0269
## 57            DS comm_market_share 0.0000
## 58       FERRARI comm_market_share 0.0000
## 59          FIAT comm_market_share 0.2434
## 60          FORD comm_market_share 0.3108
## 61         GEELY comm_market_share 0.0000
## 62         HONDA comm_market_share 0.0000
## 63       HYUNDAI comm_market_share 0.0113
## 64      INFINITI comm_market_share 0.0000
## 65         ISUZU comm_market_share 0.0139
## 66         IVECO comm_market_share 0.0107
## 67        JAGUAR comm_market_share 0.0000
## 68          JEEP comm_market_share 0.0000
## 69        KARSAN comm_market_share 0.0072
## 70           KIA comm_market_share 0.0163
## 71   LAMBORGHINI comm_market_share 0.0000
## 72        LANCIA comm_market_share 0.0000
## 73    LAND ROVER comm_market_share 0.0000
## 74         LEXUS comm_market_share 0.0000
## 75      MASERATI comm_market_share 0.0000
## 76         MAZDA comm_market_share 0.0000
## 77 MERCEDES-BENZ comm_market_share 0.0425
## 78          MINI comm_market_share 0.0000
## 79    MITSUBISHI comm_market_share 0.0196
## 80        NISSAN comm_market_share 0.0097
## 81          OPEL comm_market_share 0.0000
## 82        OTOKAR comm_market_share 0.0000
## 83       PEUGEOT comm_market_share 0.0436
## 84       PORSCHE comm_market_share 0.0000
## 85        PROTON comm_market_share 0.0000
## 86       RENAULT comm_market_share 0.0703
## 87          SEAT comm_market_share 0.0000
## 88         SKODA comm_market_share 0.0000
## 89         SMART comm_market_share 0.0000
## 90     SSANGYONG comm_market_share 0.0012
## 91        SUBARU comm_market_share 0.0000
## 92        SUZUKI comm_market_share 0.0000
## 93          TATA comm_market_share 0.0000
## 94        TOYOTA comm_market_share 0.0207
## 95    VOLKSWAGEN comm_market_share 0.1157
## 96         VOLVO comm_market_share 0.0000

lets visualize

library(ggplot2)

ggplot(melt_summary, aes(x=brand_name, y=value*100, fill=variable)) +
    geom_bar(stat='identity', position= "dodge") +
     theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
      theme(legend.position = c(0.7, 0.75)) + 
        scale_y_continuous(sec.axis = sec_axis(~.,"Market Share (%)")) +
        labs(x="Brand Name", y="Market Share (%)", title="Market share of each brand according to sales between Jan 2016 - Sep 2018", fill=guide_legend(title="Vehicle Type")) + 
  scale_color_manual(name="Mileage", labels = c("Above Average", "Below Average")) +
          theme(legend.background = element_rect(fill="lightblue", size=0.5, linetype="solid"))

gr<-market_shares %>%
  select(brand_name, total_market_share)

gr$share_z <- round((gr$total_market_share - mean(gr$total_market_share))/sd(gr$total_market_share), 2)

gr$marketshare_type <- ifelse(gr$share_z < 0, "below", "above")  
gr <- gr[order(gr$share_z), ]
gr$brand_name <- factor(gr$brand_name, levels = gr$brand_name)


theme_set(theme_bw())

ggplot(gr, aes(x=brand_name, y=share_z, label=share_z)) + 
  geom_point(stat='identity', aes(col=marketshare_type), size=6)  +
  scale_color_manual(name="Market Share", 
                     labels = c("Above Average", "Below Average"), 
                     values = c("above"="#00ba38", "below"="#f8766d")) + 
  geom_text(color="white", size=2) +
  labs(title="Diverging Dot Plot", 
       subtitle="Normalized total market shares") + 
  ylim(-2.5, 2.5) +
  coord_flip()
## Warning: Removed 4 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing missing values (geom_text).