ODD Analysis

Data Preparation

After importing the sales data from the Turkish Automotive Distributors Association to the environment, I checked the dataframe whether there are any type mismatch or peculatiries.

#get the class of the columns of car data frame
sapply(car_df, class)

##  brand_name    auto_dom    auto_imp  auto_total    comm_dom    comm_imp 
## "character"   "numeric"   "numeric"   "numeric"   "numeric"   "numeric" 
##  comm_total   total_dom   total_imp total_total        year       month 
##   "numeric"   "numeric"   "numeric"   "numeric"   "numeric"   "numeric"

#View the head of car_df if there are any pecularities
head(car_df)

## # A tibble: 6 x 12
##   brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
##   <chr>         <dbl>    <dbl>      <dbl>    <dbl>    <dbl>      <dbl>
## 1 ALFA ROMEO        0       13         13        0        0          0
## 2 ASTON MAR~        0        2          2        0        0          0
## 3 AUDI              0      350        350        0        0          0
## 4 BENTLEY           0        0          0        0        0          0
## 5 BMW               0      158        158        0        0          0
## 6 CITROEN           0      134        134        0      197        197
## # ... with 5 more variables: total_dom <dbl>, total_imp <dbl>,
## #   total_total <dbl>, year <dbl>, month <dbl>

#View the tail of car_df if there are any pecularities
tail(car_df)

## # A tibble: 6 x 12
##   brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
##   <chr>         <dbl>    <dbl>      <dbl>    <dbl>    <dbl>      <dbl>
## 1 SSANGYONG         0       19         19        0        3          3
## 2 TATA              0        0          0        0        9          9
## 3 TOYOTA         1298      149       1447        0       34         34
## 4 VOLKSWAGEN        0     2792       2792        0     1736       1736
## 5 VOLVO             0      187        187        0        0          0
## 6 TOPLAM:        7375    15983      23358     4815     4540       9355
## # ... with 5 more variables: total_dom <dbl>, total_imp <dbl>,
## #   total_total <dbl>, year <dbl>, month <dbl>

So there is a row with brand_name “TOPLAM:” that means Total in Turkish, which we need to remove from the car_df. First we find the rows where the brand_name is “Toplam:” and remove them. Also ASTON MARTIN and ASTON MARTIN both exist in the daa frame. This will be handled too.

#Finding the rows where the brand_name is "Toplam:" and removing them with using base R functionality
which(car_df$brand_name == "TOPLAM:")

##  [1]  169  297  340  383  559  738  782  919 1109 1252 1395 1490

#removing the rows
toRemove <- which(car_df$brand_name == "TOPLAM:")
car_df <- car_df[-toRemove, ]
car_df <- car_df[-866, ]
#Check whether there are still any rows fit brand_name == "TOPLAM:"
which(car_df$brand_name == "TOPLAM:")

## integer(0)

#Replacing MARTIN with MARTIN
car_df$brand_name <- gsub("ASTON MARTIN", "ASTON MARTIN", car_df$brand_name)

Then we introduce some new percentage columns to the car_df, which can be useful for further analysis. Also a year_month columns:

#Introducing some new columns
car_df %<>% mutate(percent_auto_imp = auto_imp/auto_total)
car_df %<>% mutate(percent_comm_imp = comm_imp/comm_total) 
car_df %<>% mutate(percent_total_imp = total_imp/total_total)
car_df %<>% mutate(percent_auto = auto_total/total_total)
car_df %<>% mutate(percent_auto = comm_total/total_total)

#A new date column, sales data is obtained at the end of month, so a day value of 28 is used
car_df$year_month = ymd(paste(car_df$year, car_df$month, "28"))

Then we look first to the structure of the car_df and also calculate some aggregate measures about the total sales:

#Structure of car_df 
glimpse(car_df)

## Observations: 1,477
## Variables: 17
## $ brand_name        <chr> "ALFA ROMEO", "ASTON MARTIN", "AUDI", "BENTL...
## $ auto_dom          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 632, 91, 1471, 46...
## $ auto_imp          <dbl> 13, 2, 350, 0, 158, 134, 1141, 9, 3, 57, 705...
## $ auto_total        <dbl> 13, 2, 350, 0, 158, 134, 1141, 9, 3, 689, 79...
## $ comm_dom          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 789, 1421, 0, 0, ...
## $ comm_imp          <dbl> 0, 0, 0, 0, 0, 197, 319, 0, 0, 199, 139, 0, ...
## $ comm_total        <dbl> 0, 0, 0, 0, 0, 197, 319, 0, 0, 988, 1560, 0,...
## $ total_dom         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1421, 1512, 1471,...
## $ total_imp         <dbl> 13, 2, 350, 0, 158, 331, 1460, 9, 3, 256, 84...
## $ total_total       <dbl> 13, 2, 350, 0, 158, 331, 1460, 9, 3, 1677, 2...
## $ year              <dbl> 2018, 2018, 2018, 2018, 2018, 2018, 2018, 20...
## $ month             <dbl> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,...
## $ percent_auto_imp  <dbl> 1.00000000, 1.00000000, 1.00000000, NaN, 1.0...
## $ percent_comm_imp  <dbl> NaN, NaN, NaN, NaN, NaN, 1.00000000, 1.00000...
## $ percent_total_imp <dbl> 1.0000000, 1.0000000, 1.0000000, NaN, 1.0000...
## $ percent_auto      <dbl> 0.00000000, 0.00000000, 0.00000000, NaN, 0.0...
## $ year_month        <date> 2018-09-28, 2018-09-28, 2018-09-28, 2018-09...

#Total Sales for each brand & year sorted by the most sold brand
car_df %>% group_by(brand_name,year) %>% summarise(totalsales = sum(total_total)) %>% arrange(desc(totalsales))

## # A tibble: 137 x 3
## # Groups:   brand_name [49]
##    brand_name  year totalsales
##    <chr>      <dbl>      <dbl>
##  1 VOLKSWAGEN  2016     134535
##  2 RENAULT     2017     130276
##  3 RENAULT     2016     121707
##  4 FIAT        2017     120049
##  5 FORD        2017     111064
##  6 FORD        2016     109604
##  7 FIAT        2016     106106
##  8 VOLKSWAGEN  2017      77071
##  9 RENAULT     2018      66517
## 10 OPEL        2016      55471
## # ... with 127 more rows

#Total auto and commerial vehicle sales for each brand & year sorted by the most total sales
car_df %>% group_by(brand_name,year) %>% summarise_each(funs(sum),auto_total,comm_total,total_total) %>% arrange(desc(total_total))

## # A tibble: 137 x 5
## # Groups:   brand_name [49]
##    brand_name  year auto_total comm_total total_total
##    <chr>      <dbl>      <dbl>      <dbl>       <dbl>
##  1 VOLKSWAGEN  2016     101763      32772      134535
##  2 RENAULT     2017     113454      16822      130276
##  3 RENAULT     2016     106616      15091      121707
##  4 FIAT        2017      61364      58685      120049
##  5 FORD        2017      39850      71214      111064
##  6 FORD        2016      41370      68234      109604
##  7 FIAT        2016      52369      53737      106106
##  8 VOLKSWAGEN  2017      59098      17973       77071
##  9 RENAULT     2018      59761       6756       66517
## 10 OPEL        2016      55471          0       55471
## # ... with 127 more rows

For visualization, we first find the top selling brands

#Top Sellers
car_df %>% select(brand_name,total_total,year) %>% group_by(brand_name) %>% 
  summarise(totalSales = sum(total_total)) %>% arrange(desc(totalSales))

## # A tibble: 49 x 2
##    brand_name    totalSales
##    <chr>              <dbl>
##  1 RENAULT           318500
##  2 FIAT              275900
##  3 FORD              271157
##  4 VOLKSWAGEN        262041
##  5 HYUNDAI           131666
##  6 DACIA             117978
##  7 OPEL              117122
##  8 TOYOTA            106950
##  9 PEUGEOT            98036
## 10 MERCEDES-BENZ      93944
## # ... with 39 more rows

#Constructing a top_sellers vector
top_sellers <- c("RENAULT","FIAT","FORD","VOLKSWAGEN","HYUNDAI","DACIA","OPEL","TOYOTA","PEUGEOT","MERCEDES-BENZ")

ODD Analysis

Orkun Berk Yuzbasioglu

November 25, 2018

Data Preparation

Plots