After importing the sales data from the Turkish Automotive Distributors Association to the environment, I checked the dataframe whether there are any type mismatch or peculatiries.
#get the class of the columns of car data frame
sapply(car_df, class)
## brand_name auto_dom auto_imp auto_total comm_dom comm_imp
## "character" "numeric" "numeric" "numeric" "numeric" "numeric"
## comm_total total_dom total_imp total_total year month
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
#View the head of car_df if there are any pecularities
head(car_df)
## # A tibble: 6 x 12
## brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO 0 13 13 0 0 0
## 2 ASTON MAR~ 0 2 2 0 0 0
## 3 AUDI 0 350 350 0 0 0
## 4 BENTLEY 0 0 0 0 0 0
## 5 BMW 0 158 158 0 0 0
## 6 CITROEN 0 134 134 0 197 197
## # ... with 5 more variables: total_dom <dbl>, total_imp <dbl>,
## # total_total <dbl>, year <dbl>, month <dbl>
#View the tail of car_df if there are any pecularities
tail(car_df)
## # A tibble: 6 x 12
## brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SSANGYONG 0 19 19 0 3 3
## 2 TATA 0 0 0 0 9 9
## 3 TOYOTA 1298 149 1447 0 34 34
## 4 VOLKSWAGEN 0 2792 2792 0 1736 1736
## 5 VOLVO 0 187 187 0 0 0
## 6 TOPLAM: 7375 15983 23358 4815 4540 9355
## # ... with 5 more variables: total_dom <dbl>, total_imp <dbl>,
## # total_total <dbl>, year <dbl>, month <dbl>
So there is a row with brand_name “TOPLAM:” that means Total in Turkish, which we need to remove from the car_df. First we find the rows where the brand_name is “Toplam:” and remove them. Also ASTON MARTIN and ASTON MARTIN both exist in the daa frame. This will be handled too.
#Finding the rows where the brand_name is "Toplam:" and removing them with using base R functionality
which(car_df$brand_name == "TOPLAM:")
## [1] 169 297 340 383 559 738 782 919 1109 1252 1395 1490
#removing the rows
toRemove <- which(car_df$brand_name == "TOPLAM:")
car_df <- car_df[-toRemove, ]
car_df <- car_df[-866, ]
#Check whether there are still any rows fit brand_name == "TOPLAM:"
which(car_df$brand_name == "TOPLAM:")
## integer(0)
#Replacing MARTIN with MARTIN
car_df$brand_name <- gsub("ASTON MARTIN", "ASTON MARTIN", car_df$brand_name)
Then we introduce some new percentage columns to the car_df, which can be useful for further analysis. Also a year_month columns:
#Introducing some new columns
car_df %<>% mutate(percent_auto_imp = auto_imp/auto_total)
car_df %<>% mutate(percent_comm_imp = comm_imp/comm_total)
car_df %<>% mutate(percent_total_imp = total_imp/total_total)
car_df %<>% mutate(percent_auto = auto_total/total_total)
car_df %<>% mutate(percent_auto = comm_total/total_total)
#A new date column, sales data is obtained at the end of month, so a day value of 28 is used
car_df$year_month = ymd(paste(car_df$year, car_df$month, "28"))
Then we look first to the structure of the car_df and also calculate some aggregate measures about the total sales:
#Structure of car_df
glimpse(car_df)
## Observations: 1,477
## Variables: 17
## $ brand_name <chr> "ALFA ROMEO", "ASTON MARTIN", "AUDI", "BENTL...
## $ auto_dom <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 632, 91, 1471, 46...
## $ auto_imp <dbl> 13, 2, 350, 0, 158, 134, 1141, 9, 3, 57, 705...
## $ auto_total <dbl> 13, 2, 350, 0, 158, 134, 1141, 9, 3, 689, 79...
## $ comm_dom <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 789, 1421, 0, 0, ...
## $ comm_imp <dbl> 0, 0, 0, 0, 0, 197, 319, 0, 0, 199, 139, 0, ...
## $ comm_total <dbl> 0, 0, 0, 0, 0, 197, 319, 0, 0, 988, 1560, 0,...
## $ total_dom <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1421, 1512, 1471,...
## $ total_imp <dbl> 13, 2, 350, 0, 158, 331, 1460, 9, 3, 256, 84...
## $ total_total <dbl> 13, 2, 350, 0, 158, 331, 1460, 9, 3, 1677, 2...
## $ year <dbl> 2018, 2018, 2018, 2018, 2018, 2018, 2018, 20...
## $ month <dbl> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,...
## $ percent_auto_imp <dbl> 1.00000000, 1.00000000, 1.00000000, NaN, 1.0...
## $ percent_comm_imp <dbl> NaN, NaN, NaN, NaN, NaN, 1.00000000, 1.00000...
## $ percent_total_imp <dbl> 1.0000000, 1.0000000, 1.0000000, NaN, 1.0000...
## $ percent_auto <dbl> 0.00000000, 0.00000000, 0.00000000, NaN, 0.0...
## $ year_month <date> 2018-09-28, 2018-09-28, 2018-09-28, 2018-09...
#Total Sales for each brand & year sorted by the most sold brand
car_df %>% group_by(brand_name,year) %>% summarise(totalsales = sum(total_total)) %>% arrange(desc(totalsales))
## # A tibble: 137 x 3
## # Groups: brand_name [49]
## brand_name year totalsales
## <chr> <dbl> <dbl>
## 1 VOLKSWAGEN 2016 134535
## 2 RENAULT 2017 130276
## 3 RENAULT 2016 121707
## 4 FIAT 2017 120049
## 5 FORD 2017 111064
## 6 FORD 2016 109604
## 7 FIAT 2016 106106
## 8 VOLKSWAGEN 2017 77071
## 9 RENAULT 2018 66517
## 10 OPEL 2016 55471
## # ... with 127 more rows
#Total auto and commerial vehicle sales for each brand & year sorted by the most total sales
car_df %>% group_by(brand_name,year) %>% summarise_each(funs(sum),auto_total,comm_total,total_total) %>% arrange(desc(total_total))
## # A tibble: 137 x 5
## # Groups: brand_name [49]
## brand_name year auto_total comm_total total_total
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 VOLKSWAGEN 2016 101763 32772 134535
## 2 RENAULT 2017 113454 16822 130276
## 3 RENAULT 2016 106616 15091 121707
## 4 FIAT 2017 61364 58685 120049
## 5 FORD 2017 39850 71214 111064
## 6 FORD 2016 41370 68234 109604
## 7 FIAT 2016 52369 53737 106106
## 8 VOLKSWAGEN 2017 59098 17973 77071
## 9 RENAULT 2018 59761 6756 66517
## 10 OPEL 2016 55471 0 55471
## # ... with 127 more rows
For visualization, we first find the top selling brands
#Top Sellers
car_df %>% select(brand_name,total_total,year) %>% group_by(brand_name) %>%
summarise(totalSales = sum(total_total)) %>% arrange(desc(totalSales))
## # A tibble: 49 x 2
## brand_name totalSales
## <chr> <dbl>
## 1 RENAULT 318500
## 2 FIAT 275900
## 3 FORD 271157
## 4 VOLKSWAGEN 262041
## 5 HYUNDAI 131666
## 6 DACIA 117978
## 7 OPEL 117122
## 8 TOYOTA 106950
## 9 PEUGEOT 98036
## 10 MERCEDES-BENZ 93944
## # ... with 39 more rows
#Constructing a top_sellers vector
top_sellers <- c("RENAULT","FIAT","FORD","VOLKSWAGEN","HYUNDAI","DACIA","OPEL","TOYOTA","PEUGEOT","MERCEDES-BENZ")
LIne charts of monthly sales for the top selling brands are as follows:
##A line chart for monthly sales of 2017
car_df %>% filter(brand_name %in% top_sellers, year == 2016) %>% ggplot(aes(x=month, y=total_total, color=brand_name)) +
geom_line() + labs(title="Monthly Total Sales for Top Sellers 2016")
##A line chart for monthly sales of 2017
car_df %>% filter(brand_name %in% top_sellers, year == 2017) %>% ggplot(aes(x=month, y=total_total, color=brand_name)) +
geom_line() + labs(title="Monthly Total Sales for Top Sellers 2017")