ODD Homework - Car Sales Data Import
Downloading Raw Data
# Create a temporary file
tmp<-tempfile(fileext=".xlsx")
# Download file from repository to the temp file
download.file("https://github.com/MEF-BDA503/pj18-muratorhan/blob/master/odd_retail_sales_2016_06.xlsx?raw=true", destfile=tmp, mode='wb')
# Read that excel file using readxl package's read_excel function. You might need to adjust the parameters (skip, col_names) according to your raw file's format.
raw_data<-readxl::read_excel(tmp,skip=7,col_names=FALSE)
# Remove the temp file
file.remove(tmp)
## [1] TRUE
[1] TRUE
# Remove the last two rows because they are irrelevant (total and empty rows)
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## <U+221A> ggplot2 3.0.0 <U+221A> purrr 0.2.5
## <U+221A> tibble 1.4.2 <U+221A> dplyr 0.7.6
## <U+221A> tidyr 0.8.1 <U+221A> stringr 1.3.1
## <U+221A> readr 1.1.1 <U+221A> forcats 0.3.0
## -- Conflicts ----------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
raw_data <- raw_data %>% slice(-c(49,50))
# Let's see our raw data
head(raw_data)
## # A tibble: 6 x 10
## X__1 X__2 X__3 X__4 X__5 X__6 X__7 X__8 X__9 X__10
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO NA 48 48 NA NA 0 0 48 48
## 2 ASTON MARTIN NA 3 3 NA NA 0 0 3 3
## 3 AUDI NA 1961 1961 NA NA 0 0 1961 1961
## 4 BENTLEY NA 0 0 NA NA 0 0 0 0
## 5 BMW NA 2640 2640 NA NA 0 0 2640 2640
## 6 CHERY NA 25 25 NA NA 0 0 25 25
Making Data Civilized
# Use the same column names in your data.
colnames(raw_data) <- c("brand_name","auto_dom","auto_imp","auto_total","comm_dom","comm_imp","comm_total","total_dom","total_imp","total_total")
# Now we replace NA values with 0 and label the time period with year and month, so when we merge the data we won't be confused.
car_data_june_16 <- raw_data %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year=2016,month=6)
print(car_data_june_16,width=Inf)
## # A tibble: 48 x 12
## brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO 0 48 48 0 0 0
## 2 ASTON MARTIN 0 3 3 0 0 0
## 3 AUDI 0 1961 1961 0 0 0
## 4 BENTLEY 0 0 0 0 0 0
## 5 BMW 0 2640 2640 0 0 0
## 6 CHERY 0 25 25 0 0 0
## 7 CITROEN 0 1613 1613 131 690 821
## 8 DACIA 0 3799 3799 0 429 429
## 9 DS 0 14 14 0 0 0
## 10 FERRARI 0 1 1 0 0 0
## total_dom total_imp total_total year month
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 48 48 2016 6
## 2 0 3 3 2016 6
## 3 0 1961 1961 2016 6
## 4 0 0 0 2016 6
## 5 0 2640 2640 2016 6
## 6 0 25 25 2016 6
## 7 131 2303 2434 2016 6
## 8 0 4228 4228 2016 6
## 9 0 14 14 2016 6
## 10 0 1 1 2016 6
## # ... with 38 more rows
Saving Your Civilized Data
saveRDS(car_data_june_16,file="~/C:\\Users\\Murat\\Desktop\\BIG DATA YL\\BDA 503 DATA ANALYTICS(R)\\W2\\ODD/odd_car_sales_data_june_16.rds")
# You can read that file by readRDS and assigning to an object
# e.g
# rds_data <- readRDS("~/C:\Users\Murat\Desktop\BIG DATA YL\BDA 503 DATA ANALYTICS(R)\W2\ODD/odd_car_sales_data_june_16.rds")
Finishing With Some Analysis
car_data_june_16 %>%
filter(auto_total > 0 & comm_total > 0) %>%
select(brand_name,total_total) %>%
arrange(desc(total_total))
## # A tibble: 14 x 2
## brand_name total_total
## <chr> <dbl>
## 1 VOLKSWAGEN 12406
## 2 RENAULT 12004
## 3 FORD 11307
## 4 FIAT 9486
## 5 MERCEDES-BENZ 4546
## 6 DACIA 4228
## 7 HYUNDAI 4201
## 8 PEUGEOT 3463
## 9 TOYOTA 3415
## 10 NISSAN 2705
## 11 CITROEN 2434
## 12 KIA 1700
## 13 MITSUBISHI 490
## 14 SSANGYONG 108