install.packages("readxl")
##
## The downloaded binary packages are in
## /var/folders/qv/khc24w491dn9d18j77490j380000gn/T//RtmpPXCd6Y/downloaded_packages
install.packages("tidyverse")
##
## The downloaded binary packages are in
## /var/folders/qv/khc24w491dn9d18j77490j380000gn/T//RtmpPXCd6Y/downloaded_packages
library(readxl)
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.7
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
Create a temporary file
tmp<-tempfile(fileext=".xlsx")
Download file from repository to the temp file
download.file("https://github.com/MEF-BDA503/pj18-mehmetakk/blob/master/odd_retail_sales_2016_09.xlsx?raw=true",mode = 'wb',destfile=tmp)
Read that excel file using readxl package’s read_excel function. You might need to adjust the parameters (skip, col_names) according to your raw file’s format.
raw_data<-readxl::read_excel(tmp,skip=7,col_names=FALSE)
When I first run readxl function, it gave me error – Evaluation error: error -103 with zipfile in unzGetCurrentFileInfo then I insert mode=‘wb’ to download.file function
Remove the temp file
file.remove(tmp)
## [1] TRUE
I checked head and tail of my data
head(raw_data)
## # A tibble: 6 x 10
## X__1 X__2 X__3 X__4 X__5 X__6 X__7 X__8 X__9 X__10
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO NA 32 32 NA NA 0 0 32 32
## 2 ASTON MARTIN NA 4 4 NA NA 0 0 4 4
## 3 AUDI NA 1541 1541 NA NA 0 0 1541 1541
## 4 BENTLEY NA 3 3 NA NA 0 0 3 3
## 5 BMW NA 1806 1806 NA NA 0 0 1806 1806
## 6 CHERY NA NA 0 NA NA 0 0 0 0
tail(raw_data)
## # A tibble: 6 x 10
## X__1 X__2 X__3 X__4 X__5 X__6 X__7 X__8 X__9 X__10
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 TATA NA NA 0 NA NA 0 0 0 0
## 2 TOYOTA 2069 858 2927 NA 543 543 2069 1401 3470
## 3 VOLKSWAGEN NA 8028 8028 NA 2284 2284 0 10312 10312
## 4 VOLVO NA 222 222 NA NA 0 0 222 222
## 5 <NA> NA NA NA NA NA NA NA NA NA
## 6 TOPLAM: 10971 40369 51340 8374 7879 16253 19345 48248 67593
Since the last 2 rows are blank and total, I removed these rows
raw_data <- raw_data %>% slice(-c(43,44))
tail(raw_data)
## # A tibble: 6 x 10
## X__1 X__2 X__3 X__4 X__5 X__6 X__7 X__8 X__9 X__10
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 TATA NA NA 0 NA NA 0 0 0 0
## 2 TOYOTA 2069 858 2927 NA 543 543 2069 1401 3470
## 3 VOLKSWAGEN NA 8028 8028 NA 2284 2284 0 10312 10312
## 4 VOLVO NA 222 222 NA NA 0 0 222 222
## 5 <NA> NA NA NA NA NA NA NA NA NA
## 6 TOPLAM: 10971 40369 51340 8374 7879 16253 19345 48248 67593
head(raw_data)
## # A tibble: 6 x 10
## X__1 X__2 X__3 X__4 X__5 X__6 X__7 X__8 X__9 X__10
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO NA 32 32 NA NA 0 0 32 32
## 2 ASTON MARTIN NA 4 4 NA NA 0 0 4 4
## 3 AUDI NA 1541 1541 NA NA 0 0 1541 1541
## 4 BENTLEY NA 3 3 NA NA 0 0 3 3
## 5 BMW NA 1806 1806 NA NA 0 0 1806 1806
## 6 CHERY NA NA 0 NA NA 0 0 0 0
Define column names
colnames(raw_data) <- c("brand_name","auto_dom","auto_imp","auto_total","comm_dom","comm_imp","comm_total","total_dom","total_imp","total_total")
and remove NA values with 0 and label the time period with year and month in order to prevent confusion when we merge data ( mutate()
adds new variables and preserves existing; transmute()
drops existing variables )
car_data_sep_16 <- raw_data %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year=2016,month=9)
print(car_data_sep_16,width=Inf)
## # A tibble: 48 x 12
## brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO 0 32 32 0 0 0
## 2 ASTON MARTIN 0 4 4 0 0 0
## 3 AUDI 0 1541 1541 0 0 0
## 4 BENTLEY 0 3 3 0 0 0
## 5 BMW 0 1806 1806 0 0 0
## 6 CHERY 0 0 0 0 0 0
## 7 CITROEN 0 1519 1519 88 528 616
## 8 DACIA 0 3116 3116 0 502 502
## 9 DS 0 32 32 0 0 0
## 10 FERRARI 0 1 1 0 0 0
## total_dom total_imp total_total year month
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 32 32 2016 9
## 2 0 4 4 2016 9
## 3 0 1541 1541 2016 9
## 4 0 3 3 2016 9
## 5 0 1806 1806 2016 9
## 6 0 0 0 2016 9
## 7 88 2047 2135 2016 9
## 8 0 3618 3618 2016 9
## 9 0 32 32 2016 9
## 10 0 1 1 2016 9
## # ... with 38 more rows
One of the best methods is to save your data to an RDS or RData file. The difference is RDS can hold only one object but RData can hold many. Since we have only one data frame here we will go with RDS.
You can read that file by readRDS and assigning to an object rds_data <- readRDS("~/YOUR_OWN_PATH/odd_car_sales_data_sep_18.rds")
saveRDS(car_data_sep_16,file="car_data_sep_16.rds")
car_data_sep_16 %>%
select(brand_name,auto_dom) %>%
arrange(desc(auto_dom)) %>%
filter(auto_dom>0)
## # A tibble: 7 x 2
## brand_name auto_dom
## <chr> <dbl>
## 1 TOPLAM: 10971
## 2 FIAT 3491
## 3 RENAULT 2806
## 4 TOYOTA 2069
## 5 HYUNDAI 1778
## 6 FORD 438
## 7 HONDA 389