install.packages("readxl")
##
## The downloaded binary packages are in
## /var/folders/qv/khc24w491dn9d18j77490j380000gn/T//RtmpyZzxEJ/downloaded_packages
install.packages("tidyverse")
##
## The downloaded binary packages are in
## /var/folders/qv/khc24w491dn9d18j77490j380000gn/T//RtmpyZzxEJ/downloaded_packages
library(readxl)
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.7
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
Create a temporary file
tmp<-tempfile(fileext=".xlsx")
Download file from repository to the temp file
download.file("https://github.com/MEF-BDA503/pj18-mkaracabey/blob/master/oto_sales_analysis/2017_4_sales_oto.xlsx?raw=true",mode = 'wb',destfile=tmp)
raw_data<-readxl::read_excel(tmp,skip=7,col_names=FALSE)
When I first run readxl function, it gave me error – Evaluation error: error -103 with zipfile in unzGetCurrentFileInfo then I insert mode=‘wb’ to download.file function
Remove the temp file
file.remove(tmp)
## [1] TRUE
head and tail of excel data
head(raw_data)
## # A tibble: 6 x 10
## X__1 X__2 X__3 X__4 X__5 X__6 X__7 X__8 X__9 X__10
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO NA 27 27 NA NA 0 0 27 27
## 2 ASTON MARTIN NA 2 2 NA NA 0 0 2 2
## 3 AUDI NA 1673 1673 NA NA 0 0 1673 1673
## 4 BENTLEY NA 1 1 NA NA 0 0 1 1
## 5 BMW NA 877 877 NA NA 0 0 877 877
## 6 CHERY NA 0 0 NA NA 0 0 0 0
tail(raw_data)
## # A tibble: 6 x 10
## X__1 X__2 X__3 X__4 X__5 X__6 X__7 X__8 X__9 X__10
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SUZUKI NA 206 206 NA NA 0 0 206 206
## 2 TOYOTA 1975 427 2402 NA 304 304 1975 731 2706
## 3 VOLKSWAGEN NA 6672 6672 NA 2362 2362 0 9034 9034
## 4 VOLVO NA 340 340 NA NA 0 0 340 340
## 5 <NA> NA NA NA NA NA NA NA NA NA
## 6 TOPLAM: 17486 40512 57998 9060 8930 17990 26546 49442 75988
Since the last 2 rows are blank and total, I removed these rows
raw_data <- raw_data %>% slice(-c(43,44))
tail(raw_data)
## # A tibble: 6 x 10
## X__1 X__2 X__3 X__4 X__5 X__6 X__7 X__8 X__9 X__10
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SSANGYONG NA 27 27 NA 12 12 0 39 39
## 2 SUBARU NA 129 129 NA NA 0 0 129 129
## 3 SUZUKI NA 206 206 NA NA 0 0 206 206
## 4 VOLVO NA 340 340 NA NA 0 0 340 340
## 5 <NA> NA NA NA NA NA NA NA NA NA
## 6 TOPLAM: 17486 40512 57998 9060 8930 17990 26546 49442 75988
head(raw_data)
## # A tibble: 6 x 10
## X__1 X__2 X__3 X__4 X__5 X__6 X__7 X__8 X__9 X__10
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO NA 27 27 NA NA 0 0 27 27
## 2 ASTON MARTIN NA 2 2 NA NA 0 0 2 2
## 3 AUDI NA 1673 1673 NA NA 0 0 1673 1673
## 4 BENTLEY NA 1 1 NA NA 0 0 1 1
## 5 BMW NA 877 877 NA NA 0 0 877 877
## 6 CHERY NA 0 0 NA NA 0 0 0 0
Define column names
colnames(raw_data) <- c("brand_name","auto_dom","auto_imp","auto_total","comm_dom","comm_imp","comm_total","total_dom","total_imp","total_total")
and remove NA values with 0 and label the time period with year and month in order to prevent confusion when we merge data ( mutate()
adds new variables and preserves existing; transmute()
drops existing variables )
car_data_april_17 <- raw_data %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year=2017,month=4)
print(car_data_april_17,width=Inf)
## # A tibble: 45 x 12
## brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO 0 27 27 0 0 0
## 2 ASTON MARTIN 0 2 2 0 0 0
## 3 AUDI 0 1673 1673 0 0 0
## 4 BENTLEY 0 1 1 0 0 0
## 5 BMW 0 877 877 0 0 0
## 6 CHERY 0 0 0 0 0 0
## 7 CITROEN 0 1654 1654 0 807 807
## 8 DACIA 0 3557 3557 0 615 615
## 9 DS 0 13 13 0 0 0
## 10 FERRARI 0 1 1 0 0 0
## total_dom total_imp total_total year month
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 27 27 2017 4
## 2 0 2 2 2017 4
## 3 0 1673 1673 2017 4
## 4 0 1 1 2017 4
## 5 0 877 877 2017 4
## 6 0 0 0 2017 4
## 7 0 2461 2461 2017 4
## 8 0 4172 4172 2017 4
## 9 0 13 13 2017 4
## 10 0 1 1 2017 4
## # ... with 35 more rows
One of the best methods is to save your data to an RDS or RData file. The difference is RDS can hold only one object but RData can hold many. Since we have only one data frame here we will go with RDS.
You can read that file by readRDS and assigning to an object rds_data <- readRDS("~/YOUR_OWN_PATH/odd_car_sales_data_april_17.rds")
saveRDS(car_data_april_17,file="data_april_17.rds")
car_data_april_17 %>%
select(brand_name,auto_dom) %>%
arrange(auto_dom) %>%
filter(auto_dom>0)
## # A tibble: 6 x 2
## brand_name auto_dom
## <chr> <dbl>
## 1 FORD 1105
## 2 HONDA 1626
## 3 HYUNDAI 2293
## 4 FIAT 4700
## 5 RENAULT 5787
## 6 TOPLAM: 17486
On above, companies of maked car sales.
On the below, Some Filter Added to show sales of car clearly in April of 2017.
car_data_april_17 %>%
select(brand_name,auto_dom,comm_dom,total_imp,total_dom) %>%
arrange(auto_dom) %>% arrange(total_imp) %>%
filter(comm_dom>0) %>% filter(auto_dom>0) %>% filter(total_imp >0)
## # A tibble: 3 x 5
## brand_name auto_dom comm_dom total_imp total_dom
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 FIAT 4700 3752 787 8452
## 2 FORD 1105 4987 3114 6092
## 3 TOPLAM: 17486 9060 49442 26546