install.packages("readxl")
install.packages("plyr")
install.packages("tidyverse", repos = "https://cran.r-project.org")
install.packages("stringr")
library(tidyverse)
library(plyr)
library("readxl")
library(stringr)
Download export and import raw excel files and put them in temp files and read them.
And remove the temp files after reading.
Export Data
# Create a temporary file
tmp=tempfile(fileext=".xls")
# Download file from repository to the temp file
download.file("https://github.com/MEF-BDA503/gpj18-r_coders/blob/master/Data_Sources(Excel)/export_1996_2018.xls?raw=true",destfile=tmp,mode='wb')
# Read that excel file.
export_data <- read_excel(tmp)
# Remove the temp file
file.remove(tmp)
Import Data
# Create a temporary file
tmp=tempfile(fileext=".xls")
# Download file from repository to the temp file
download.file("https://github.com/MEF-BDA503/gpj18-r_coders/blob/master/Data_Sources(Excel)/import_1996_2018.xls?raw=true",destfile=tmp,mode='wb')
# Read that excel file.
import_data <- read_excel(tmp)
# Remove the temp file
file.remove(tmp)
Download the files from local computer. For example:
setwd("C:\\Users\\ozenm\\Documents\\GitHub\\gpj18-r_coders\Data_Sources\Excel_Files")
export_data <- read_excel("export_1996_2018.xls")
import_data <- read_excel("import_1996_2018.xls")
Download the rds files from our github repository.
tmp<-tempfile(fileext=".rds")
download.file("https://github.com/MEF-BDA503/gpj18-r_coders/blob/master/Data_Sources(Rds)/Producer_Inflation.rds?raw=true?raw=true",destfile=tmp,mode = 'wb')
producer_inf_data<-read_rds(tmp)
file.remove(tmp)
githubURL_data <- ("https://github.com/MEF-BDA503/gpj18-r_coders/blob/master/Data_Sources(Rds)/Consumer_Inflation.rds?raw=true")
download.file(githubURL_data,"Consumer_Inflation.rds", method="curl")
cons_inf_data<-readRDS("Consumer_Inflation.rds")
tmp<-tempfile(fileext=".rds")
download.file("https://github.com/MEF-BDA503/gpj18-r_coders/blob/master/import_data/Consumer_Inflation.rds?raw=true",destfile=tmp,mode = 'wb')
cons_inf_data<-read_rds(tmp)
file.remove(tmp)
tmp<-tempfile(fileext=".rds")
download.file("https://github.com/MEF-BDA503/gpj18-r_coders/blob/master/import_data/Enflasyon_Data.rds?raw=true",destfile=tmp,mode = 'wb')
enflasyon_data<-read_rds(tmp)
file.remove(tmp)
Format Data
Change the column names of data
colnames(export_data) <- c("Year","Sector_Type_Code","Sector_Name","Total_Amount","January","February","March","April","May","June","July","August","September","October","November","December")
colnames(import_data) <- c("Year","Sector_Type_Code","Sector_Name","Total_Amount","January","February","March","April","May","June","July","August","September","October","November","December")
Remove the NA rows
export_data_without_NA <- export_data[rowSums(is.na(export_data)) != ncol(export_data),]
import_data_without_NA <- import_data[rowSums(is.na(import_data)) != ncol(import_data),]
Analyze Data
class(export_data_without_NA)
dim(export_data_without_NA)
summary(export_data_without_NA)
#Total_Amount is not numeric!!! Check some values
hist(export_data_without_NA$Total_Amount)
Remove the total amount columns
#Drop Total Amount Data
export_data_without_NA_Total <- within(export_data_without_NA, rm(Total_Amount))
#Drop Total Amount Data
import_data_without_NA_Total <- within(import_data_without_NA, rm(Total_Amount))
Convert char data type to numeric data type for January column
str(export_data_without_NA_Total)
x <-suppressWarnings(as.numeric(export_data_without_NA_Total$January))
Control data type for January column
str(x)
sapply(export_data_without_NA_Total, is.numeric)
hist(as.numeric(export_data_without_NA_Total$January))
Convert char data types to numeric data types for other columns
cols = c(4:15);
export_data_without_NA_Total[,cols] = suppressWarnings(apply(export_data_without_NA_Total[,cols], 2, function(x) as.numeric(as.character(x))));
import_data_without_NA_Total[,cols] = suppressWarnings(apply(import_data_without_NA_Total[,cols], 2, function(x) as.numeric(as.character(x))));
Control data type for all columns
str(export_data_without_NA_Total)
str(import_data_without_NA_Total)
Do some tests on data
export_data_without_NA_Total$Year <- str_trim(export_data_without_NA_Total$Year)
head(export_data_without_NA_Total)
#outlier detection. -- Not use it.
out_export_data <- boxplot.stats(export_data_without_NA_Total$January)$out;
out_export_data
Check for the total number of missing values in the entire data
sum(is.na(export_data_without_NA_Total)) #1586
sum(is.na(import_data_without_NA_Total)) #1586
Shorter data name
exp_data <- export_data_without_NA_Total
str(exp_data)
imp_data <- import_data_without_NA_Total
str(imp_data)
Fill the empty years
v_year <- 2017
for (row in 1:nrow(exp_data)) {
year <- exp_data[row, "Year"]
if(!is.na(year) & year == v_year){
#print(paste("YEAR-ROW",year))
#print(paste("row-i",row))
v_year <- v_year - 1
}
exp_data[row, "Year"] <- v_year + 1
if (v_year==2008){
break
}
}
v_year <- 2017
for (row in 1:nrow(imp_data)) {
year <- imp_data[row, "Year"]
if(!is.na(year) & year == v_year){
#print(paste("YEAR-ROW",year))
#print(paste("row-i",row))
v_year <- v_year - 1
}
imp_data[row, "Year"] <- v_year + 1
if (v_year==2008){
break
}
}
Remove the unnecessary rows
exp_data_v2 <- exp_data %>%
slice(6:391)%>% filter(Sector_Name != "Toplam -Total")
imp_data_v2 <- imp_data %>%
slice(6:391)%>% filter(Sector_Name != "Toplam -Total")
Show data
tail(exp_data_v2)
tail(imp_data_v2)
exp_data_v3 <-
exp_data_v2 %>%
gather(key=Month,value=Amount,-Year,-Sector_Type_Code,-Sector_Name)
imp_data_v3 <-
imp_data_v2 %>%
gather(key=Month,value=Amount,-Year,-Sector_Type_Code,-Sector_Name)
Replace the month names with the month numbers
exp_data_v3$Month <- replace(exp_data_v3$Month,exp_data_v3$Month=="January","01")
exp_data_v3$Month <- replace(exp_data_v3$Month,exp_data_v3$Month=="February","02")
exp_data_v3$Month <- replace(exp_data_v3$Month,exp_data_v3$Month=="March","03")
exp_data_v3$Month <- replace(exp_data_v3$Month,exp_data_v3$Month=="April","04")
exp_data_v3$Month <- replace(exp_data_v3$Month,exp_data_v3$Month=="May","05")
exp_data_v3$Month <- replace(exp_data_v3$Month,exp_data_v3$Month=="June","06")
exp_data_v3$Month <- replace(exp_data_v3$Month,exp_data_v3$Month=="July","07")
exp_data_v3$Month <- replace(exp_data_v3$Month,exp_data_v3$Month=="August","08")
exp_data_v3$Month <- replace(exp_data_v3$Month,exp_data_v3$Month=="September","09")
exp_data_v3$Month <- replace(exp_data_v3$Month,exp_data_v3$Month=="October","10")
exp_data_v3$Month <- replace(exp_data_v3$Month,exp_data_v3$Month=="November","11")
exp_data_v3$Month <- replace(exp_data_v3$Month,exp_data_v3$Month=="December","12")
imp_data_v3$Month <- replace(imp_data_v3$Month,imp_data_v3$Month=="January","01")
imp_data_v3$Month <- replace(imp_data_v3$Month,imp_data_v3$Month=="February","02")
imp_data_v3$Month <- replace(imp_data_v3$Month,imp_data_v3$Month=="March","03")
imp_data_v3$Month <- replace(imp_data_v3$Month,imp_data_v3$Month=="April","04")
imp_data_v3$Month <- replace(imp_data_v3$Month,imp_data_v3$Month=="May","05")
imp_data_v3$Month <- replace(imp_data_v3$Month,imp_data_v3$Month=="June","06")
imp_data_v3$Month <- replace(imp_data_v3$Month,imp_data_v3$Month=="July","07")
imp_data_v3$Month <- replace(imp_data_v3$Month,imp_data_v3$Month=="August","08")
imp_data_v3$Month <- replace(imp_data_v3$Month,imp_data_v3$Month=="September","09")
imp_data_v3$Month <- replace(imp_data_v3$Month,imp_data_v3$Month=="October","10")
imp_data_v3$Month <- replace(imp_data_v3$Month,imp_data_v3$Month=="November","11")
imp_data_v3$Month <- replace(imp_data_v3$Month,imp_data_v3$Month=="December","12")
Convert about date columns as lubridate
exp_data_v4 <- exp_data_v3 %>% mutate(Date = lubridate::as_date(paste(Year,
as.integer(Month), as.integer("01"), sep = "-")))
imp_data_v4 <- imp_data_v3 %>% mutate(Date = lubridate::as_date(paste(Year,
as.integer(Month), as.integer("01"), sep = "-")))
Remove unnecessary columns
exp_data_v5 <- exp_data_v4 %>% select ( Date, Sector_Type_Code, Sector_Name, Amount)
imp_data_v5 <- imp_data_v4 %>% select ( Date, Sector_Type_Code, Sector_Name, Amount)
Show data structure
str(exp_data_v5)
str(imp_data_v5)
Save as rds files
saveRDS(exp_data_v5,file="exp_data.rds")
#export_data_rds <- readRDS("exp_data.rds")
saveRDS(imp_data_v5,file="imp_data.rds")
#import_data_rds <- readRDS("imp_data.rds")
Download new rds files
tmp<-tempfile(fileext=".rds")
download.file("https://github.com/MEF-BDA503/gpj18-r_coders/blob/master/Data_Sources(Rds)/imp_data.rds?raw=true",destfile=tmp,mode = 'wb')
imp_data<-read_rds(tmp)
file.remove(tmp)
tmp<-tempfile(fileext=".rds")
download.file("https://github.com/MEF-BDA503/gpj18-r_coders/blob/master/Data_Sources(Rds)/exp_data.rds?raw=true",destfile=tmp,mode = 'wb')
exp_data<-read_rds(tmp)
file.remove(tmp)
Data transformation
exp_data_without_NA <- exp_data
exp_data_without_NA[is.na(exp_data_without_NA <- exp_data)] <- 0
exp_data_without_NA
imp_data_without_NA <- imp_data
imp_data_without_NA[is.na(imp_data_without_NA <- imp_data)] <- 0
imp_data_without_NA
Convert date column POSIXct to Date and check data types
str(us_rate_data)
us_rate_data$Date <- as.Date(as.character(as.POSIXct(us_rate_data$Date)))
str(us_rate_data)
str(cons_inf_data)
cons_inf_data$Date <- as.Date(as.character(as.POSIXct(cons_inf_data$Date)))
str(cons_inf_data)
str(cons_inf_data)
producer_inf_data$Date <- as.Date(as.character(as.POSIXct(producer_inf_data$Date)))
str(cons_inf_data)
Join data
install.packages("tidyverse")
library("tidyverse")
exp_data_v2 <-
exp_data_without_NA %>%
group_by(Date) %>%
summarise(Export_Total_Amount=sum(Amount))
imp_data_v2 <-
imp_data_without_NA %>%
group_by(Date) %>%
summarise(Import_Total_Amount=sum(Amount))
exp_data_v3 <- inner_join(exp_data_v2,cons_inf_data, by=c("Date" = "Date"))
exp_data_v4 <- inner_join(exp_data_v3,us_rate_data, by=c("Date" = "Date"))
imp_data_v3 <- inner_join(imp_data_v2,cons_inf_data, by=c("Date" = "Date"))
imp_data_v4 <- inner_join(imp_data_v3,us_rate_data, by=c("Date" = "Date"))
Change column names
colnames(exp_data_v4) <- c("Date","Export_Total_Amount","Consumer_Price_Index_Yearly_Change","Consumer_Price_Index_Monthly_Change","USD_Rate")
colnames(imp_data_v4) <- c("Date","Export_Total_Amount","Consumer_Price_Index_Yearly_Change","Consumer_Price_Index_Monthly_Change","USD_Rate")
Save as RDS files
saveRDS(imp_data_v4,file='imp_data_final.rds')
saveRDS(exp_data_v4,file='exp_data_final.rds')