Download Raw Data

Our raw excel file is in our repository. We can automatically download that file and put it in a temporary file. Then we can read that excel document into R and remove the temp file.

# Create a temporary file
tmp=tempfile(fileext=".xlsx")

# Download file from repository to the temp file
download.file("https://github.com/MEF-BDA503/pj18-baysalu/blob/master/files/egm_example_data.xlsx?raw=true",destfile=tmp,mode='wb')

# Read that excel file using readxl package's read_excel function. You might need to adjust the parameters (skip, col_names) according to your raw file's format.
raw_data=readxl::read_excel(tmp)

# Remove the temp file
file.remove(tmp)
## [1] TRUE
# Let's see our raw data
head(raw_data)
## # A tibble: 6 x 15
##   date  pension_fund_co~ n_of_participan~ fund_size_parti~ gov_contribution
##   <chr> <chr>                       <dbl>            <dbl>            <dbl>
## 1 06.0~ Aegon Emeklilik~            42826        139193651         14409322
## 2 06.0~ Allianz Hayat v~           118867       1840701596        231156574
## 3 06.0~ Allianz Yasam v~           726684       7387398235        815356276
## 4 06.0~ Anadolu Hayat E~          1119572       9975066501       1484745501
## 5 06.0~ Asya Emeklilik ~           129389        337754079         77689275
## 6 06.0~ Avivasa Emeklil~           876894      10550032358       1355045933
## # ... with 10 more variables: contribution <dbl>, n_of_pensioners <chr>,
## #   n_of_ind_contracts <dbl>, n_of_group_ind_contracts <chr>,
## #   n_of_employer_group_certificates <chr>, n_total <dbl>,
## #   size_of_ind_contracts <dbl>, size_of_group_ind_contracts <dbl>,
## #   size_of_employer_group_certificates <dbl>, size_total <dbl>
tail(raw_data)
## # A tibble: 6 x 15
##   date  pension_fund_co~ n_of_participan~ fund_size_parti~ gov_contribution
##   <chr> <chr>                       <dbl>            <dbl>            <dbl>
## 1 30.0~ Halk Hayat ve E~           511848       2845016752        527923905
## 2 30.0~ Katilim Emeklil~           205994        907989162        182110867
## 3 30.0~ Metlife Emeklil~           187483       1368676067        233517732
## 4 30.0~ NN Hayat ve Eme~           265725       2607702561        382046370
## 5 30.0~ Vakif Emeklilik~           502994       5375415236        757466988
## 6 30.0~ Ziraat Hayat ve~           598703       3105479818        590220958
## # ... with 10 more variables: contribution <dbl>, n_of_pensioners <chr>,
## #   n_of_ind_contracts <dbl>, n_of_group_ind_contracts <chr>,
## #   n_of_employer_group_certificates <chr>, n_total <dbl>,
## #   size_of_ind_contracts <dbl>, size_of_group_ind_contracts <dbl>,
## #   size_of_employer_group_certificates <dbl>, size_total <dbl>
# Let's convert data frame column from character to numeric
raw_data$n_of_group_ind_contracts = as.numeric(as.character(raw_data$n_of_group_ind_contracts))
raw_data$n_of_pensioners = as.numeric(as.character(raw_data$n_of_pensioners))
raw_data$n_of_employer_group_certificates = as.numeric(as.character(raw_data$n_of_employer_group_certificates))
head(raw_data)
## # A tibble: 6 x 15
##   date  pension_fund_co~ n_of_participan~ fund_size_parti~ gov_contribution
##   <chr> <chr>                       <dbl>            <dbl>            <dbl>
## 1 06.0~ Aegon Emeklilik~            42826        139193651         14409322
## 2 06.0~ Allianz Hayat v~           118867       1840701596        231156574
## 3 06.0~ Allianz Yasam v~           726684       7387398235        815356276
## 4 06.0~ Anadolu Hayat E~          1119572       9975066501       1484745501
## 5 06.0~ Asya Emeklilik ~           129389        337754079         77689275
## 6 06.0~ Avivasa Emeklil~           876894      10550032358       1355045933
## # ... with 10 more variables: contribution <dbl>, n_of_pensioners <dbl>,
## #   n_of_ind_contracts <dbl>, n_of_group_ind_contracts <dbl>,
## #   n_of_employer_group_certificates <dbl>, n_total <dbl>,
## #   size_of_ind_contracts <dbl>, size_of_group_ind_contracts <dbl>,
## #   size_of_employer_group_certificates <dbl>, size_total <dbl>

Finish With Some Analysis

raw_data2 = raw_data %>% 
  group_by(pension_fund_company) %>%
  summarise(sum_n_total=sum(n_total)) %>%
  arrange(desc(sum_n_total)) %>%
  filter(sum_n_total > 45000000) 
  print(raw_data2)
## # A tibble: 5 x 2
##   pension_fund_company       sum_n_total
##   <chr>                            <dbl>
## 1 Anadolu Hayat Emeklilik       99557810
## 2 Garanti Emeklilik ve Hayat    96132459
## 3 Avivasa Emeklilik ve Hayat    84707101
## 4 Allianz Yasam ve Emeklilik    70880813
## 5 Ziraat Hayat ve Emeklilik     48296866
raw_data2 %>% 
   ggplot(data = ., aes(x = pension_fund_company, y = sum_n_total, 
   fill = pension_fund_company)) + geom_bar(stat = "identity")  + aes(x = reorder(pension_fund_company, -sum_n_total), y = sum_n_total) + labs(x = "", y = "", title = "Top 5 according to Total Pension Funds Number") + theme_bw() + theme(legend.position = "none", axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 0.5, size = 12))

raw_data3 = raw_data %>% 
  group_by(pension_fund_company) %>%
  summarise(sum_size_total=sum(size_total)) %>%
  arrange(desc(sum_size_total)) %>%
  filter(sum_size_total > 165000000000) 
  print(raw_data3)
## # A tibble: 5 x 2
##   pension_fund_company       sum_size_total
##   <chr>                               <dbl>
## 1 Avivasa Emeklilik ve Hayat   716465921455
## 2 Anadolu Hayat Emeklilik      680350979685
## 3 Garanti Emeklilik ve Hayat   558007446946
## 4 Allianz Yasam ve Emeklilik   481613261122
## 5 Ziraat Hayat ve Emeklilik    166173034923
raw_data3 %>% 
   ggplot(data = ., aes(x = pension_fund_company, y = sum_size_total, 
   fill = pension_fund_company)) + geom_bar(stat = "identity")  + aes(x = reorder(pension_fund_company, -sum_size_total), y = sum_size_total) + labs(x = "", y = "", title = "Top 5 according to Total Pension Funds Size") + theme_bw() + theme(legend.position = "none", axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 0.5, size = 12))