install.packages("readxl")
##
## The downloaded binary packages are in
## /var/folders/qv/khc24w491dn9d18j77490j380000gn/T//RtmpzeKSCn/downloaded_packages
install.packages("tidyverse")
##
## The downloaded binary packages are in
## /var/folders/qv/khc24w491dn9d18j77490j380000gn/T//RtmpzeKSCn/downloaded_packages
library(readxl)
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.7
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
Create a temporary file
tmp<-tempfile(fileext=".xlsx")
Download file from repository to the temp file
download.file("https://github.com/MEF-BDA503/pj18-mkaracabey/blob/master/EGM_Analysis/egm_example_data.xlsx?raw=true",mode = 'wb',destfile=tmp)
raw_data<-readxl::read_excel(tmp,skip=7,col_names=FALSE)
Remove the temp file
file.remove(tmp)
## [1] TRUE
head(raw_data)
## # A tibble: 6 x 15
## X__1 X__2 X__3 X__4 X__5 X__6 X__7 X__8 X__9 X__10 X__11
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <chr> <chr> <dbl>
## 1 06.0… Axa … 3.33e4 2.11e8 4.33e7 1.94e8 23 38686 856 522 4.01e4
## 2 06.0… BNP … 1.87e5 1.55e9 2.02e8 1.30e9 1196 149094 48359 29668 2.27e5
## 3 06.0… Cign… 1.23e5 5.64e8 1.04e8 5.01e8 57 107774 23016 2304 1.33e5
## 4 06.0… Fiba… 3.88e4 2.48e8 3.46e7 2.16e8 103 33841 8088 1075 4.30e4
## 5 06.0… Gara… 1.13e6 8.42e9 1.17e9 6.86e9 5669 963726 1918… 58214 1.21e6
## 6 06.0… Grou… 6.21e4 8.09e8 8.31e7 6.17e8 1963 61411 8351 1012 7.08e4
## # ... with 4 more variables: X__12 <dbl>, X__13 <dbl>, X__14 <dbl>,
## # X__15 <dbl>
tail(raw_data)
## # A tibble: 6 x 15
## X__1 X__2 X__3 X__4 X__5 X__6 X__7 X__8 X__9 X__10 X__11
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <chr> <chr> <dbl>
## 1 30.0… Halk… 511848 2.85e9 5.28e8 2.45e9 228 296337 2281… 56967 581442
## 2 30.0… Katı… 205994 9.08e8 1.82e8 8.13e8 50 174654 48494 52861 276009
## 3 30.0… Metl… 187483 1.37e9 2.34e8 1.14e9 312 153787 35060 17179 206026
## 4 30.0… NN H… 265725 2.61e9 3.82e8 1.98e9 3771 265714 33290 10259 309263
## 5 30.0… Vakı… 502994 5.38e9 7.57e8 4.18e9 6138 628544 75807 29694 734045
## 6 30.0… Zira… 598703 3.11e9 5.90e8 2.61e9 869 406934 2179… 53558 678456
## # ... with 4 more variables: X__12 <dbl>, X__13 <dbl>, X__14 <dbl>,
## # X__15 <dbl>
Define column names
colnames(raw_data) <- c("date","pension_fund_company","n_of_participants", "fund_size_participants", "gov_contribution", "contribution", "n_of_pensioners", "n_of_ind_contracts", "n_of_group_ind_contracts", "n_of_employer_group_certificates", "n_total","size_of_ind_contracts", "size_of_group_ind_contracts", "size_of_employer_group_certificates" ,"size_total")
egm_data <- raw_data %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.)))
print(egm_data,width=Inf)
## # A tibble: 1,362 x 15
## date pension_fund_company n_of_participants
## <chr> <chr> <dbl>
## 1 06.01.2017 Axa Hayat ve Emeklilik 33327
## 2 06.01.2017 BNP Paribas Cardif Emeklilik 187289
## 3 06.01.2017 Cigna Finans Emeklilik ve Hayat 122997
## 4 06.01.2017 Fiba Emeklilik ve Hayat 38754
## 5 06.01.2017 Garanti Emeklilik ve Hayat 1134051
## 6 06.01.2017 Groupama Emeklilik 62080
## 7 06.01.2017 Halk Hayat ve Emeklilik 445422
## 8 06.01.2017 Katılım Emeklilik ve Hayat 162207
## 9 06.01.2017 Metlife Emeklilik ve Hayat 179609
## 10 06.01.2017 NN Hayat ve Emeklilik 268929
## fund_size_participants gov_contribution contribution n_of_pensioners
## <dbl> <dbl> <dbl> <chr>
## 1 210804109 43342953 193638362 23
## 2 1545869615 202292735 1299990793 1196
## 3 564291355 103628903 501174178 57
## 4 247746061 34576421 216460096 103
## 5 8416458527 1169252346 6858941516 5669
## 6 808664284 83060092 616889142 1963
## 7 1948976004 349943882 1778929612 111
## 8 533523277 105023045 508788631 20
## 9 1015462865 167726855 908695745 164
## 10 2078148069 278835177 1691654417 2337
## n_of_ind_contracts n_of_group_ind_contracts
## <dbl> <chr>
## 1 38686 856
## 2 149094 48359
## 3 107774 23016
## 4 33841 8088
## 5 963726 191898
## 6 61411 8351
## 7 214731 238995
## 8 109836 46228
## 9 143625 35857
## 10 261658 39099
## n_of_employer_group_certificates n_total size_of_ind_contracts
## <chr> <dbl> <dbl>
## 1 522 40064 179784075
## 2 29668 227121 1011515864
## 3 2304 133094 410000280
## 4 1075 43004 177009961
## 5 58214 1213838 5088145866
## 6 1012 70774 540569297
## 7 49978 503704 739849661
## 8 49067 205131 349524378
## 9 18318 197800 626504799
## 10 11007 311764 1410370190
## size_of_group_ind_contracts size_of_employer_group_certificates
## <dbl> <dbl>
## 1 7791701 3542753
## 2 216811175 56924248
## 3 78595897 3171981
## 4 30913194 5888490
## 5 1436618301 223937666
## 6 66914330 2532006
## 7 928353692 99514882
## 8 112523243 38497518
## 9 160814973 111520785
## 10 229308207 27057100
## size_total
## <dbl>
## 1 191118529
## 2 1285251288
## 3 491768158
## 4 213811645
## 5 6748701834
## 6 610015633
## 7 1767718235
## 8 500545140
## 9 898840557
## 10 1666735497
## # ... with 1,352 more rows
saveRDS(egm_data,file="egm_data.rds")
egm_data %>%
select(date,n_of_participants,pension_fund_company,n_of_pensioners,n_of_ind_contracts) %>% arrange(n_of_pensioners) %>%
filter(n_of_participants>50000 & n_of_participants<100000)
## # A tibble: 76 x 5
## date n_of_participan… pension_fund_co… n_of_pensioners n_of_ind_contr…
## <chr> <dbl> <chr> <chr> <dbl>
## 1 06.0… 62080 Groupama Emekli… 1963 61411
## 2 13.0… 61925 Groupama Emekli… 1981 61246
## 3 20.0… 61774 Groupama Emekli… 1998 61064
## 4 27.0… 61590 Groupama Emekli… 2023 60852
## 5 31.0… 61591 Groupama Emekli… 2027 60855
## 6 03.0… 61576 Groupama Emekli… 2039 60833
## 7 10.0… 61427 Groupama Emekli… 2056 60654
## 8 17.0… 61298 Groupama Emekli… 2073 60495
## 9 24.0… 61053 Groupama Emekli… 2096 60222
## 10 28.0… 61050 Groupama Emekli… 2102 60217
## # ... with 66 more rows
egm_data %>% group_by(date) %>%summarise(MeanOfParticipants= mean(n_of_participants))
## # A tibble: 76 x 2
## date MeanOfParticipants
## <chr> <dbl>
## 1 01.09.2017 379582.
## 2 01.12.2017 383313.
## 3 02.02.2018 384803.
## 4 02.03.2018 386044.
## 5 02.06.2017 377354.
## 6 03.02.2017 373374.
## 7 03.03.2017 374766.
## 8 03.11.2017 381675
## 9 04.08.2017 379189.
## 10 05.01.2018 384943.
## # ... with 66 more rows
ggplot(data=egm_data ,aes(x = n_of_pensioners,y =n_of_employer_group_certificates)) + geom_point()
egm_data %>% select(n_of_ind_contracts:n_of_group_ind_contracts) %>% gather(key = n_of_ind_contracts, value = n_of_group_ind_contracts) %>% ggplot(data = ., aes(x = n_of_ind_contracts, y =n_of_group_ind_contracts )) + geom_bar(stat = "identity")