#Check if there is any errors in reading.
head(egm_data)
## # A tibble: 6 x 15
## date pension_fund_company n_of_participan~ fund_size_participa~
## <chr> <chr> <dbl> <dbl>
## 1 06.01.20~ Aegon Emeklilik ve Hayat 42826 139193651
## 2 06.01.20~ Allianz Hayat ve Emekli~ 118867 1840701596
## 3 06.01.20~ Allianz Yaşam ve Emekli~ 726684 7387398235
## 4 06.01.20~ Anadolu Hayat Emeklilik 1119572 9975066501
## 5 06.01.20~ Asya Emeklilik ve Hayat 129389 337754079
## 6 06.01.20~ Avivasa Emeklilik ve Ha~ 876894 10550032358
## # ... with 11 more variables: gov_contribution <dbl>, contribution <dbl>,
## # n_of_pensioners <chr>, n_of_ind_contracts <dbl>,
## # n_of_group_ind_contracts <chr>,
## # n_of_employer_group_certificates <chr>, n_total <dbl>,
## # size_of_ind_contracts <dbl>, size_of_group_ind_contracts <dbl>,
## # size_of_employer_group_certificates <dbl>, size_total <dbl>
#And tail, just to be sure.
tail(egm_data)
## # A tibble: 6 x 15
## date pension_fund_company n_of_participan~ fund_size_participa~
## <chr> <chr> <dbl> <dbl>
## 1 30.03.20~ Halk Hayat ve Emeklilik 511848 2845016752
## 2 30.03.20~ Katılım Emeklilik ve Ha~ 205994 907989162
## 3 30.03.20~ Metlife Emeklilik ve Ha~ 187483 1368676067
## 4 30.03.20~ NN Hayat ve Emeklilik 265725 2607702561
## 5 30.03.20~ Vakıf Emeklilik ve Hayat 502994 5375415236
## 6 30.03.20~ Ziraat Hayat ve Emeklil~ 598703 3105479818
## # ... with 11 more variables: gov_contribution <dbl>, contribution <dbl>,
## # n_of_pensioners <chr>, n_of_ind_contracts <dbl>,
## # n_of_group_ind_contracts <chr>,
## # n_of_employer_group_certificates <chr>, n_total <dbl>,
## # size_of_ind_contracts <dbl>, size_of_group_ind_contracts <dbl>,
## # size_of_employer_group_certificates <dbl>, size_total <dbl>
#Next step is looking at the structure of our dataframe.
str(egm_data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1368 obs. of 15 variables:
## $ date : chr "06.01.2017" "06.01.2017" "06.01.2017" "06.01.2017" ...
## $ pension_fund_company : chr "Aegon Emeklilik ve Hayat" "Allianz Hayat ve Emeklilik" "Allianz Yaşam ve Emeklilik" "Anadolu Hayat Emeklilik" ...
## $ n_of_participants : num 42826 118867 726684 1119572 129389 ...
## $ fund_size_participants : num 1.39e+08 1.84e+09 7.39e+09 9.98e+09 3.38e+08 ...
## $ gov_contribution : num 1.44e+07 2.31e+08 8.15e+08 1.48e+09 7.77e+07 ...
## $ contribution : num 1.10e+08 1.47e+09 5.80e+09 8.25e+09 2.99e+08 ...
## $ n_of_pensioners : chr "269" "2300" "5640" "11467" ...
## $ n_of_ind_contracts : num 34003 113222 627972 938794 39160 ...
## $ n_of_group_ind_contracts : chr "14127" "18813" "148377" "323209" ...
## $ n_of_employer_group_certificates : chr "884" "4488" "98772" "27232" ...
## $ n_total : num 49014 136523 875121 1289235 135451 ...
## $ size_of_ind_contracts : num 8.71e+07 1.17e+09 4.00e+09 6.05e+09 1.28e+08 ...
## $ size_of_group_ind_contracts : num 1.77e+07 2.45e+08 1.32e+09 1.91e+09 1.57e+08 ...
## $ size_of_employer_group_certificates: num 1.81e+06 4.27e+07 3.78e+08 1.63e+08 1.20e+07 ...
## $ size_total : num 1.07e+08 1.45e+09 5.70e+09 8.12e+09 2.97e+08 ...
For my first analysis I want to see the data on each company.
#Check if all companies have the same amount of data entries.
egm_data %>%
group_by(pension_fund_company) %>% arrange(pension_fund_company) %>% count()
## # A tibble: 20 x 2
## # Groups: pension_fund_company [20]
## pension_fund_company n
## <chr> <int>
## 1 Aegon Emeklilik ve Hayat 76
## 2 Allianz Hayat ve Emeklilik 76
## 3 Allianz Yaşam ve Emeklilik 76
## 4 Anadolu Hayat Emeklilik 76
## 5 Asya Emeklilik ve Hayat 7
## 6 Avivasa Emeklilik ve Hayat 76
## 7 Axa Hayat ve Emeklilik 76
## 8 Bereket Emeklilik ve Hayat 69
## 9 BNP Paribas Cardif Emeklilik 76
## 10 Cigna Finans Emeklilik ve Hayat 76
## 11 Fiba Emeklilik ve Hayat 76
## 12 Garanti Emeklilik ve Hayat 76
## 13 Groupama Emeklilik 76
## 14 Halk Hayat ve Emeklilik 76
## 15 Katılım Emeklilik ve Hayat 76
## 16 Metlife Emeklilik ve Hayat 76
## 17 NN Hayat ve Emeklilik 76
## 18 Vakıf Emeklilik 35
## 19 Vakıf Emeklilik ve Hayat 41
## 20 Ziraat Hayat ve Emeklilik 76
All companies seem to have 76 data entries except for two. For stability purposes, let’s deduct those two. For my first analysis, I want to compare the number of participants for the companies.
#Restructure our data for number of participant analysis.
a1_plot <- egm_data %>%
filter(pension_fund_company != c("Asya Emeklilik ve Hayat", "Bereket Emeklilik ve Hayat")) %>%
group_by(pension_fund_company) %>% arrange(pension_fund_company) %>%
summarise(mean_of_participants=mean(n_of_participants, na.rm = TRUE)) %>%
arrange(desc(mean_of_participants)) %>%
filter(mean_of_participants > 200000)
a1_plot
## # A tibble: 9 x 2
## pension_fund_company mean_of_participants
## <chr> <dbl>
## 1 Garanti Emeklilik ve Hayat 1185464.
## 2 Anadolu Hayat Emeklilik 1135500.
## 3 Avivasa Emeklilik ve Hayat 883312.
## 4 Allianz Yaşam ve Emeklilik 775468.
## 5 Ziraat Hayat ve Emeklilik 557695.
## 6 Vakıf Emeklilik ve Hayat 487677.
## 7 Halk Hayat ve Emeklilik 478004.
## 8 Vakıf Emeklilik 470575.
## 9 NN Hayat ve Emeklilik 266536.
Garanti and Anadolu Companies have the largest number of participants on average. Lets see this in a graph.
#Restructure our data for number of participant analysis.
ggplot(data = a1_plot, aes(x=pension_fund_company, y=mean_of_participants, fill=pension_fund_company))+
geom_bar(stat="identity")+
coord_flip()+
coord_polar()+
theme(legend.position = "none", axis.text.x = element_text(angle = 10))+
geom_text(aes(y = round(mean_of_participants) , label = (round(mean_of_participants/1000000, digits = 2))))+
labs(title = "Average number of participants of top 9 companies (Millions)", x="", y="")
It is clear here that Anadolu Hayat and Garanti has the most number of participants.
For the second analysis I want to see how the number of participants have changed troughout time for the top 5 companies.
#Extract the top 5 companies
top_5_list <- c("Garanti Emeklilik ve Hayat", "Anadolu Hayat Emeklilik", "Avivasa Emeklilik ve Hayat",
"Allianz Yaşam ve Emeklilik", "Ziraat Hayat ve Emeklilik", "Vakıf Emeklilik ve Hayat")
a2_plot <- egm_data %>%
filter(pension_fund_company %in% top_5_list) %>%
arrange(date) %>%
mutate(date = as.Date(date, format("%d.%m.%Y"))) %>%
mutate(participants = n_of_participants/1000000) %>%
select(date, pension_fund_company, participants)
Now that we have the tidied dataset, let’s see how the number of participants have changed troughout time.
#Plot
ggplot(data=a2_plot, aes(x=date, y=participants, color=pension_fund_company)) +
geom_point() +
scale_x_date(date_breaks = "1 month") +
theme(axis.text.x = element_text(angle=90)) +
labs(title = "Number of participants based on Time", y = "participants (Millions)")
While Garanti and Anadolu shows a slight but steady increase, Allianz and Avivasa presents a stagnation profile and Vakıf and Ziraat is constantly increasing.