Load egm_example_data

Examine the database

#Check if there is any errors in reading.
head(egm_data)
## # A tibble: 6 x 15
##   date      pension_fund_company     n_of_participan~ fund_size_participa~
##   <chr>     <chr>                               <dbl>                <dbl>
## 1 06.01.20~ Aegon Emeklilik ve Hayat            42826            139193651
## 2 06.01.20~ Allianz Hayat ve Emekli~           118867           1840701596
## 3 06.01.20~ Allianz Yaşam ve Emekli~           726684           7387398235
## 4 06.01.20~ Anadolu Hayat Emeklilik           1119572           9975066501
## 5 06.01.20~ Asya Emeklilik ve Hayat            129389            337754079
## 6 06.01.20~ Avivasa Emeklilik ve Ha~           876894          10550032358
## # ... with 11 more variables: gov_contribution <dbl>, contribution <dbl>,
## #   n_of_pensioners <chr>, n_of_ind_contracts <dbl>,
## #   n_of_group_ind_contracts <chr>,
## #   n_of_employer_group_certificates <chr>, n_total <dbl>,
## #   size_of_ind_contracts <dbl>, size_of_group_ind_contracts <dbl>,
## #   size_of_employer_group_certificates <dbl>, size_total <dbl>
#And tail, just to be sure.
tail(egm_data)
## # A tibble: 6 x 15
##   date      pension_fund_company     n_of_participan~ fund_size_participa~
##   <chr>     <chr>                               <dbl>                <dbl>
## 1 30.03.20~ Halk Hayat ve Emeklilik            511848           2845016752
## 2 30.03.20~ Katılım Emeklilik ve Ha~           205994            907989162
## 3 30.03.20~ Metlife Emeklilik ve Ha~           187483           1368676067
## 4 30.03.20~ NN Hayat ve Emeklilik              265725           2607702561
## 5 30.03.20~ Vakıf Emeklilik ve Hayat           502994           5375415236
## 6 30.03.20~ Ziraat Hayat ve Emeklil~           598703           3105479818
## # ... with 11 more variables: gov_contribution <dbl>, contribution <dbl>,
## #   n_of_pensioners <chr>, n_of_ind_contracts <dbl>,
## #   n_of_group_ind_contracts <chr>,
## #   n_of_employer_group_certificates <chr>, n_total <dbl>,
## #   size_of_ind_contracts <dbl>, size_of_group_ind_contracts <dbl>,
## #   size_of_employer_group_certificates <dbl>, size_total <dbl>
#Next step is looking at the structure of our dataframe.
str(egm_data)
## Classes 'tbl_df', 'tbl' and 'data.frame':    1368 obs. of  15 variables:
##  $ date                               : chr  "06.01.2017" "06.01.2017" "06.01.2017" "06.01.2017" ...
##  $ pension_fund_company               : chr  "Aegon Emeklilik ve Hayat" "Allianz Hayat ve Emeklilik" "Allianz Yaşam ve Emeklilik" "Anadolu Hayat Emeklilik" ...
##  $ n_of_participants                  : num  42826 118867 726684 1119572 129389 ...
##  $ fund_size_participants             : num  1.39e+08 1.84e+09 7.39e+09 9.98e+09 3.38e+08 ...
##  $ gov_contribution                   : num  1.44e+07 2.31e+08 8.15e+08 1.48e+09 7.77e+07 ...
##  $ contribution                       : num  1.10e+08 1.47e+09 5.80e+09 8.25e+09 2.99e+08 ...
##  $ n_of_pensioners                    : chr  "269" "2300" "5640" "11467" ...
##  $ n_of_ind_contracts                 : num  34003 113222 627972 938794 39160 ...
##  $ n_of_group_ind_contracts           : chr  "14127" "18813" "148377" "323209" ...
##  $ n_of_employer_group_certificates   : chr  "884" "4488" "98772" "27232" ...
##  $ n_total                            : num  49014 136523 875121 1289235 135451 ...
##  $ size_of_ind_contracts              : num  8.71e+07 1.17e+09 4.00e+09 6.05e+09 1.28e+08 ...
##  $ size_of_group_ind_contracts        : num  1.77e+07 2.45e+08 1.32e+09 1.91e+09 1.57e+08 ...
##  $ size_of_employer_group_certificates: num  1.81e+06 4.27e+07 3.78e+08 1.63e+08 1.20e+07 ...
##  $ size_total                         : num  1.07e+08 1.45e+09 5.70e+09 8.12e+09 2.97e+08 ...

Analysis

Analysis 1

For my first analysis I want to see the data on each company.

#Check if all companies have the same amount of data entries.
egm_data %>%
  group_by(pension_fund_company) %>% arrange(pension_fund_company) %>% count()
## # A tibble: 20 x 2
## # Groups:   pension_fund_company [20]
##    pension_fund_company                n
##    <chr>                           <int>
##  1 Aegon Emeklilik ve Hayat           76
##  2 Allianz Hayat ve Emeklilik         76
##  3 Allianz Yaşam ve Emeklilik         76
##  4 Anadolu Hayat Emeklilik            76
##  5 Asya Emeklilik ve Hayat             7
##  6 Avivasa Emeklilik ve Hayat         76
##  7 Axa Hayat ve Emeklilik             76
##  8 Bereket Emeklilik ve Hayat         69
##  9 BNP Paribas Cardif Emeklilik       76
## 10 Cigna Finans Emeklilik ve Hayat    76
## 11 Fiba Emeklilik ve Hayat            76
## 12 Garanti Emeklilik ve Hayat         76
## 13 Groupama Emeklilik                 76
## 14 Halk Hayat ve Emeklilik            76
## 15 Katılım Emeklilik ve Hayat         76
## 16 Metlife Emeklilik ve Hayat         76
## 17 NN Hayat ve Emeklilik              76
## 18 Vakıf Emeklilik                    35
## 19 Vakıf Emeklilik ve Hayat           41
## 20 Ziraat Hayat ve Emeklilik          76

All companies seem to have 76 data entries except for two. For stability purposes, let’s deduct those two. For my first analysis, I want to compare the number of participants for the companies.

#Restructure our data for number of participant analysis.
a1_plot <- egm_data %>%
  filter(pension_fund_company != c("Asya Emeklilik ve Hayat", "Bereket Emeklilik ve Hayat")) %>% 
  group_by(pension_fund_company) %>% arrange(pension_fund_company) %>%
  summarise(mean_of_participants=mean(n_of_participants, na.rm = TRUE)) %>%
  arrange(desc(mean_of_participants)) %>%
  filter(mean_of_participants > 200000)
a1_plot
## # A tibble: 9 x 2
##   pension_fund_company       mean_of_participants
##   <chr>                                     <dbl>
## 1 Garanti Emeklilik ve Hayat             1185464.
## 2 Anadolu Hayat Emeklilik                1135500.
## 3 Avivasa Emeklilik ve Hayat              883312.
## 4 Allianz Yaşam ve Emeklilik              775468.
## 5 Ziraat Hayat ve Emeklilik               557695.
## 6 Vakıf Emeklilik ve Hayat                487677.
## 7 Halk Hayat ve Emeklilik                 478004.
## 8 Vakıf Emeklilik                         470575.
## 9 NN Hayat ve Emeklilik                   266536.

Garanti and Anadolu Companies have the largest number of participants on average. Lets see this in a graph.

#Restructure our data for number of participant analysis.
ggplot(data = a1_plot, aes(x=pension_fund_company, y=mean_of_participants, fill=pension_fund_company))+
  geom_bar(stat="identity")+
  coord_flip()+
  coord_polar()+
  theme(legend.position = "none", axis.text.x = element_text(angle = 10))+
  geom_text(aes(y = round(mean_of_participants) , label = (round(mean_of_participants/1000000, digits = 2))))+
  labs(title = "Average number of participants of top 9 companies (Millions)", x="", y="")

It is clear here that Anadolu Hayat and Garanti has the most number of participants.

Analysis 2

For the second analysis I want to see how the number of participants have changed troughout time for the top 5 companies.

#Extract the top 5 companies
top_5_list <- c("Garanti Emeklilik ve Hayat", "Anadolu Hayat Emeklilik", "Avivasa Emeklilik ve Hayat",
                "Allianz Yaşam ve Emeklilik", "Ziraat Hayat ve Emeklilik",  "Vakıf Emeklilik ve Hayat")
a2_plot <- egm_data %>%
  filter(pension_fund_company %in% top_5_list) %>%
  arrange(date) %>%
  mutate(date = as.Date(date, format("%d.%m.%Y"))) %>%
  mutate(participants = n_of_participants/1000000) %>%
  select(date, pension_fund_company, participants) 

Now that we have the tidied dataset, let’s see how the number of participants have changed troughout time.

#Plot 
ggplot(data=a2_plot, aes(x=date, y=participants, color=pension_fund_company)) +
  geom_point() +
  scale_x_date(date_breaks = "1 month") +
  theme(axis.text.x = element_text(angle=90)) +
  labs(title = "Number of participants based on Time", y = "participants (Millions)")

While Garanti and Anadolu shows a slight but steady increase, Allianz and Avivasa presents a stagnation profile and Vakıf and Ziraat is constantly increasing.