Excel file provided in class, letās start by reading it
# I am assigning excel data to a variable called egm
egm <- readxl::read_excel("D:/Data Analytics Esentials/week4/egm_example_data.xlsx")
# Let's have a look at data structure
str(egm)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1368 obs. of 15 variables:
## $ date : chr "06.01.2017" "06.01.2017" "06.01.2017" "06.01.2017" ...
## $ pension_fund_company : chr "Aegon Emeklilik ve Hayat" "Allianz Hayat ve Emeklilik" "Allianz Yasam ve Emeklilik" "Anadolu Hayat Emeklilik" ...
## $ n_of_participants : num 42826 118867 726684 1119572 129389 ...
## $ fund_size_participants : num 1.39e+08 1.84e+09 7.39e+09 9.98e+09 3.38e+08 ...
## $ gov_contribution : num 1.44e+07 2.31e+08 8.15e+08 1.48e+09 7.77e+07 ...
## $ contribution : num 1.10e+08 1.47e+09 5.80e+09 8.25e+09 2.99e+08 ...
## $ n_of_pensioners : chr "269" "2300" "5640" "11467" ...
## $ n_of_ind_contracts : num 34003 113222 627972 938794 39160 ...
## $ n_of_group_ind_contracts : chr "14127" "18813" "148377" "323209" ...
## $ n_of_employer_group_certificates : chr "884" "4488" "98772" "27232" ...
## $ n_total : num 49014 136523 875121 1289235 135451 ...
## $ size_of_ind_contracts : num 8.71e+07 1.17e+09 4.00e+09 6.05e+09 1.28e+08 ...
## $ size_of_group_ind_contracts : num 1.77e+07 2.45e+08 1.32e+09 1.91e+09 1.57e+08 ...
## $ size_of_employer_group_certificates: num 1.81e+06 4.27e+07 3.78e+08 1.63e+08 1.20e+07 ...
## $ size_total : num 1.07e+08 1.45e+09 5.70e+09 8.12e+09 2.97e+08 ...
Data holds a lot of information, but I am interested in market shares. I prefer calculating market share on average fund sizes in given period.
# Let's assign again to variable ex1
ex1 <- egm %>%
group_by(pension_fund_company) %>% arrange(pension_fund_company) %>%
# I will add a new column for average fund size
summarise(mean_of_fundsize=mean(fund_size_participants, na.rm = TRUE)) %>%
# Then I will add another column for market shares
mutate(mshare = percent(mean_of_fundsize / sum(mean_of_fundsize))) %>%
# Let's arrange them largest first
arrange(desc(mean_of_fundsize)) %>%
# Top 5 companies will give us most of the market
top_n(n = 5, wt = mean_of_fundsize)
# Since it is summarized to a few rows we can see whole
print(ex1, width = Inf)
## # A tibble: 5 x 3
## pension_fund_company mean_of_fundsize mshare
## <chr> <dbl> <chr>
## 1 Avivasa Emeklilik ve Hayat 12255577721. 18.1%
## 2 Anadolu Hayat Emeklilik 11576493704. 17.1%
## 3 Garanti Emeklilik ve Hayat 9554762898. 14.1%
## 4 Allianz Yasam ve Emeklilik 8600737835. 12.7%
## 5 Vakif Emeklilik ve Hayat 5062596313. 7.5%