Reading Data

Excel file provided in class, letā€™s start by reading it

# I am assigning excel data to a variable called egm
egm <- readxl::read_excel("D:/Data Analytics Esentials/week4/egm_example_data.xlsx")
# Let's have a look at data structure
str(egm)
## Classes 'tbl_df', 'tbl' and 'data.frame':    1368 obs. of  15 variables:
##  $ date                               : chr  "06.01.2017" "06.01.2017" "06.01.2017" "06.01.2017" ...
##  $ pension_fund_company               : chr  "Aegon Emeklilik ve Hayat" "Allianz Hayat ve Emeklilik" "Allianz Yasam ve Emeklilik" "Anadolu Hayat Emeklilik" ...
##  $ n_of_participants                  : num  42826 118867 726684 1119572 129389 ...
##  $ fund_size_participants             : num  1.39e+08 1.84e+09 7.39e+09 9.98e+09 3.38e+08 ...
##  $ gov_contribution                   : num  1.44e+07 2.31e+08 8.15e+08 1.48e+09 7.77e+07 ...
##  $ contribution                       : num  1.10e+08 1.47e+09 5.80e+09 8.25e+09 2.99e+08 ...
##  $ n_of_pensioners                    : chr  "269" "2300" "5640" "11467" ...
##  $ n_of_ind_contracts                 : num  34003 113222 627972 938794 39160 ...
##  $ n_of_group_ind_contracts           : chr  "14127" "18813" "148377" "323209" ...
##  $ n_of_employer_group_certificates   : chr  "884" "4488" "98772" "27232" ...
##  $ n_total                            : num  49014 136523 875121 1289235 135451 ...
##  $ size_of_ind_contracts              : num  8.71e+07 1.17e+09 4.00e+09 6.05e+09 1.28e+08 ...
##  $ size_of_group_ind_contracts        : num  1.77e+07 2.45e+08 1.32e+09 1.91e+09 1.57e+08 ...
##  $ size_of_employer_group_certificates: num  1.81e+06 4.27e+07 3.78e+08 1.63e+08 1.20e+07 ...
##  $ size_total                         : num  1.07e+08 1.45e+09 5.70e+09 8.12e+09 2.97e+08 ...

Getting Necessary Parts and Starting Analysis

Data holds a lot of information, but I am interested in market shares. I prefer calculating market share on average fund sizes in given period.

# Let's assign again to variable ex1
ex1 <- egm %>%
  group_by(pension_fund_company) %>% arrange(pension_fund_company) %>%
# I will add a new column for average fund size
  summarise(mean_of_fundsize=mean(fund_size_participants, na.rm = TRUE)) %>%
# Then I will add another column for market shares
  mutate(mshare = percent(mean_of_fundsize / sum(mean_of_fundsize))) %>%
# Let's arrange them largest first
  arrange(desc(mean_of_fundsize)) %>%
# Top 5 companies will give us most of the market
  top_n(n = 5, wt = mean_of_fundsize)
# Since it is summarized to a few rows we can see whole
print(ex1, width = Inf)
## # A tibble: 5 x 3
##   pension_fund_company       mean_of_fundsize mshare
##   <chr>                                 <dbl> <chr> 
## 1 Avivasa Emeklilik ve Hayat     12255577721. 18.1% 
## 2 Anadolu Hayat Emeklilik        11576493704. 17.1% 
## 3 Garanti Emeklilik ve Hayat      9554762898. 14.1% 
## 4 Allianz Yasam ve Emeklilik      8600737835. 12.7% 
## 5 Vakif Emeklilik ve Hayat        5062596313. 7.5%

Pie chart would make sense for market share researches

ggplot(data = ex1, aes(x=pension_fund_company, y=mshare, fill=pension_fund_company)) +
  geom_bar(stat="identity")+
  cf +
  coord_polar()+
  theme(legend.position = "right", axis.text.x = element_text(angle = 0))+
  geom_text(aes(y = mshare , label = mshare))+
  labs(title = "Market shares of top 5 companies", x="", y="")