Our raw excel file is in our repository. We can automatically download that file and put it in a temporary file. Then we can read that excel document into R and remove the temp file.
# Create a temporary file
tmp=tempfile(fileext=".xlsx")
# Download file from repository to the temp file
download.file("https://github.com/MEF-BDA503/pj18-baysalu/blob/master/files/egm_example_data.xlsx?raw=true",destfile=tmp,mode='wb')
# Read that excel file using readxl package's read_excel function. You might need to adjust the parameters (skip, col_names) according to your raw file's format.
raw_data=readxl::read_excel(tmp)
# Remove the temp file
file.remove(tmp)
## [1] TRUE
# Let's see our raw data
head(raw_data)
## # A tibble: 6 x 15
## date pension_fund_co~ n_of_participan~ fund_size_parti~ gov_contribution
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 06.0~ Aegon Emeklilik~ 42826 139193651 14409322
## 2 06.0~ Allianz Hayat v~ 118867 1840701596 231156574
## 3 06.0~ Allianz Yasam v~ 726684 7387398235 815356276
## 4 06.0~ Anadolu Hayat E~ 1119572 9975066501 1484745501
## 5 06.0~ Asya Emeklilik ~ 129389 337754079 77689275
## 6 06.0~ Avivasa Emeklil~ 876894 10550032358 1355045933
## # ... with 10 more variables: contribution <dbl>, n_of_pensioners <chr>,
## # n_of_ind_contracts <dbl>, n_of_group_ind_contracts <chr>,
## # n_of_employer_group_certificates <chr>, n_total <dbl>,
## # size_of_ind_contracts <dbl>, size_of_group_ind_contracts <dbl>,
## # size_of_employer_group_certificates <dbl>, size_total <dbl>
tail(raw_data)
## # A tibble: 6 x 15
## date pension_fund_co~ n_of_participan~ fund_size_parti~ gov_contribution
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 30.0~ Halk Hayat ve E~ 511848 2845016752 527923905
## 2 30.0~ Katilim Emeklil~ 205994 907989162 182110867
## 3 30.0~ Metlife Emeklil~ 187483 1368676067 233517732
## 4 30.0~ NN Hayat ve Eme~ 265725 2607702561 382046370
## 5 30.0~ Vakif Emeklilik~ 502994 5375415236 757466988
## 6 30.0~ Ziraat Hayat ve~ 598703 3105479818 590220958
## # ... with 10 more variables: contribution <dbl>, n_of_pensioners <chr>,
## # n_of_ind_contracts <dbl>, n_of_group_ind_contracts <chr>,
## # n_of_employer_group_certificates <chr>, n_total <dbl>,
## # size_of_ind_contracts <dbl>, size_of_group_ind_contracts <dbl>,
## # size_of_employer_group_certificates <dbl>, size_total <dbl>
# Let's convert data frame column from character to numeric
raw_data$n_of_group_ind_contracts = as.numeric(as.character(raw_data$n_of_group_ind_contracts))
raw_data$n_of_pensioners = as.numeric(as.character(raw_data$n_of_pensioners))
raw_data$n_of_employer_group_certificates = as.numeric(as.character(raw_data$n_of_employer_group_certificates))
head(raw_data)
## # A tibble: 6 x 15
## date pension_fund_co~ n_of_participan~ fund_size_parti~ gov_contribution
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 06.0~ Aegon Emeklilik~ 42826 139193651 14409322
## 2 06.0~ Allianz Hayat v~ 118867 1840701596 231156574
## 3 06.0~ Allianz Yasam v~ 726684 7387398235 815356276
## 4 06.0~ Anadolu Hayat E~ 1119572 9975066501 1484745501
## 5 06.0~ Asya Emeklilik ~ 129389 337754079 77689275
## 6 06.0~ Avivasa Emeklil~ 876894 10550032358 1355045933
## # ... with 10 more variables: contribution <dbl>, n_of_pensioners <dbl>,
## # n_of_ind_contracts <dbl>, n_of_group_ind_contracts <dbl>,
## # n_of_employer_group_certificates <dbl>, n_total <dbl>,
## # size_of_ind_contracts <dbl>, size_of_group_ind_contracts <dbl>,
## # size_of_employer_group_certificates <dbl>, size_total <dbl>
raw_data2 = raw_data %>%
group_by(pension_fund_company) %>%
summarise(sum_n_total=sum(n_total)) %>%
arrange(desc(sum_n_total)) %>%
filter(sum_n_total > 45000000)
print(raw_data2)
## # A tibble: 5 x 2
## pension_fund_company sum_n_total
## <chr> <dbl>
## 1 Anadolu Hayat Emeklilik 99557810
## 2 Garanti Emeklilik ve Hayat 96132459
## 3 Avivasa Emeklilik ve Hayat 84707101
## 4 Allianz Yasam ve Emeklilik 70880813
## 5 Ziraat Hayat ve Emeklilik 48296866
raw_data2 %>%
ggplot(data = ., aes(x = pension_fund_company, y = sum_n_total,
fill = pension_fund_company)) + geom_bar(stat = "identity") + aes(x = reorder(pension_fund_company, -sum_n_total), y = sum_n_total) + labs(x = "", y = "", title = "Top 5 according to Total Pension Funds Number") + theme_bw() + theme(legend.position = "none", axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 0.5, size = 12))
raw_data3 = raw_data %>%
group_by(pension_fund_company) %>%
summarise(sum_size_total=sum(size_total)) %>%
arrange(desc(sum_size_total)) %>%
filter(sum_size_total > 165000000000)
print(raw_data3)
## # A tibble: 5 x 2
## pension_fund_company sum_size_total
## <chr> <dbl>
## 1 Avivasa Emeklilik ve Hayat 716465921455
## 2 Anadolu Hayat Emeklilik 680350979685
## 3 Garanti Emeklilik ve Hayat 558007446946
## 4 Allianz Yasam ve Emeklilik 481613261122
## 5 Ziraat Hayat ve Emeklilik 166173034923
raw_data3 %>%
ggplot(data = ., aes(x = pension_fund_company, y = sum_size_total,
fill = pension_fund_company)) + geom_bar(stat = "identity") + aes(x = reorder(pension_fund_company, -sum_size_total), y = sum_size_total) + labs(x = "", y = "", title = "Top 5 according to Total Pension Funds Size") + theme_bw() + theme(legend.position = "none", axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 0.5, size = 12))