First, we need to handle an encoding problem so that Turkish characters are displayed properly:
# Switch the session locale to Turkish (code page 1254) so that Turkish
# characters in the data are displayed correctly.
# NOTE(review): this locale name looks Windows-specific; confirm it resolves
# on other platforms before relying on it there.
Sys.setlocale(locale = "Turkish_Turkey.1254")
## [1] "LC_COLLATE=Turkish_Turkey.1254;LC_CTYPE=Turkish_Turkey.1254;LC_MONETARY=Turkish_Turkey.1254;LC_NUMERIC=C;LC_TIME=Turkish_Turkey.1254"
Here are the libraries that we are going to use:
library(data.table)
library(ggplot2) # visualization
library(scales) # visualization
library(dplyr) # data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
We check our working directory and, once it is confirmed, download and load the OSYM 2017 data as follows:
# Show the current working directory; the data file is downloaded to and
# loaded from it using a relative path.
getwd()
## [1] "C:/Users/duygu/Documents/GitHub/gpj-2yaka"
# Download the OSYM 2017 data set into the working directory and load it.
# One path variable is shared by the download and the load so the two file
# names cannot drift apart in case ("...Rdata" vs "...RData"), which breaks
# load() on case-sensitive file systems.
# load("C:/Users/merye/Desktop/osym_data_2017.RData")  # local copy, for reference
osym_file <- "osym_data_2017.RData"
download.file(
  "https://mef-bda503.github.io/files/osym_data_2017_v2.RData",
  destfile = osym_file,
  mode = "wb"  # .RData is a binary format; request a binary transfer
)
load(osym_file)
Dimensions for reshaped data are:
# Convert the quota and placement columns to numeric, keep the result as `d`,
# and report its dimensions.
d <- mutate(
  osym_data_2017,
  general_quota = as.numeric(general_quota),
  general_placement = as.numeric(general_placement)
)
dim(d)
## [1] 11465 14
Our reshaped dataset is stored under the name ‘d’; here is detailed information about it:
# Print a transposed overview of d: one line per column showing its type and
# first few values.
glimpse(d)
## Observations: 11,465
## Variables: 14
## $ program_id <chr> "100110266", "100110487", "100110724", "1001...
## $ university_name <chr> "ABANT İZZET BAYSAL ÜNİVERSİTESİ", "ABANT İZ...
## $ city <chr> "BOLU", "BOLU", "BOLU", "BOLU", "BOLU", "BOL...
## $ faculty_name <chr> "Bolu Sağlık Yüksekokulu", "Bolu Turizm İşle...
## $ program_name <chr> "Hemşirelik", "Gastronomi ve Mutfak Sanatlar...
## $ exam_type <chr> "YGS_2", "YGS_4", "YGS_6", "YGS_6", "MF_3", ...
## $ general_quota <dbl> 150, 60, 60, 60, 80, 1, 40, 60, 60, 80, 60, ...
## $ general_placement <dbl> 150, 60, 62, 26, 80, 1, 9, 62, 60, 81, 60, 7...
## $ min_score <dbl> 328.8790, 346.4491, 225.7170, 199.2710, 446....
## $ max_score <dbl> 376.3817, 388.3141, 290.2683, 234.9510, 451....
## $ val_quota <dbl> 4, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 2, 2, 3, 2,...
## $ val_placement <dbl> 4, 2, 0, 0, 2, 0, 0, 0, 2, 1, 2, 2, 2, 3, 1,...
## $ val_min_score <dbl> 312.8462, 293.6994, 180.0000, 180.0000, 437....
## $ val_max_score <dbl> 328.0626, 328.7560, 180.0000, 180.0000, 442....
Let’s see average values of minimum and maximum scores of the universities:
# Per-university averages of the minimum and maximum admission scores.
d %>%
  group_by(university_name) %>%
  summarise(
    avgmin = mean(min_score),
    avgmax = mean(max_score)
  )
## # A tibble: 200 x 3
## university_name avgmin avgmax
## <chr> <dbl> <dbl>
## 1 ABANT İZZET BAYSAL ÜNİVERSİTESİ 295.8501 339.7075
## 2 ABDULLAH GÜL ÜNİVERSİTESİ 412.3217 467.5316
## 3 ACIBADEM MEHMET ALİ AYDINLAR ÜNİVERSİTESİ 364.1779 404.4084
## 4 ADANA BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ 280.9509 324.5178
## 5 ADIYAMAN ÜNİVERSİTESİ 297.6436 360.0265
## 6 ADNAN MENDERES ÜNİVERSİTESİ 279.2663 327.5020
## 7 AFYON KOCATEPE ÜNİVERSİTESİ 276.4040 324.1268
## 8 AĞRI İBRAHİM ÇEÇEN ÜNİVERSİTESİ 295.9362 357.8026
## 9 AHİ EVRAN ÜNİVERSİTESİ 277.5119 322.9361
## 10 AKDENİZ ÜNİVERSİTESİ 304.0331 364.3833
## # ... with 190 more rows
We exclude universities abroad and state universities with a filter and store the result under a new name:
# Keep only the programs whose program_id does not start with 1, 3 or 4,
# store them as `dtr`, and print the subset.
dtr <- filter(
  d,
  !(substr(program_id, 1, 1) %in% c("3", "4", "1"))
)
print(dtr)
## # A tibble: 4,497 x 14
## program_id university_name city
## <chr> <chr> <chr>
## 1 200110283 ACIBADEM MEHMET ALİ AYDINLAR ÜNİVERSİTESİ İSTANBUL
## 2 200110274 ACIBADEM MEHMET ALİ AYDINLAR ÜNİVERSİTESİ İSTANBUL
## 3 200110265 ACIBADEM MEHMET ALİ AYDINLAR ÜNİVERSİTESİ İSTANBUL
## 4 200110229 ACIBADEM MEHMET ALİ AYDINLAR ÜNİVERSİTESİ İSTANBUL
## 5 200110247 ACIBADEM MEHMET ALİ AYDINLAR ÜNİVERSİTESİ İSTANBUL
## 6 200110211 ACIBADEM MEHMET ALİ AYDINLAR ÜNİVERSİTESİ İSTANBUL
## 7 200110123 ACIBADEM MEHMET ALİ AYDINLAR ÜNİVERSİTESİ İSTANBUL
## 8 200110114 ACIBADEM MEHMET ALİ AYDINLAR ÜNİVERSİTESİ İSTANBUL
## 9 200110308 ACIBADEM MEHMET ALİ AYDINLAR ÜNİVERSİTESİ İSTANBUL
## 10 200110317 ACIBADEM MEHMET ALİ AYDINLAR ÜNİVERSİTESİ İSTANBUL
## # ... with 4,487 more rows, and 11 more variables: faculty_name <chr>,
## # program_name <chr>, exam_type <chr>, general_quota <dbl>,
## # general_placement <dbl>, min_score <dbl>, max_score <dbl>,
## # val_quota <dbl>, val_placement <dbl>, val_min_score <dbl>,
## # val_max_score <dbl>
# Report the dimensions of the filtered table (dtr) rather than repeating
# those of the unfiltered data d.
dim(dtr)
## [1] 4497 14
Next, we grouped the private universities in İstanbul by program name and ordered them by maximum score in descending order to compare how they rank.
# İstanbul programs with their scores and quotas, highest max_score first.
# The result carries a program_name grouping, but arrange() sorts across the
# whole table unless .by_group = TRUE is given, so the order is a global one.
dtr %>%
  filter(city == "İSTANBUL") %>%
  select(
    university_name, program_name, max_score,
    general_quota, general_placement
  ) %>%
  group_by(program_name) %>%
  arrange(desc(max_score))
## # A tibble: 3,270 x 5
## # Groups: program_name [969]
## university_name
## <chr>
## 1 KOÇ ÜNİVERSİTESİ
## 2 İSTANBUL MEDİPOL ÜNİVERSİTESİ
## 3 KOÇ ÜNİVERSİTESİ
## 4 KOÇ ÜNİVERSİTESİ
## 5 KOÇ ÜNİVERSİTESİ
## 6 ACIBADEM MEHMET ALİ AYDINLAR ÜNİVERSİTESİ
## 7 KOÇ ÜNİVERSİTESİ
## 8 KOÇ ÜNİVERSİTESİ
## 9 SABANCI ÜNİVERSİTESİ
## 10 KOÇ ÜNİVERSİTESİ
## # ... with 3,260 more rows, and 4 more variables: program_name <chr>,
## # max_score <dbl>, general_quota <dbl>, general_placement <dbl>
To get a better look at the data, we calculated the quota fill rate of these universities and ordered them by their fill rates:
# Quota fill rate per university for the İstanbul rows of dtr, highest first.
#   sayi            number of programs
#   totalquota      summed general quota
#   totalplacement  summed general placements
#   fillrate        placements as a percentage of quota
# totalplacement and totalquota are already one value per university inside
# summarise(), so the ratio needs no further sum(). The result is left
# ungrouped: one-row groups add nothing and get in the way when
# university_name is modified later.
dpr <- dtr %>%
  filter(city == "İSTANBUL") %>%
  group_by(university_name) %>%
  summarise(
    sayi = n(),
    totalquota = sum(general_quota),
    totalplacement = sum(general_placement),
    fillrate = totalplacement * 100 / totalquota
  ) %>%
  ungroup() %>%
  arrange(desc(fillrate))
print(head(dpr, 100))
## # A tibble: 41 x 5
## # Groups: university_name [41]
## university_name sayi totalquota
## <chr> <int> <dbl>
## 1 İBN HALDUN ÜNİVERSİTESİ 11 150
## 2 İSTANBUL AYVANSARAY ÜNİVERSİTESİ 4 100
## 3 SABANCI ÜNİVERSİTESİ 16 760
## 4 KOÇ ÜNİVERSİTESİ 63 1054
## 5 ÖZYEĞİN ÜNİVERSİTESİ 69 1342
## 6 BEZM - İ ÂLEM VAKIF ÜNİVERSİTESİ 27 495
## 7 İSTANBUL SABAHATTİN ZAİM ÜNİVERSİTESİ 90 1300
## 8 İSTANBUL MEDİPOL ÜNİVERSİTESİ 155 4495
## 9 ACIBADEM MEHMET ALİ AYDINLAR ÜNİVERSİTESİ 23 540
## 10 NİŞANTAŞI ÜNİVERSİTESİ 94 1434
## # ... with 31 more rows, and 2 more variables: totalplacement <dbl>,
## # fillrate <dbl>
We find that MEF University has a fill rate of over 90 percent:
# Look up the fill-rate row for MEF University.
filter(dpr, university_name == "MEF ÜNİVERSİTESİ")
## # A tibble: 1 x 5
## # Groups: university_name [1]
## university_name sayi totalquota totalplacement fillrate
## <chr> <int> <dbl> <dbl> <dbl>
## 1 MEF ÜNİVERSİTESİ 44 817 768 94.00245
Let’s visualize our private universities data with a bar chart:
# Bar chart of the 12 universities with the highest quota fill rate.
# dpr is sorted by fillrate in descending order when it is built, so its
# first 12 rows are the top 12. Any grouping is dropped first so that
# overwriting university_name cannot leave stale group metadata behind.
dpr <- head(ungroup(dpr), 12)
# Shorten names to 10 characters for the axis labels. make.unique() keeps the
# labels distinct when two names share their first 10 characters; otherwise
# their bars would be merged into a single stacked bar.
dpr$university_name <- make.unique(substr(dpr$university_name, 1, 10))
# Columns are referenced by bare name inside aes() (not dpr$...), so ggplot2
# evaluates them against `data` and the legend gets a clean title.
ggplot(
  data = dpr,
  aes(
    x = reorder(university_name, desc(fillrate)),
    y = fillrate,
    fill = fillrate
  )
) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  geom_bar(stat = "identity", width = 0.95) +
  labs(title = "Top 12 Private Universities in İstanbul By Quota Fill Rate") +
  labs(x = "Universities", y = "Quota\nFill Rate") +
  expand_limits(y = 100)