Exploring OSYM 2017 Data

First, we need to handle the encoding problem so that Turkish characters are displayed properly:

# Switch all locale categories to Turkish (code page 1254) so that Turkish
# characters are handled correctly. The locale name follows the Windows naming
# scheme.
Sys.setlocale(category = "LC_ALL", locale = "Turkish_Turkey.1254")
## [1] "LC_COLLATE=Turkish_Turkey.1254;LC_CTYPE=Turkish_Turkey.1254;LC_MONETARY=Turkish_Turkey.1254;LC_NUMERIC=C;LC_TIME=Turkish_Turkey.1254"

Here are the libraries that we are going to use:

library(data.table)
library(ggplot2) # visualization
library(scales) # visualization
library(dplyr) # data manipulation
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

We check our working directory and, after confirming it, we download and load the OSYM 2017 data as follows:

# Show the working directory: the data file below is saved relative to it.
getwd()
## [1] "C:/Users/duygu/Documents/GitHub/gpj-2yaka"
#load("C:/Users/merye/Desktop/osym_data_2017.RData")
# Keep the local file name in one place so the download target and the file
# passed to load() cannot differ (the previous "Rdata" vs "RData" spelling
# breaks on case-sensitive file systems).
osym_file <- "osym_data_2017.RData"
# mode = "wb" requests a binary transfer so the .RData file is not altered.
download.file("https://mef-bda503.github.io/files/osym_data_2017_v2.RData",
    osym_file, mode = "wb")
load(osym_file)

The dimensions of the reshaped data are:

# Keep a working copy under the short alias `d`, coercing the general quota
# and placement columns to numeric on the way.
d <- osym_data_2017 %>%
  mutate(
    general_quota = as.numeric(general_quota),
    general_placement = as.numeric(general_placement)
  )

# Number of rows and columns in the working copy.
dim(d)
## [1] 11465    14

We gave our dataset the alias ‘d’. Let’s see detailed information about the data:

# Column-wise overview of d: names, types and the first few values.
d %>% glimpse()
## Observations: 11,465
## Variables: 14
## $ program_id        <chr> "100110266", "100110487", "100110724", "1001...
## $ university_name   <chr> "ABANT İZZET BAYSAL ÜNİVERSİTESİ", "ABANT İZ...
## $ city              <chr> "BOLU", "BOLU", "BOLU", "BOLU", "BOLU", "BOL...
## $ faculty_name      <chr> "Bolu Sağlık Yüksekokulu", "Bolu Turizm İşle...
## $ program_name      <chr> "Hemşirelik", "Gastronomi ve Mutfak Sanatlar...
## $ exam_type         <chr> "YGS_2", "YGS_4", "YGS_6", "YGS_6", "MF_3", ...
## $ general_quota     <dbl> 150, 60, 60, 60, 80, 1, 40, 60, 60, 80, 60, ...
## $ general_placement <dbl> 150, 60, 62, 26, 80, 1, 9, 62, 60, 81, 60, 7...
## $ min_score         <dbl> 328.8790, 346.4491, 225.7170, 199.2710, 446....
## $ max_score         <dbl> 376.3817, 388.3141, 290.2683, 234.9510, 451....
## $ val_quota         <dbl> 4, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 2, 2, 3, 2,...
## $ val_placement     <dbl> 4, 2, 0, 0, 2, 0, 0, 0, 2, 1, 2, 2, 2, 3, 1,...
## $ val_min_score     <dbl> 312.8462, 293.6994, 180.0000, 180.0000, 437....
## $ val_max_score     <dbl> 328.0626, 328.7560, 180.0000, 180.0000, 442....

Let’s see the average minimum and maximum scores of each university:

# Mean of min_score and max_score for each university.
d %>%
  group_by(university_name) %>%
  summarise(
    avgmin = mean(min_score),
    avgmax = mean(max_score)
  )
## # A tibble: 200 x 3
##                              university_name   avgmin   avgmax
##                                        <chr>    <dbl>    <dbl>
##  1           ABANT İZZET BAYSAL ÜNİVERSİTESİ 295.8501 339.7075
##  2                 ABDULLAH GÜL ÜNİVERSİTESİ 412.3217 467.5316
##  3 ACIBADEM MEHMET ALİ AYDINLAR ÜNİVERSİTESİ 364.1779 404.4084
##  4     ADANA BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ 280.9509 324.5178
##  5                     ADIYAMAN ÜNİVERSİTESİ 297.6436 360.0265
##  6               ADNAN MENDERES ÜNİVERSİTESİ 279.2663 327.5020
##  7               AFYON KOCATEPE ÜNİVERSİTESİ 276.4040 324.1268
##  8           AĞRI İBRAHİM ÇEÇEN ÜNİVERSİTESİ 295.9362 357.8026
##  9                    AHİ EVRAN ÜNİVERSİTESİ 277.5119 322.9361
## 10                      AKDENİZ ÜNİVERSİTESİ 304.0331 364.3833
## # ... with 190 more rows

We exclude foreign universities and state universities by using filter, and store the result under a new name:

# Drop every program whose id starts with 3, 4 or 1 and keep the rest as dtr,
# then print the result.
dtr <- filter(d, !(substr(program_id, 1, 1) %in% c("3", "4", "1")))
print(dtr)
## # A tibble: 4,497 x 14
##    program_id                           university_name     city
##         <chr>                                     <chr>    <chr>
##  1  200110283 ACIBADEM MEHMET ALİ AYDINLAR ÜNİVERSİTESİ İSTANBUL
##  2  200110274 ACIBADEM MEHMET ALİ AYDINLAR ÜNİVERSİTESİ İSTANBUL
##  3  200110265 ACIBADEM MEHMET ALİ AYDINLAR ÜNİVERSİTESİ İSTANBUL
##  4  200110229 ACIBADEM MEHMET ALİ AYDINLAR ÜNİVERSİTESİ İSTANBUL
##  5  200110247 ACIBADEM MEHMET ALİ AYDINLAR ÜNİVERSİTESİ İSTANBUL
##  6  200110211 ACIBADEM MEHMET ALİ AYDINLAR ÜNİVERSİTESİ İSTANBUL
##  7  200110123 ACIBADEM MEHMET ALİ AYDINLAR ÜNİVERSİTESİ İSTANBUL
##  8  200110114 ACIBADEM MEHMET ALİ AYDINLAR ÜNİVERSİTESİ İSTANBUL
##  9  200110308 ACIBADEM MEHMET ALİ AYDINLAR ÜNİVERSİTESİ İSTANBUL
## 10  200110317 ACIBADEM MEHMET ALİ AYDINLAR ÜNİVERSİTESİ İSTANBUL
## # ... with 4,487 more rows, and 11 more variables: faculty_name <chr>,
## #   program_name <chr>, exam_type <chr>, general_quota <dbl>,
## #   general_placement <dbl>, min_score <dbl>, max_score <dbl>,
## #   val_quota <dbl>, val_placement <dbl>, val_min_score <dbl>,
## #   val_max_score <dbl>
# Check the dimensions of the filtered table (dtr); dim(d) would only repeat
# the size of the unfiltered data.
dim(dtr)
## [1] 4497   14

Later we grouped private universities in İstanbul by their program names and ordered them by max scores in descending order to compare their success rankings.

# İstanbul programs with their scores and quotas, highest max_score first.
# NOTE: arrange() ignores the grouping unless .by_group = TRUE is given, so the
# rows are sorted across all programs; group_by() only tags the result.
dtr %>%
  filter(city == "İSTANBUL") %>%
  select(
    university_name,
    program_name,
    max_score,
    general_quota,
    general_placement
  ) %>%
  group_by(program_name) %>%
  arrange(desc(max_score))
## # A tibble: 3,270 x 5
## # Groups:   program_name [969]
##                              university_name
##                                        <chr>
##  1                          KOÇ ÜNİVERSİTESİ
##  2             İSTANBUL MEDİPOL ÜNİVERSİTESİ
##  3                          KOÇ ÜNİVERSİTESİ
##  4                          KOÇ ÜNİVERSİTESİ
##  5                          KOÇ ÜNİVERSİTESİ
##  6 ACIBADEM MEHMET ALİ AYDINLAR ÜNİVERSİTESİ
##  7                          KOÇ ÜNİVERSİTESİ
##  8                          KOÇ ÜNİVERSİTESİ
##  9                      SABANCI ÜNİVERSİTESİ
## 10                          KOÇ ÜNİVERSİTESİ
## # ... with 3,260 more rows, and 4 more variables: program_name <chr>,
## #   max_score <dbl>, general_quota <dbl>, general_placement <dbl>

To get a better look at the data, we calculated the quota fill rate of these universities and ordered them by their fill rates:

# Per-university totals for İstanbul: number of programs (sayi), total quota,
# total placement and the resulting fill rate in percent, sorted from the
# highest fill rate down. The result stays grouped by university_name.
dpr <- dtr %>%
  filter(city == "İSTANBUL") %>%
  group_by(university_name) %>%
  summarise(
    sayi = n(),
    totalquota = sum(as.numeric(general_quota)),
    totalplacement = sum(general_placement),
    # Both totals are single values per group, so no further sum() is needed.
    fillrate = totalplacement * 100 / totalquota
  ) %>%
  group_by(university_name) %>%
  arrange(desc(fillrate))

# Show up to the first 100 rows.
dpr %>%
  head(100) %>%
  print()
## # A tibble: 41 x 5
## # Groups:   university_name [41]
##                              university_name  sayi totalquota
##                                        <chr> <int>      <dbl>
##  1                   İBN HALDUN ÜNİVERSİTESİ    11        150
##  2          İSTANBUL AYVANSARAY ÜNİVERSİTESİ     4        100
##  3                      SABANCI ÜNİVERSİTESİ    16        760
##  4                          KOÇ ÜNİVERSİTESİ    63       1054
##  5                      ÖZYEĞİN ÜNİVERSİTESİ    69       1342
##  6          BEZM - İ ÂLEM VAKIF ÜNİVERSİTESİ    27        495
##  7     İSTANBUL SABAHATTİN ZAİM ÜNİVERSİTESİ    90       1300
##  8             İSTANBUL MEDİPOL ÜNİVERSİTESİ   155       4495
##  9 ACIBADEM MEHMET ALİ AYDINLAR ÜNİVERSİTESİ    23        540
## 10                    NİŞANTAŞI ÜNİVERSİTESİ    94       1434
## # ... with 31 more rows, and 2 more variables: totalplacement <dbl>,
## #   fillrate <dbl>

We find that MEF University has a fill rate of over 90 percent:

# Pick out the summary row for MEF University.
filter(dpr, university_name == "MEF ÜNİVERSİTESİ")
## # A tibble: 1 x 5
## # Groups:   university_name [1]
##    university_name  sayi totalquota totalplacement fillrate
##              <chr> <int>      <dbl>          <dbl>    <dbl>
## 1 MEF ÜNİVERSİTESİ    44        817            768 94.00245

Let’s visualize our private universities data with a bar chart:

#dord=order(dpr$fillrate)
#dord2=head(dord,12)
# Shorten the names to 10 characters so the axis labels stay readable, then
# keep the first 12 rows (dpr is expected to be sorted by descending fillrate).
dpr$university_name <- substr(dpr$university_name, 1, 10)
dpr <- head(dpr, 12)

# Bar chart of fill rate per university, bars ordered from highest to lowest.
# Columns are referenced by bare name inside aes(): writing `dpr$...` there
# bypasses the `data` argument and leaks "dpr$fillrate" into the legend title.
ggplot(
  data = dpr,
  aes(
    x = reorder(university_name, desc(fillrate)),
    y = fillrate,
    fill = fillrate
  )
) +
  geom_bar(stat = "identity", width = 0.95) +
  # Rotate the x labels so the university names do not overlap.
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(
    title = "Top 12 Private Universities in İstanbul By Quota Fill Rate",
    x = "Universities",
    y = "Quota\nFill Rate",
    fill = "Fill Rate"
  ) +
  # Make sure the y axis reaches 100 percent.
  expand_limits(y = 100)