# Show the current working directory; the relative path passed to load()
# further down is resolved against it.
getwd()
## [1] "C:/Users/Lenovo/Desktop/BDA 503 Data Analytics Essentials"
# Load tidyverse package
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
# Restore the saved objects from the .RData file; the osym_data_2017 data
# set used below comes from it.
load(file = "osym_data_2017.RData")
# Take a quick look at the columns, their types and the first few values.
osym_data_2017 %>% glimpse()
## Observations: 11,465
## Variables: 14
## $ program_id        <chr> "100110266", "100110487", "100110724", "1001...
## $ university_name   <chr> "ABANT Ä°ZZET BAYSAL ÃœNÄ°VERSÄ°TESÄ°", "ABANT Ä°Z...
## $ city              <chr> "BOLU", "BOLU", "BOLU", "BOLU", "BOLU", "BOL...
## $ faculty_name      <chr> "Bolu Sağlık Yüksekokulu", "Bolu Turizm İşle...
## $ program_name      <chr> "HemÅŸirelik", "Gastronomi ve Mutfak Sanatlar...
## $ exam_type         <chr> "YGS_2", "YGS_4", "YGS_6", "YGS_6", "MF_3", ...
## $ general_quota     <chr> "150", "60", "60", "60", "80", "1", "40", "6...
## $ general_placement <chr> "150", "60", "62", "26", "80", "1", "9", "62...
## $ min_score         <dbl> 328.8790, 346.4491, 225.7170, 199.2710, 446....
## $ max_score         <dbl> 376.3817, 388.3141, 290.2683, 234.9510, 451....
## $ val_quota         <dbl> 4, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 2, 2, 3, 2,...
## $ val_placement     <dbl> 4, 2, 0, 0, 2, 0, 0, 0, 2, 1, 2, 2, 2, 3, 1,...
## $ val_min_score     <dbl> 312.8462, 293.6994, 180.0000, 180.0000, 437....
## $ val_max_score     <dbl> 328.0626, 328.7560, 180.0000, 180.0000, 442....
# Per-column summary of the raw data.
osym_data_2017 %>% summary()
##   program_id        university_name        city          
##  Length:11465       Length:11465       Length:11465      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##  faculty_name       program_name        exam_type        
##  Length:11465       Length:11465       Length:11465      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##  general_quota      general_placement    min_score       max_score    
##  Length:11465       Length:11465       Min.   :180.0   Min.   :180.0  
##  Class :character   Class :character   1st Qu.:235.7   1st Qu.:280.0  
##  Mode  :character   Mode  :character   Median :281.0   Median :341.0  
##                                        Mean   :294.8   Mean   :338.0  
##                                        3rd Qu.:351.7   3rd Qu.:397.9  
##                                        Max.   :543.7   Max.   :575.2  
##    val_quota       val_placement     val_min_score   val_max_score  
##  Min.   :  0.000   Min.   : 0.0000   Min.   :180.0   Min.   :180.0  
##  1st Qu.:  0.000   1st Qu.: 0.0000   1st Qu.:180.0   1st Qu.:180.0  
##  Median :  0.000   Median : 0.0000   Median :180.0   Median :180.0  
##  Mean   :  1.135   Mean   : 0.4283   Mean   :207.6   Mean   :209.7  
##  3rd Qu.:  2.000   3rd Qu.: 0.0000   3rd Qu.:180.0   3rd Qu.:180.0  
##  Max.   :225.000   Max.   :16.0000   Max.   :528.6   Max.   :529.2

The data has 14 variables and 11,465 observations. Let’s use the summary function to see some simple statistics:

# Simple descriptive statistics for every column of the raw data.
osym_data_2017 %>% summary()
##   program_id        university_name        city          
##  Length:11465       Length:11465       Length:11465      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##  faculty_name       program_name        exam_type        
##  Length:11465       Length:11465       Length:11465      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##  general_quota      general_placement    min_score       max_score    
##  Length:11465       Length:11465       Min.   :180.0   Min.   :180.0  
##  Class :character   Class :character   1st Qu.:235.7   1st Qu.:280.0  
##  Mode  :character   Mode  :character   Median :281.0   Median :341.0  
##                                        Mean   :294.8   Mean   :338.0  
##                                        3rd Qu.:351.7   3rd Qu.:397.9  
##                                        Max.   :543.7   Max.   :575.2  
##    val_quota       val_placement     val_min_score   val_max_score  
##  Min.   :  0.000   Min.   : 0.0000   Min.   :180.0   Min.   :180.0  
##  1st Qu.:  0.000   1st Qu.: 0.0000   1st Qu.:180.0   1st Qu.:180.0  
##  Median :  0.000   Median : 0.0000   Median :180.0   Median :180.0  
##  Mean   :  1.135   Mean   : 0.4283   Mean   :207.6   Mean   :209.7  
##  3rd Qu.:  2.000   3rd Qu.: 0.0000   3rd Qu.:180.0   3rd Qu.:180.0  
##  Max.   :225.000   Max.   :16.0000   Max.   :528.6   Max.   :529.2

Here we see that val_placement is heavily skewed; we will keep this in mind. We know that MEF University managers are not really interested in universities abroad, so we filter on the program IDs: we do not want to see programs whose IDs start with 3 or 4.

So let’s create mydata1 and look at its summary:

# Keep only the programs whose id has 1 or 2 as its leading digit.
# Integer-dividing the numeric id by 1e8 isolates the first digit of a
# nine-digit id such as "100110266".
mydata1 <- filter(
  osym_data_2017,
  (as.numeric(program_id) %/% 1e8) %in% c(1, 2)
)

# Summary statistics of the filtered data.
mydata1 %>% summary()
##   program_id        university_name        city          
##  Length:10186       Length:10186       Length:10186      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##  faculty_name       program_name        exam_type        
##  Length:10186       Length:10186       Length:10186      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##  general_quota      general_placement    min_score       max_score    
##  Length:10186       Length:10186       Min.   :180.0   Min.   :180.0  
##  Class :character   Class :character   1st Qu.:242.5   1st Qu.:290.9  
##  Mode  :character   Mode  :character   Median :288.2   Median :349.6  
##                                        Mean   :301.2   Mean   :347.3  
##                                        3rd Qu.:357.7   3rd Qu.:403.4  
##                                        Max.   :543.7   Max.   :575.2  
##    val_quota       val_placement     val_min_score   val_max_score  
##  Min.   :  0.000   Min.   : 0.0000   Min.   :180.0   Min.   :180.0  
##  1st Qu.:  0.000   1st Qu.: 0.0000   1st Qu.:180.0   1st Qu.:180.0  
##  Median :  1.000   Median : 0.0000   Median :180.0   Median :180.0  
##  Mean   :  1.277   Mean   : 0.4821   Mean   :211.1   Mean   :213.4  
##  3rd Qu.:  2.000   3rd Qu.: 0.0000   3rd Qu.:180.0   3rd Qu.:180.0  
##  Max.   :225.000   Max.   :16.0000   Max.   :528.6   Max.   :529.2

That’s great: the number of rows dropped from 11,465 to 10,186, so the programs with IDs starting with 3 and 4 are gone.

Let’s analyze min_score and max_score with this data.

# Histogram of min_score in bins of width 50.
# ggplot() + geom_histogram() is used instead of qplot(), which is
# deprecated in current ggplot2 releases; the resulting plot is the same.
ggplot(mydata1, aes(x = min_score)) +
  geom_histogram(binwidth = 50)

# Histogram of max_score with the same bin width, for comparison.
ggplot(mydata1, aes(x = max_score)) +
  geom_histogram(binwidth = 50)

Here we see that the distribution of max_score looks roughly normal, whereas min_score is somewhat more positively skewed.

Now we want to check how the general placement and the valedictorian placement vary by city. We use a scatter plot for each.

# Scatter plot of general placement for each city.
# general_placement is stored as character, so it is converted to numeric
# to get a continuous y axis.
# ggplot() + geom_point() replaces the deprecated qplot().
ggplot(mydata1, aes(x = city, y = as.numeric(general_placement))) +
  geom_point()

# Scatter plot of valedictorian placement for each city.
# val_placement is already numeric (dbl), so no conversion is needed.
ggplot(mydata1, aes(x = city, y = val_placement)) +
  geom_point()

Next, we filter our data and create mydata with three columns: the university name, the mean of the minimum scores (mms), and a flag indicating whether the university is MEF University (ismef).

# Mean of min_score (mms) per university, computed on mydata1 (the programs
# whose id starts with 1 or 2), plus a flag marking MEF University.
#
# The flag matches on the ASCII prefix "MEF " instead of the full name.
# The previous literal "MEF ?N?VERS?TES?" had its Turkish letters replaced
# by "?", so it never equalled any university_name and ismef was FALSE for
# every row.
mydata <- mydata1 %>%
  group_by(university_name) %>%
  summarise(mms = mean(min_score)) %>%
  mutate(ismef = startsWith(university_name, "MEF "))

summary(mydata)
##  university_name         mms          ismef        
##  Length:172         Min.   :237.2   Mode :logical  
##  Class :character   1st Qu.:273.3   FALSE:172      
##  Mode  :character   Median :293.9                  
##                     Mean   :301.1                  
##                     3rd Qu.:319.8                  
##                     Max.   :457.0

We would also like to check the distribution of the mean of min_score:

# Histogram of the per-university mean of min_score (mms), bin width 30.
# ggplot() + geom_histogram() replaces the deprecated qplot().
ggplot(mydata, aes(x = mms)) +
  geom_histogram(binwidth = 30)

Finally, we want to show MEF University in blue and the other universities in red.

# Mean minimum score per university, coloured by the ismef flag.
# The colours are set explicitly (TRUE = blue, FALSE = red) so the plot
# matches the intended colour scheme instead of the default palette.
ggplot(mydata, aes(x = university_name, y = mms)) +
  geom_point(aes(color = ismef)) +
  scale_color_manual(values = c("TRUE" = "blue", "FALSE" = "red"))