# Show the current working directory; the relative path passed to load()
# further down is resolved against it.
getwd()
## [1] "C:/Users/Lenovo/Desktop/BDA 503 Data Analytics Essentials"
# Attach the tidyverse; per the startup messages below this brings in
# ggplot2, tibble, tidyr, readr, purrr and dplyr.
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
# Read the saved .RData file from the working directory; the rest of the
# script expects it to define the osym_data_2017 data frame.
load(file = "osym_data_2017.RData")
# Print a compact, one-line-per-column preview of the data.
osym_data_2017 %>% glimpse()
## Observations: 11,465
## Variables: 14
## $ program_id <chr> "100110266", "100110487", "100110724", "1001...
## $ university_name <chr> "ABANT İZZET BAYSAL ÜNİVERSİTESİ", "ABANT İZ...
## $ city <chr> "BOLU", "BOLU", "BOLU", "BOLU", "BOLU", "BOL...
## $ faculty_name <chr> "Bolu Sağlık Yüksekokulu", "Bolu Turizm İşle...
## $ program_name <chr> "Hemşirelik", "Gastronomi ve Mutfak Sanatlar...
## $ exam_type <chr> "YGS_2", "YGS_4", "YGS_6", "YGS_6", "MF_3", ...
## $ general_quota <chr> "150", "60", "60", "60", "80", "1", "40", "6...
## $ general_placement <chr> "150", "60", "62", "26", "80", "1", "9", "62...
## $ min_score <dbl> 328.8790, 346.4491, 225.7170, 199.2710, 446....
## $ max_score <dbl> 376.3817, 388.3141, 290.2683, 234.9510, 451....
## $ val_quota <dbl> 4, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 2, 2, 3, 2,...
## $ val_placement <dbl> 4, 2, 0, 0, 2, 0, 0, 0, 2, 1, 2, 2, 2, 3, 1,...
## $ val_min_score <dbl> 312.8462, 293.6994, 180.0000, 180.0000, 437....
## $ val_max_score <dbl> 328.0626, 328.7560, 180.0000, 180.0000, 442....
# Per-column summary of the full data set.
osym_data_2017 %>% summary()
## program_id university_name city
## Length:11465 Length:11465 Length:11465
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## faculty_name program_name exam_type
## Length:11465 Length:11465 Length:11465
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## general_quota general_placement min_score max_score
## Length:11465 Length:11465 Min. :180.0 Min. :180.0
## Class :character Class :character 1st Qu.:235.7 1st Qu.:280.0
## Mode :character Mode :character Median :281.0 Median :341.0
## Mean :294.8 Mean :338.0
## 3rd Qu.:351.7 3rd Qu.:397.9
## Max. :543.7 Max. :575.2
## val_quota val_placement val_min_score val_max_score
## Min. : 0.000 Min. : 0.0000 Min. :180.0 Min. :180.0
## 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.:180.0 1st Qu.:180.0
## Median : 0.000 Median : 0.0000 Median :180.0 Median :180.0
## Mean : 1.135 Mean : 0.4283 Mean :207.6 Mean :209.7
## 3rd Qu.: 2.000 3rd Qu.: 0.0000 3rd Qu.:180.0 3rd Qu.:180.0
## Max. :225.000 Max. :16.0000 Max. :528.6 Max. :529.2
The data has 14 variables and about 11K observations (11,465 rows). Let’s use the summary function to see some simple statistics:
# Simple per-column statistics for every variable in the raw data.
osym_data_2017 %>% summary()
## program_id university_name city
## Length:11465 Length:11465 Length:11465
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## faculty_name program_name exam_type
## Length:11465 Length:11465 Length:11465
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## general_quota general_placement min_score max_score
## Length:11465 Length:11465 Min. :180.0 Min. :180.0
## Class :character Class :character 1st Qu.:235.7 1st Qu.:280.0
## Mode :character Mode :character Median :281.0 Median :341.0
## Mean :294.8 Mean :338.0
## 3rd Qu.:351.7 3rd Qu.:397.9
## Max. :543.7 Max. :575.2
## val_quota val_placement val_min_score val_max_score
## Min. : 0.000 Min. : 0.0000 Min. :180.0 Min. :180.0
## 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.:180.0 1st Qu.:180.0
## Median : 0.000 Median : 0.0000 Median :180.0 Median :180.0
## Mean : 1.135 Mean : 0.4283 Mean :207.6 Mean :209.7
## 3rd Qu.: 2.000 3rd Qu.: 0.0000 3rd Qu.:180.0 3rd Qu.:180.0
## Max. :225.000 Max. :16.0000 Max. :528.6 Max. :529.2
Here we see that val_placement is very skewed; we will keep this in mind. We know that the MEF University managers are not really interested in universities abroad, so we filter on the program ids: we do not want to see the programs whose ids start with 3 or 4.
So let’s create mydata1 and look at its summary.
# Keep only the rows whose program_id, read as a number, divided by 1e8 and
# floored, equals 1 or 2. For nine-digit ids (as in the glimpse output) that
# value is the leading digit.
mydata1 <- filter(
  osym_data_2017,
  floor(as.numeric(program_id) / 1e8) %in% c(1, 2)
)
# Per-column summary of the filtered data.
summary(mydata1)
## program_id university_name city
## Length:10186 Length:10186 Length:10186
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## faculty_name program_name exam_type
## Length:10186 Length:10186 Length:10186
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## general_quota general_placement min_score max_score
## Length:10186 Length:10186 Min. :180.0 Min. :180.0
## Class :character Class :character 1st Qu.:242.5 1st Qu.:290.9
## Mode :character Mode :character Median :288.2 Median :349.6
## Mean :301.2 Mean :347.3
## 3rd Qu.:357.7 3rd Qu.:403.4
## Max. :543.7 Max. :575.2
## val_quota val_placement val_min_score val_max_score
## Min. : 0.000 Min. : 0.0000 Min. :180.0 Min. :180.0
## 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.:180.0 1st Qu.:180.0
## Median : 1.000 Median : 0.0000 Median :180.0 Median :180.0
## Mean : 1.277 Mean : 0.4821 Mean :211.1 Mean :213.4
## 3rd Qu.: 2.000 3rd Qu.: 0.0000 3rd Qu.:180.0 3rd Qu.:180.0
## Max. :225.000 Max. :16.0000 Max. :528.6 Max. :529.2
That’s great: the filtered data has 10,186 rows instead of 11,465, so the programs whose ids start with 3 or 4 are gone.
Let’s analyze min_score and max_score with this data.
# Histograms of min_score and max_score in bins of width 50.
# Built with ggplot() + geom_histogram() rather than qplot(): qplot() is
# deprecated as of ggplot2 3.4.0, and this matches the ggplot() call used for
# the final plot in this file. The histograms drawn are the same.
ggplot(mydata1, aes(x = min_score)) +
  geom_histogram(binwidth = 50)
ggplot(mydata1, aes(x = max_score)) +
  geom_histogram(binwidth = 50)
Here we see that the distribution of max_score looks roughly normal, whereas min_score is somewhat more positively skewed.
Now we want to check how the general placement and the valedictorian placement vary by city. We use a scatter plot for each.
# Scatter plots of placement counts against city.
# Built with ggplot() + geom_point() rather than the deprecated qplot()
# (ggplot2 >= 3.4.0).
# general_placement is stored as character (see the glimpse output), so it is
# converted to numeric for the y axis.
ggplot(mydata1, aes(x = city, y = as.numeric(general_placement))) +
  geom_point()
# val_placement is already a double column, so no conversion is needed; the
# y-axis label therefore reads "val_placement".
ggplot(mydata1, aes(x = city, y = val_placement)) +
  geom_point()
Next, we filter the data and create mydata with three columns: the university name, the mean of its minimum scores (mms), and a flag indicating whether the university is MEF University or not (ismef).
# Build one row per university: mms is the mean of min_score over the
# university's programs (restricted to program ids whose leading digit is
# 1 or 2), and ismef flags MEF University.
mydata <- osym_data_2017 %>%
  filter(floor(as.numeric(program_id) / 100000000) %in% c(1, 2)) %>%
  group_by(university_name) %>%
  summarise(mms = mean(min_score)) %>%
  # Match on the ASCII prefix only. The Turkish letters in the full name are
  # encoding-dependent, and an exact comparison against a literal whose
  # non-ASCII characters were lost ("?") never matches, leaving ismef FALSE
  # for every row.
  # NOTE(review): assumes no other university name begins with "MEF " - confirm.
  mutate(ismef = startsWith(university_name, "MEF "))
# Per-column summary; ismef should now report at least one TRUE.
summary(mydata)
## university_name mms ismef
## Length:172 Min. :237.2 Mode :logical
## Class :character 1st Qu.:273.3 FALSE:172
## Mode :character Median :293.9
## Mean :301.1
## 3rd Qu.:319.8
## Max. :457.0
We would also like to check the distribution of the mean of min_score (mms):
# Histogram of the per-university mean of min_score (mms), bin width 30.
# ggplot() + geom_histogram() replaces qplot(), which is deprecated as of
# ggplot2 3.4.0; the histogram drawn is the same.
ggplot(mydata, aes(x = mms)) +
  geom_histogram(binwidth = 30)
Finally, we want to show MEF University in blue and the other universities in red.
# Plot each university's mms, coloured by the ismef flag.
# The colours are set explicitly so the plot matches the stated intent
# (flagged university in blue, all others in red); without a manual scale
# ggplot2 falls back to its default discrete palette.
ggplot(mydata, aes(x = university_name, y = mms)) +
  geom_point(aes(color = ismef)) +
  scale_color_manual(values = c("TRUE" = "blue", "FALSE" = "red"))