Packages
- dplyr (data manipulation)
- ggplot2 (visualization)
- tidyr, readr, purrr, tibble
Oct 10, 2017
dplyr
Five fundamental words and you can do almost anything.
select
/ rename
filter
/ distinct
arrange
mutate
/ transmute
group_by
+ summarize
and the pipe operator %>%
to bind them all.
We are going to use secimler
package as a data source and apply dplyr
commands. (See instructions to install.)
library(tidyverse) library(secimler) secim150607g #Our data set
## # A tibble: 176,831 x 30 ## il bolge ilce cevre cevre_turu sandik kayitli_secmen ## <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> ## 1 Adana 1 İli Genel Toplam 4426 1477328 ## 2 Adana 1 İli İl-İlçe Genel Toplam 4426 1477328 ## 3 Adana 1 Seyhan Genel Toplam 1473 529658 ## 4 Adana 1 Seyhan İl-İlçe Genel Toplam 1473 529658 ## 5 Adana 1 Seyhan Ahmet Remzi Yüreğir Mahalle 3001 341 ## 6 Adana 1 Seyhan Ahmet Remzi Yüreğir Mahalle 3002 339 ## 7 Adana 1 Seyhan Ahmet Remzi Yüreğir Mahalle 3003 341 ## 8 Adana 1 Seyhan Akkapı Mahalle 2001 366 ## 9 Adana 1 Seyhan Akkapı Mahalle 2002 365 ## 10 Adana 1 Seyhan Akkapı Mahalle 2003 366 ## # ... with 176,821 more rows, and 23 more variables: oy_kullanan <dbl>, ## # gecerli_oy <dbl>, ak_parti <dbl>, chp <dbl>, mhp <dbl>, hdp <dbl>, ## # dyp <dbl>, anadolu_partisi <dbl>, hak_par <dbl>, kp <dbl>, ## # millet_partisi <dbl>, hap <dbl>, mep <dbl>, turk_parti <dbl>, ## # hkp <dbl>, ldp <dbl>, saadet_partisi <dbl>, dsp <dbl>, ## # yurt_parti <dbl>, dp <dbl>, vatan_partisi <dbl>, btp <dbl>, ## # bagimsiz <dbl>
glimpse
of the data
glimpse
is similar to str
.
glimpse(secim150607g)
## Observations: 176,831 ## Variables: 30 ## $ il <chr> "Adana", "Adana", "Adana", "Adana", "Adana", "... ## $ bolge <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1... ## $ ilce <chr> "İli", "İli", "Seyhan", "Seyhan", "Seyhan", "S... ## $ cevre <chr> "Genel", "İl-İlçe Genel", "Genel", "İl-İlçe Ge... ## $ cevre_turu <chr> "Toplam", "Toplam", "Toplam", "Toplam", "Mahal... ## $ sandik <dbl> 4426, 4426, 1473, 1473, 3001, 3002, 3003, 2001... ## $ kayitli_secmen <dbl> 1477328, 1477328, 529658, 529658, 341, 339, 34... ## $ oy_kullanan <dbl> 1254393, 1254393, 449743, 449743, 285, 278, 29... ## $ gecerli_oy <dbl> 1229817, 1229817, 441384, 441384, 279, 270, 28... ## $ ak_parti <dbl> 368055, 368055, 113966, 113966, 111, 98, 88, 1... ## $ chp <dbl> 354389, 354389, 138646, 138646, 49, 59, 67, 28... ## $ mhp <dbl> 287799, 287799, 79714, 79714, 81, 66, 81, 4, 4... ## $ hdp <dbl> 177730, 177730, 94887, 94887, 29, 35, 49, 9, 5... ## $ dyp <dbl> 1100, 1100, 435, 435, 0, 0, 0, 1, 0, 0, 0, 0, ... ## $ anadolu_partisi <dbl> 590, 590, 227, 227, 0, 0, 0, 0, 0, 0, 0, 0, 0,... ## $ hak_par <dbl> 1265, 1265, 379, 379, 0, 0, 0, 0, 0, 0, 0, 0, ... ## $ kp <dbl> 310, 310, 120, 120, 0, 0, 0, 0, 1, 0, 0, 2, 0,... ## $ millet_partisi <dbl> 509, 509, 133, 133, 0, 0, 0, 0, 0, 0, 0, 0, 0,... ## $ hap <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0... ## $ mep <dbl> 644, 644, 170, 170, 1, 0, 0, 0, 0, 0, 0, 0, 0,... ## $ turk_parti <dbl> 2199, 2199, 545, 545, 1, 0, 0, 0, 0, 1, 0, 0, ... ## $ hkp <dbl> 1432, 1432, 408, 408, 0, 0, 0, 0, 0, 0, 1, 0, ... ## $ ldp <dbl> 967, 967, 288, 288, 0, 1, 0, 0, 1, 0, 0, 0, 0,... ## $ saadet_partisi <dbl> 18917, 18917, 6605, 6605, 0, 3, 1, 1, 2, 1, 5,... ## $ dsp <dbl> 1897, 1897, 643, 643, 0, 0, 0, 0, 0, 0, 0, 1, ... ## $ yurt_parti <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0... ## $ dp <dbl> 1534, 1534, 383, 383, 0, 0, 0, 0, 0, 1, 0, 0, ... ## $ vatan_partisi <dbl> 3731, 3731, 1284, 1284, 2, 0, 0, 2, 0, 0, 0, 2... 
## $ btp <dbl> 1735, 1735, 607, 607, 0, 0, 0, 0, 0, 1, 0, 0, ... ## $ bagimsiz <dbl> 5014, 5014, 1944, 1944, 5, 8, 2, 0, 1, 0, 0, 0...
select
/ rename
It is basically column select and renaming.
secim150607g %>% select(il, cevre, kayitli_secmen)
## # A tibble: 176,831 x 3 ## il cevre kayitli_secmen ## <chr> <chr> <dbl> ## 1 Adana Genel 1477328 ## 2 Adana İl-İlçe Genel 1477328 ## 3 Adana Genel 529658 ## 4 Adana İl-İlçe Genel 529658 ## 5 Adana Ahmet Remzi Yüreğir 341 ## 6 Adana Ahmet Remzi Yüreğir 339 ## 7 Adana Ahmet Remzi Yüreğir 341 ## 8 Adana Akkapı 366 ## 9 Adana Akkapı 365 ## 10 Adana Akkapı 366 ## # ... with 176,821 more rows
select
/ rename
secim150607g %>% rename(city = il)
## # A tibble: 176,831 x 30 ## city bolge ilce cevre cevre_turu sandik kayitli_secmen ## <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> ## 1 Adana 1 İli Genel Toplam 4426 1477328 ## 2 Adana 1 İli İl-İlçe Genel Toplam 4426 1477328 ## 3 Adana 1 Seyhan Genel Toplam 1473 529658 ## 4 Adana 1 Seyhan İl-İlçe Genel Toplam 1473 529658 ## 5 Adana 1 Seyhan Ahmet Remzi Yüreğir Mahalle 3001 341 ## 6 Adana 1 Seyhan Ahmet Remzi Yüreğir Mahalle 3002 339 ## 7 Adana 1 Seyhan Ahmet Remzi Yüreğir Mahalle 3003 341 ## 8 Adana 1 Seyhan Akkapı Mahalle 2001 366 ## 9 Adana 1 Seyhan Akkapı Mahalle 2002 365 ## 10 Adana 1 Seyhan Akkapı Mahalle 2003 366 ## # ... with 176,821 more rows, and 23 more variables: oy_kullanan <dbl>, ## # gecerli_oy <dbl>, ak_parti <dbl>, chp <dbl>, mhp <dbl>, hdp <dbl>, ## # dyp <dbl>, anadolu_partisi <dbl>, hak_par <dbl>, kp <dbl>, ## # millet_partisi <dbl>, hap <dbl>, mep <dbl>, turk_parti <dbl>, ## # hkp <dbl>, ldp <dbl>, saadet_partisi <dbl>, dsp <dbl>, ## # yurt_parti <dbl>, dp <dbl>, vatan_partisi <dbl>, btp <dbl>, ## # bagimsiz <dbl>
select
/ rename
You can rename with select also.
secim150607g %>% select(city = il, district = cevre, n_voters = kayitli_secmen)
## # A tibble: 176,831 x 3 ## city district n_voters ## <chr> <chr> <dbl> ## 1 Adana Genel 1477328 ## 2 Adana İl-İlçe Genel 1477328 ## 3 Adana Genel 529658 ## 4 Adana İl-İlçe Genel 529658 ## 5 Adana Ahmet Remzi Yüreğir 341 ## 6 Adana Ahmet Remzi Yüreğir 339 ## 7 Adana Ahmet Remzi Yüreğir 341 ## 8 Adana Akkapı 366 ## 9 Adana Akkapı 365 ## 10 Adana Akkapı 366 ## # ... with 176,821 more rows
filter
/ distinct
filter
returns the rows given the criteria.
secim150607g %>% filter(il == "İstanbul")
## # A tibble: 27,977 x 30 ## il bolge ilce cevre cevre_turu sandik kayitli_secmen ## <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> ## 1 İstanbul 1 İli Genel Toplam 10055 3682297 ## 2 İstanbul 1 İli İl-İlçe Genel Toplam 10055 3682297 ## 3 İstanbul 1 Adalar Genel Toplam 37 12087 ## 4 İstanbul 1 Adalar İl-İlçe Genel Toplam 37 12087 ## 5 İstanbul 1 Adalar Burgazadası Mahalle 1001 309 ## 6 İstanbul 1 Adalar Burgazadası Mahalle 1002 312 ## 7 İstanbul 1 Adalar Burgazadası Mahalle 1003 310 ## 8 İstanbul 1 Adalar Burgazadası Mahalle 1004 310 ## 9 İstanbul 1 Adalar Büyükada-Maden Mahalle 1005 277 ## 10 İstanbul 1 Adalar Büyükada-Maden Mahalle 1006 275 ## # ... with 27,967 more rows, and 23 more variables: oy_kullanan <dbl>, ## # gecerli_oy <dbl>, ak_parti <dbl>, chp <dbl>, mhp <dbl>, hdp <dbl>, ## # dyp <dbl>, anadolu_partisi <dbl>, hak_par <dbl>, kp <dbl>, ## # millet_partisi <dbl>, hap <dbl>, mep <dbl>, turk_parti <dbl>, ## # hkp <dbl>, ldp <dbl>, saadet_partisi <dbl>, dsp <dbl>, ## # yurt_parti <dbl>, dp <dbl>, vatan_partisi <dbl>, btp <dbl>, ## # bagimsiz <dbl>
filter
/ distinct
secim150607g %>% filter(il == "İzmir" & kayitli_secmen > 99 & kayitli_secmen <= 300)
## # A tibble: 925 x 30 ## il bolge ilce cevre cevre_turu sandik kayitli_secmen ## <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> ## 1 İzmir 1 Çeşme Ardıç Mahalle 1026 191 ## 2 İzmir 1 Çeşme Ardıç Mahalle 1027 192 ## 3 İzmir 1 Çeşme Boyalık Mahalle 1028 199 ## 4 İzmir 1 Çeşme Boyalık Mahalle 1029 199 ## 5 İzmir 1 Çeşme Celal Bayar Mahalle 1030 257 ## 6 İzmir 1 Çeşme Celal Bayar Mahalle 1031 255 ## 7 İzmir 1 Çeşme Çakabey Mahalle 1034 138 ## 8 İzmir 1 Çeşme Fahrettinpaşa Mahalle 1044 279 ## 9 İzmir 1 Çeşme Ildır Mahalle 1048 275 ## 10 İzmir 1 Çeşme Ildır Mahalle 1049 276 ## # ... with 915 more rows, and 23 more variables: oy_kullanan <dbl>, ## # gecerli_oy <dbl>, ak_parti <dbl>, chp <dbl>, mhp <dbl>, hdp <dbl>, ## # dyp <dbl>, anadolu_partisi <dbl>, hak_par <dbl>, kp <dbl>, ## # millet_partisi <dbl>, hap <dbl>, mep <dbl>, turk_parti <dbl>, ## # hkp <dbl>, ldp <dbl>, saadet_partisi <dbl>, dsp <dbl>, ## # yurt_parti <dbl>, dp <dbl>, vatan_partisi <dbl>, btp <dbl>, ## # bagimsiz <dbl>
filter
/ distinct
distinct
removes the duplicate values.
secim150607g %>% distinct(il, .keep_all = TRUE)
## # A tibble: 81 x 30 ## il bolge ilce cevre cevre_turu sandik kayitli_secmen ## <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> ## 1 Adana 1 İli Genel Toplam 4426 1477328 ## 2 Adıyaman 1 İli Genel Toplam 1386 380745 ## 3 Afyonkarahisar 1 İli Genel Toplam 1766 493765 ## 4 Ağrı 1 İli Genel Toplam 1153 291539 ## 5 Amasya 1 İli Genel Toplam 916 238506 ## 6 Ankara 1 İli Genel Toplam 5936 2074389 ## 7 Antalya 1 İli Genel Toplam 4691 1568153 ## 8 Artvin 1 İli Genel Toplam 621 129961 ## 9 Aydın 1 İli Genel Toplam 2452 772649 ## 10 Balıkesir 1 İli Genel Toplam 3100 906421 ## # ... with 71 more rows, and 23 more variables: oy_kullanan <dbl>, ## # gecerli_oy <dbl>, ak_parti <dbl>, chp <dbl>, mhp <dbl>, hdp <dbl>, ## # dyp <dbl>, anadolu_partisi <dbl>, hak_par <dbl>, kp <dbl>, ## # millet_partisi <dbl>, hap <dbl>, mep <dbl>, turk_parti <dbl>, ## # hkp <dbl>, ldp <dbl>, saadet_partisi <dbl>, dsp <dbl>, ## # yurt_parti <dbl>, dp <dbl>, vatan_partisi <dbl>, btp <dbl>, ## # bagimsiz <dbl>
arrange
arrange
is to order by columns.
secim150607g %>% distinct(il, .keep_all = TRUE) %>% arrange(desc(kayitli_secmen))
## # A tibble: 81 x 30 ## il bolge ilce cevre cevre_turu sandik kayitli_secmen ## <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> ## 1 İstanbul 1 İli Genel Toplam 10055 3682297 ## 2 Ankara 1 İli Genel Toplam 5936 2074389 ## 3 Bursa 1 İli Genel Toplam 5968 2028417 ## 4 Antalya 1 İli Genel Toplam 4691 1568153 ## 5 İzmir 1 İli Genel Toplam 4304 1513998 ## 6 Adana 1 İli Genel Toplam 4426 1477328 ## 7 Konya 1 İli Genel Toplam 4523 1430306 ## 8 Kocaeli 1 İli Genel Toplam 3577 1221771 ## 9 Mersin 1 İli Genel Toplam 3698 1210552 ## 10 Gaziantep 1 İli Genel Toplam 3462 1130109 ## # ... with 71 more rows, and 23 more variables: oy_kullanan <dbl>, ## # gecerli_oy <dbl>, ak_parti <dbl>, chp <dbl>, mhp <dbl>, hdp <dbl>, ## # dyp <dbl>, anadolu_partisi <dbl>, hak_par <dbl>, kp <dbl>, ## # millet_partisi <dbl>, hap <dbl>, mep <dbl>, turk_parti <dbl>, ## # hkp <dbl>, ldp <dbl>, saadet_partisi <dbl>, dsp <dbl>, ## # yurt_parti <dbl>, dp <dbl>, vatan_partisi <dbl>, btp <dbl>, ## # bagimsiz <dbl>
mutate
/ transmute
mutate
is used to do operations on columns.
secim150607g %>% distinct(il, .keep_all = TRUE) %>% select(il, sandik, kayitli_secmen, gecerli_oy) %>% mutate(katilim = round(gecerli_oy/kayitli_secmen, 2))
## # A tibble: 81 x 5 ## il sandik kayitli_secmen gecerli_oy katilim ## <chr> <dbl> <dbl> <dbl> <dbl> ## 1 Adana 4426 1477328 1229817 0.83 ## 2 Adıyaman 1386 380745 306562 0.81 ## 3 Afyonkarahisar 1766 493765 424884 0.86 ## 4 Ağrı 1153 291539 237172 0.81 ## 5 Amasya 916 238506 209549 0.88 ## 6 Ankara 5936 2074389 1797588 0.87 ## 7 Antalya 4691 1568153 1306778 0.83 ## 8 Artvin 621 129961 108191 0.83 ## 9 Aydın 2452 772649 658731 0.85 ## 10 Balıkesir 3100 906421 786183 0.87 ## # ... with 71 more rows
mutate
/ transmute
transmute
is mutate
+ select
.
secim150607g %>% distinct(il, .keep_all = TRUE) %>% transmute(il, sandik, katilim = round(gecerli_oy/kayitli_secmen, 2))
## # A tibble: 81 x 3 ## il sandik katilim ## <chr> <dbl> <dbl> ## 1 Adana 4426 0.83 ## 2 Adıyaman 1386 0.81 ## 3 Afyonkarahisar 1766 0.86 ## 4 Ağrı 1153 0.81 ## 5 Amasya 916 0.88 ## 6 Ankara 5936 0.87 ## 7 Antalya 4691 0.83 ## 8 Artvin 621 0.83 ## 9 Aydın 2452 0.85 ## 10 Balıkesir 3100 0.87 ## # ... with 71 more rows
group_by
/ summarise
Analogous to pivot table.
secim150607g %>% filter(cevre_turu != "Toplam") %>% group_by(cevre_turu) %>% summarise(count = n(), gecerli_oy = sum(gecerli_oy), kayitli_secmen = sum(kayitli_secmen), katilim = round(sum(gecerli_oy)/sum(kayitli_secmen), 2))
## # A tibble: 4 x 5 ## cevre_turu count gecerli_oy kayitli_secmen katilim ## <chr> <int> <dbl> <dbl> <dbl> ## 1 Belde 2954 666967 799165 0.83 ## 2 Cezaevi 343 30610 0 Inf ## 3 Köy 21631 3140794 3692727 0.85 ## 4 Mahalle 149292 41283402 49249946 0.84
dplyr
extras
slice
, sample_n
, sample_frac
, mutate_at
, summarise_at
.
left_join
, semi_join
, full_join
, anti_join
.
spread
, gather
.
ddply
, adply
, mdply
and ldply
.
See course webpage for more examples and implementations. Definitely get the dplyr cheat sheet.
ggplot2
ggplot2
?gg
stands for grammar of graphics.
# Instead of %>% we use + for ggplot2.
ggplot(data = data_set, aes(x = x_axis_col, y = y_axis_col)) + geom_point()
geom_point
scatter plot.
geom_bar
bar charts and pie charts.
geom_line
line charts.
geom_boxplot
box plots.
geom_tile
heatmaps.
geom_histogram
histograms.
geom_dotplot
, geom_polygon
, geom_density
, geom_contour
, geom_ribbon
, geom_area
, …
the_df <- secim150607g %>% filter(il == "İzmir" & !(cevre_turu %in% c("Cezaevi", "Toplam"))) %>% transmute(county = ilce, district = cevre, district_type = cevre_turu, participation = gecerli_oy/kayitli_secmen, n_voters = kayitli_secmen, n_valid_votes = gecerli_oy)
the_df
## # A tibble: 8,963 x 6 ## county district district_type participation n_voters n_valid_votes ## <chr> <chr> <chr> <dbl> <dbl> <dbl> ## 1 Çeşme Alaçatı Mahalle 0.8442623 366 309 ## 2 Çeşme Alaçatı Mahalle 0.8879781 366 325 ## 3 Çeşme Alaçatı Mahalle 0.8060109 366 295 ## 4 Çeşme Alaçatı Mahalle 0.9068493 365 331 ## 5 Çeşme Alaçatı Mahalle 0.7788162 321 250 ## 6 Çeşme Alaçatı Mahalle 0.8317757 321 267 ## 7 Çeşme Alaçatı Mahalle 0.8535826 321 274 ## 8 Çeşme Alaçatı Mahalle 0.7962382 319 254 ## 9 Çeşme Alaçatı Mahalle 0.7994100 339 271 ## 10 Çeşme Alaçatı Mahalle 0.8230088 339 279 ## # ... with 8,953 more rows
geom_point
ggplot(data = the_df, aes(x = n_voters, y = participation)) + geom_point()
geom_bar
ggplot(data = the_df %>% group_by(county) %>% summarise(participation = weighted.mean(participation, n_voters), total_voters = sum(n_voters)), aes(x = county, y = total_voters, fill = participation)) + geom_bar(stat = "identity") + theme(axis.text.x = element_text(angle = 45))
geom_bar
pie chart
ggplot(data = the_df %>% group_by(county) %>% summarise(total_voters = sum(n_voters)), aes(x = "", y = total_voters, fill = county)) + geom_bar(stat = "identity", width = 1) + coord_polar("y", direction = -1) + theme_void()
ggplot2
extras
theme
adds and modifies style.
fill
, alpha
, color
, shape
parameters.
ggthemes
, ggnetwork
etc.
abline
, or fit splines with geom_smooth
.
plotly
package.
See course webpage for extra materials. Definitely get cheat sheet.
Note: ggplot2 is very easy for basic stuff. But as requirements get complex, it might need more attention.
There are some other amazing packages that you can use in your data processes. This is not an exhaustive list, but it is definitely a delightful one.
lubridate
is good for time data.
stringr
for string data.
readxl
and writexl
for xlsx files i/o operations.
rvest
for parsing from web sites.
haven
, readr
, DBI
, foreign
for different file types.
devtools
, roxygen2
for package making and documentation.
data.table
, sparklyr
for big data stuff.
shiny
for web pages.