#Firstly I upload Tidyverse and call esoph data-set
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
esoph
##    agegp     alcgp    tobgp ncases ncontrols
## 1  25-34 0-39g/day 0-9g/day      0        40
## 2  25-34 0-39g/day    10-19      0        10
## 3  25-34 0-39g/day    20-29      0         6
## 4  25-34 0-39g/day      30+      0         5
## 5  25-34     40-79 0-9g/day      0        27
## 6  25-34     40-79    10-19      0         7
## 7  25-34     40-79    20-29      0         4
## 8  25-34     40-79      30+      0         7
## 9  25-34    80-119 0-9g/day      0         2
## 10 25-34    80-119    10-19      0         1
## 11 25-34    80-119      30+      0         2
## 12 25-34      120+ 0-9g/day      0         1
## 13 25-34      120+    10-19      1         1
## 14 25-34      120+    20-29      0         1
## 15 25-34      120+      30+      0         2
## 16 35-44 0-39g/day 0-9g/day      0        60
## 17 35-44 0-39g/day    10-19      1        14
## 18 35-44 0-39g/day    20-29      0         7
## 19 35-44 0-39g/day      30+      0         8
## 20 35-44     40-79 0-9g/day      0        35
## 21 35-44     40-79    10-19      3        23
## 22 35-44     40-79    20-29      1        14
## 23 35-44     40-79      30+      0         8
## 24 35-44    80-119 0-9g/day      0        11
## 25 35-44    80-119    10-19      0         6
## 26 35-44    80-119    20-29      0         2
## 27 35-44    80-119      30+      0         1
## 28 35-44      120+ 0-9g/day      2         3
## 29 35-44      120+    10-19      0         3
## 30 35-44      120+    20-29      2         4
## 31 45-54 0-39g/day 0-9g/day      1        46
## 32 45-54 0-39g/day    10-19      0        18
## 33 45-54 0-39g/day    20-29      0        10
## 34 45-54 0-39g/day      30+      0         4
## 35 45-54     40-79 0-9g/day      6        38
## 36 45-54     40-79    10-19      4        21
## 37 45-54     40-79    20-29      5        15
## 38 45-54     40-79      30+      5         7
## 39 45-54    80-119 0-9g/day      3        16
## 40 45-54    80-119    10-19      6        14
## 41 45-54    80-119    20-29      1         5
## 42 45-54    80-119      30+      2         4
## 43 45-54      120+ 0-9g/day      4         4
## 44 45-54      120+    10-19      3         4
## 45 45-54      120+    20-29      2         3
## 46 45-54      120+      30+      4         4
## 47 55-64 0-39g/day 0-9g/day      2        49
## 48 55-64 0-39g/day    10-19      3        22
## 49 55-64 0-39g/day    20-29      3        12
## 50 55-64 0-39g/day      30+      4         6
## 51 55-64     40-79 0-9g/day      9        40
## 52 55-64     40-79    10-19      6        21
## 53 55-64     40-79    20-29      4        17
## 54 55-64     40-79      30+      3         6
## 55 55-64    80-119 0-9g/day      9        18
## 56 55-64    80-119    10-19      8        15
## 57 55-64    80-119    20-29      3         6
## 58 55-64    80-119      30+      4         4
## 59 55-64      120+ 0-9g/day      5        10
## 60 55-64      120+    10-19      6         7
## 61 55-64      120+    20-29      2         3
## 62 55-64      120+      30+      5         6
## 63 65-74 0-39g/day 0-9g/day      5        48
## 64 65-74 0-39g/day    10-19      4        14
## 65 65-74 0-39g/day    20-29      2         7
## 66 65-74 0-39g/day      30+      0         2
## 67 65-74     40-79 0-9g/day     17        34
## 68 65-74     40-79    10-19      3        10
## 69 65-74     40-79    20-29      5         9
## 70 65-74    80-119 0-9g/day      6        13
## 71 65-74    80-119    10-19      4        12
## 72 65-74    80-119    20-29      2         3
## 73 65-74    80-119      30+      1         1
## 74 65-74      120+ 0-9g/day      3         4
## 75 65-74      120+    10-19      1         2
## 76 65-74      120+    20-29      1         1
## 77 65-74      120+      30+      1         1
## 78   75+ 0-39g/day 0-9g/day      1        18
## 79   75+ 0-39g/day    10-19      2         6
## 80   75+ 0-39g/day      30+      1         3
## 81   75+     40-79 0-9g/day      2         5
## 82   75+     40-79    10-19      1         3
## 83   75+     40-79    20-29      0         3
## 84   75+     40-79      30+      1         1
## 85   75+    80-119 0-9g/day      1         1
## 86   75+    80-119    10-19      1         1
## 87   75+      120+ 0-9g/day      2         2
## 88   75+      120+    10-19      1         1
#Taking a look at the summary of the data
summary(esoph)
##    agegp          alcgp         tobgp        ncases         ncontrols    
##  25-34:15   0-39g/day:23   0-9g/day:24   Min.   : 0.000   Min.   : 1.00  
##  35-44:15   40-79    :23   10-19   :24   1st Qu.: 0.000   1st Qu.: 3.00  
##  45-54:16   80-119   :21   20-29   :20   Median : 1.000   Median : 6.00  
##  55-64:16   120+     :21   30+     :20   Mean   : 2.273   Mean   :11.08  
##  65-74:15                                3rd Qu.: 4.000   3rd Qu.:14.00  
##  75+  :11                                Max.   :17.000   Max.   :60.00
#Seems like I have 5 variables, and the cancer cases are shown by "ncases"
#Lets check the number of cases by alcohol consumption and show them according to age groups
qplot(x=ncases,y=alcgp, data=esoph)+
facet_wrap(~agegp)

#I would like to check mean of the cases by alcohol consumption.
alc <- group_by(esoph, alcgp)
alc
## # A tibble: 88 x 5
## # Groups:   alcgp [4]
##    agegp     alcgp    tobgp ncases ncontrols
##    <ord>     <ord>    <ord>  <dbl>     <dbl>
##  1 25-34 0-39g/day 0-9g/day      0        40
##  2 25-34 0-39g/day    10-19      0        10
##  3 25-34 0-39g/day    20-29      0         6
##  4 25-34 0-39g/day      30+      0         5
##  5 25-34     40-79 0-9g/day      0        27
##  6 25-34     40-79    10-19      0         7
##  7 25-34     40-79    20-29      0         4
##  8 25-34     40-79      30+      0         7
##  9 25-34    80-119 0-9g/day      0         2
## 10 25-34    80-119    10-19      0         1
## # ... with 78 more rows
a <- summarise(alc,ncases_mean = mean(ncases))
a
## # A tibble: 4 x 2
##       alcgp ncases_mean
##       <ord>       <dbl>
## 1 0-39g/day    1.260870
## 2     40-79    3.260870
## 3    80-119    2.428571
## 4      120+    2.142857
#And I check the median

b <- summarize(alc,ncases_median=median(ncases))

b
## # A tibble: 4 x 2
##       alcgp ncases_median
##       <ord>         <dbl>
## 1 0-39g/day             1
## 2     40-79             3
## 3    80-119             1
## 4      120+             2
# ggplot the ncases and alcgp

ggplot(aes(ncases, alcgp), data=esoph)+
  scale_x_continuous(lim = c(0,20))+
  geom_point(color="purple")

# ggplot the ncases and tobgp

ggplot(aes(x=ncases, y=tobgp), data=esoph)+
  scale_x_continuous(lim = c(0,50))+
  geom_point(color="red")

#Finally I check the tobacco usage by grouping alcohol usage
ggplot(aes(x=ncases, y=tobgp), data=esoph)+
  scale_x_continuous(lim = c(0,20))+
  geom_point(color="pink")+
  facet_wrap(~alcgp)