library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(glmnet)
## Warning: package 'glmnet' was built under R version 3.4.3
## Loading required package: Matrix
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.4.3
## Loaded glmnet 2.0-13
library(reshape2)

setwd("C:\\Users\\serba\\Desktop\\R\\Machine Learning")
yr_data <-
  read.csv("responses.csv",sep=",")

yr_hi_data <- yr_data %>% select(History:Pets)

yr_hi_data[is.na(yr_hi_data) ] <- 0

print(head(yr_hi_data))
##   History Psychology Politics Mathematics Physics Internet PC
## 1       1          5        1           3       3        5  3
## 2       1          3        4           5       2        4  4
## 3       1          2        1           5       2        4  2
## 4       4          4        5           4       1        3  1
## 5       3          2        3           2       2        2  2
## 6       5          3        4           2       3        4  4
##   Economy.Management Biology Chemistry Reading Geography Foreign.languages
## 1                  5       3         3       3         3                 5
## 2                  5       1         1       4         4                 5
## 3                  4       1         1       5         2                 5
## 4                  2       3         3       5         4                 4
## 5                  2       3         3       5         2                 3
## 6                  1       4         4       3         3                 4
##   Medicine Law Cars Art.exhibitions Religion Countryside..outdoors Dancing
## 1        3   1    1               1        1                     5       3
## 2        1   2    2               2        1                     1       1
## 3        2   3    1               5        5                     5       5
## 4        2   5    1               5        4                     1       1
## 5        3   2    3               1        4                     4       1
## 6        4   3    5               2        2                     5       1
##   Musical.instruments Writing Passive.sport Active.sport Gardening
## 1                   3       2             1            5         5
## 2                   1       1             1            1         1
## 3                   5       5             5            2         1
## 4                   1       3             1            1         1
## 5                   3       1             3            1         4
## 6                   5       1             5            4         2
##   Celebrities Shopping Science.and.technology Theatre Fun.with.friends
## 1           1        4                      4       2                5
## 2           2        3                      3       2                4
## 3           1        4                      2       5                5
## 4           2        4                      3       1                2
## 5           3        3                      3       2                4
## 6           1        2                      3       1                3
##   Adrenaline.sports Pets
## 1                 4    4
## 2                 2    5
## 3                 5    5
## 4                 1    1
## 5                 2    1
## 6                 3    2
knitr::opts_chunk$set(echo = TRUE)
yr_dist <- 1 - cor(yr_hi_data)
#Apply MDS
yr_mds_hi <- cmdscale(yr_dist,k=2)
#Provide column names
colnames(yr_mds_hi) <- c("x","y")
print(yr_mds_hi)
##                                  x            y
## History                 0.03789596 -0.016562470
## Psychology              0.23435049 -0.004552863
## Politics               -0.16947155  0.072604087
## Mathematics            -0.28384388 -0.318581493
## Physics                -0.27223786 -0.525909828
## Internet               -0.43513000  0.156962024
## PC                     -0.56521102 -0.149434742
## Economy.Management     -0.33966110  0.332828930
## Biology                 0.28191489 -0.317498563
## Chemistry               0.16963485 -0.435678043
## Reading                 0.53053608  0.013890328
## Geography              -0.07527901  0.022073123
## Foreign.languages       0.16800586  0.231896972
## Medicine                0.24936268 -0.299340267
## Law                    -0.09403297  0.213582749
## Cars                   -0.58602027  0.027548543
## Art.exhibitions         0.35749643  0.002257128
## Religion                0.20804632 -0.186595701
## Countryside..outdoors   0.13504690 -0.080259540
## Dancing                 0.25484850  0.169634549
## Musical.instruments     0.19065277 -0.163990828
## Writing                 0.31718194 -0.041071669
## Passive.sport          -0.28742931  0.105842668
## Active.sport           -0.23915728  0.042946030
## Gardening               0.19862305 -0.052769685
## Celebrities             0.04519467  0.500470119
## Shopping                0.16219660  0.510814974
## Science.and.technology -0.33046299 -0.312778419
## Theatre                 0.42885572  0.066184156
## Fun.with.friends       -0.10195405  0.278068093
## Adrenaline.sports      -0.28396276  0.041468464
## Pets                    0.09401033  0.115951174
knitr::opts_chunk$set(echo = TRUE)
#Plot
ggplot(data.frame(yr_mds_hi),aes(x=x,y=y)) + geom_text(label=rownames(yr_mds_hi),angle=45,size=3)

K-Means

genre_cluster<-kmeans(yr_mds_hi,centers=8)
##Get the clusters
mds_clusters<-data.frame(genre=names(genre_cluster$cluster),cluster_mds=genre_cluster$cluster) %>% arrange(cluster_mds,genre)

mds_clusters
##                     genre cluster_mds
## 1   Countryside..outdoors           1
## 2                 Dancing           1
## 3       Foreign.languages           1
## 4               Gardening           1
## 5                 History           1
## 6                    Pets           1
## 7              Psychology           1
## 8             Mathematics           2
## 9                 Physics           2
## 10 Science.and.technology           2
## 11                Biology           3
## 12              Chemistry           3
## 13               Medicine           3
## 14    Musical.instruments           3
## 15               Religion           3
## 16        Art.exhibitions           4
## 17                Reading           4
## 18                Theatre           4
## 19                Writing           4
## 20                   Cars           5
## 21                     PC           5
## 22            Celebrities           6
## 23               Shopping           6
## 24     Economy.Management           7
## 25               Internet           7
## 26           Active.sport           8
## 27      Adrenaline.sports           8
## 28       Fun.with.friends           8
## 29              Geography           8
## 30                    Law           8
## 31          Passive.sport           8
## 32               Politics           8
ggplot(data.frame(yr_mds_hi) %>% 
         mutate(clusters=as.factor(genre_cluster$cluster),genres=rownames(yr_mds_hi)),aes(x=x,y=y)) + 
  geom_text(aes(label=genres,color=clusters),angle=45,size=3) + 
  geom_point(data=as.data.frame(genre_cluster$centers),aes(x=x,y=y)
)

ESOPH

"alcgp"
## [1] "alcgp"
boxplot(esoph$ncases ~ esoph$alcgp)

"tobgp"
## [1] "tobgp"
boxplot(esoph$ncases ~ esoph$tobgp)

"agegp"
## [1] "agegp"
boxplot(esoph$ncases ~ esoph$agegp)

model_esoph <- glm(cbind(ncases, ncontrols) ~ agegp + unclass(tobgp)  + unclass(alcgp),
              data = esoph, family = binomial())
summary(model_esoph)
## 
## Call:
## glm(formula = cbind(ncases, ncontrols) ~ agegp + unclass(tobgp) + 
##     unclass(alcgp), family = binomial(), data = esoph)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7628  -0.6426  -0.2709   0.3043   2.0421  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -4.01097    0.31224 -12.846  < 2e-16 ***
## agegp.L         2.96113    0.65092   4.549 5.39e-06 ***
## agegp.Q        -1.33735    0.58918  -2.270  0.02322 *  
## agegp.C         0.15292    0.44792   0.341  0.73281    
## agegp^4         0.06668    0.30776   0.217  0.82848    
## agegp^5        -0.20288    0.19523  -1.039  0.29872    
## unclass(tobgp)  0.26162    0.08198   3.191  0.00142 ** 
## unclass(alcgp)  0.65308    0.08452   7.727 1.10e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 227.241  on 87  degrees of freedom
## Residual deviance:  59.277  on 80  degrees of freedom
## AIC: 222.76
## 
## Number of Fisher Scoring iterations: 6