library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(glmnet)
## Warning: package 'glmnet' was built under R version 3.4.3
## Loading required package: Matrix
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.4.3
## Loaded glmnet 2.0-13
library(reshape2)
setwd("C:\\Users\\serba\\Desktop\\R\\Machine Learning")
yr_data <-
read.csv("responses.csv",sep=",")
yr_hi_data <- yr_data %>% select(History:Pets)
yr_hi_data[is.na(yr_hi_data) ] <- 0
print(head(yr_hi_data))
## History Psychology Politics Mathematics Physics Internet PC
## 1 1 5 1 3 3 5 3
## 2 1 3 4 5 2 4 4
## 3 1 2 1 5 2 4 2
## 4 4 4 5 4 1 3 1
## 5 3 2 3 2 2 2 2
## 6 5 3 4 2 3 4 4
## Economy.Management Biology Chemistry Reading Geography Foreign.languages
## 1 5 3 3 3 3 5
## 2 5 1 1 4 4 5
## 3 4 1 1 5 2 5
## 4 2 3 3 5 4 4
## 5 2 3 3 5 2 3
## 6 1 4 4 3 3 4
## Medicine Law Cars Art.exhibitions Religion Countryside..outdoors Dancing
## 1 3 1 1 1 1 5 3
## 2 1 2 2 2 1 1 1
## 3 2 3 1 5 5 5 5
## 4 2 5 1 5 4 1 1
## 5 3 2 3 1 4 4 1
## 6 4 3 5 2 2 5 1
## Musical.instruments Writing Passive.sport Active.sport Gardening
## 1 3 2 1 5 5
## 2 1 1 1 1 1
## 3 5 5 5 2 1
## 4 1 3 1 1 1
## 5 3 1 3 1 4
## 6 5 1 5 4 2
## Celebrities Shopping Science.and.technology Theatre Fun.with.friends
## 1 1 4 4 2 5
## 2 2 3 3 2 4
## 3 1 4 2 5 5
## 4 2 4 3 1 2
## 5 3 3 3 2 4
## 6 1 2 3 1 3
## Adrenaline.sports Pets
## 1 4 4
## 2 2 5
## 3 5 5
## 4 1 1
## 5 2 1
## 6 3 2
knitr::opts_chunk$set(echo = TRUE)
yr_dist <- 1 - cor(yr_hi_data)
#Apply MDS
yr_mds_hi <- cmdscale(yr_dist,k=2)
#Provide column names
colnames(yr_mds_hi) <- c("x","y")
print(yr_mds_hi)
## x y
## History 0.03789596 -0.016562470
## Psychology 0.23435049 -0.004552863
## Politics -0.16947155 0.072604087
## Mathematics -0.28384388 -0.318581493
## Physics -0.27223786 -0.525909828
## Internet -0.43513000 0.156962024
## PC -0.56521102 -0.149434742
## Economy.Management -0.33966110 0.332828930
## Biology 0.28191489 -0.317498563
## Chemistry 0.16963485 -0.435678043
## Reading 0.53053608 0.013890328
## Geography -0.07527901 0.022073123
## Foreign.languages 0.16800586 0.231896972
## Medicine 0.24936268 -0.299340267
## Law -0.09403297 0.213582749
## Cars -0.58602027 0.027548543
## Art.exhibitions 0.35749643 0.002257128
## Religion 0.20804632 -0.186595701
## Countryside..outdoors 0.13504690 -0.080259540
## Dancing 0.25484850 0.169634549
## Musical.instruments 0.19065277 -0.163990828
## Writing 0.31718194 -0.041071669
## Passive.sport -0.28742931 0.105842668
## Active.sport -0.23915728 0.042946030
## Gardening 0.19862305 -0.052769685
## Celebrities 0.04519467 0.500470119
## Shopping 0.16219660 0.510814974
## Science.and.technology -0.33046299 -0.312778419
## Theatre 0.42885572 0.066184156
## Fun.with.friends -0.10195405 0.278068093
## Adrenaline.sports -0.28396276 0.041468464
## Pets 0.09401033 0.115951174
knitr::opts_chunk$set(echo = TRUE)
#Plot
ggplot(data.frame(yr_mds_hi),aes(x=x,y=y)) + geom_text(label=rownames(yr_mds_hi),angle=45,size=3)
K-Means
genre_cluster<-kmeans(yr_mds_hi,centers=8)
##Get the clusters
mds_clusters<-data.frame(genre=names(genre_cluster$cluster),cluster_mds=genre_cluster$cluster) %>% arrange(cluster_mds,genre)
mds_clusters
## genre cluster_mds
## 1 Countryside..outdoors 1
## 2 Dancing 1
## 3 Foreign.languages 1
## 4 Gardening 1
## 5 History 1
## 6 Pets 1
## 7 Psychology 1
## 8 Mathematics 2
## 9 Physics 2
## 10 Science.and.technology 2
## 11 Biology 3
## 12 Chemistry 3
## 13 Medicine 3
## 14 Musical.instruments 3
## 15 Religion 3
## 16 Art.exhibitions 4
## 17 Reading 4
## 18 Theatre 4
## 19 Writing 4
## 20 Cars 5
## 21 PC 5
## 22 Celebrities 6
## 23 Shopping 6
## 24 Economy.Management 7
## 25 Internet 7
## 26 Active.sport 8
## 27 Adrenaline.sports 8
## 28 Fun.with.friends 8
## 29 Geography 8
## 30 Law 8
## 31 Passive.sport 8
## 32 Politics 8
ggplot(data.frame(yr_mds_hi) %>%
mutate(clusters=as.factor(genre_cluster$cluster),genres=rownames(yr_mds_hi)),aes(x=x,y=y)) +
geom_text(aes(label=genres,color=clusters),angle=45,size=3) +
geom_point(data=as.data.frame(genre_cluster$centers),aes(x=x,y=y)
)
ESOPH
"alcgp"
## [1] "alcgp"
boxplot(esoph$ncases ~ esoph$alcgp)
"tobgp"
## [1] "tobgp"
boxplot(esoph$ncases ~ esoph$tobgp)
"agegp"
## [1] "agegp"
boxplot(esoph$ncases ~ esoph$agegp)
model_esoph <- glm(cbind(ncases, ncontrols) ~ agegp + unclass(tobgp) + unclass(alcgp),
data = esoph, family = binomial())
summary(model_esoph)
##
## Call:
## glm(formula = cbind(ncases, ncontrols) ~ agegp + unclass(tobgp) +
## unclass(alcgp), family = binomial(), data = esoph)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7628 -0.6426 -0.2709 0.3043 2.0421
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.01097 0.31224 -12.846 < 2e-16 ***
## agegp.L 2.96113 0.65092 4.549 5.39e-06 ***
## agegp.Q -1.33735 0.58918 -2.270 0.02322 *
## agegp.C 0.15292 0.44792 0.341 0.73281
## agegp^4 0.06668 0.30776 0.217 0.82848
## agegp^5 -0.20288 0.19523 -1.039 0.29872
## unclass(tobgp) 0.26162 0.08198 3.191 0.00142 **
## unclass(alcgp) 0.65308 0.08452 7.727 1.10e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 227.241 on 87 degrees of freedom
## Residual deviance: 59.277 on 80 degrees of freedom
## AIC: 222.76
##
## Number of Fisher Scoring iterations: 6