library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
yr_data <-
read.csv("responses.csv",sep=",") %>%
filter(complete.cases(.)) %>%
# mutate(id=row_number()) %>%
tbl_df()


yr_pca<-
yr_data[,sapply(yr_data,class)=="integer"] %>%
select(History:Pets)

glimpse (yr_pca)
## Observations: 686
## Variables: 32
## $ History                <int> 1, 1, 1, 3, 5, 3, 5, 3, 3, 2, 4, 2, 2, ...
## $ Psychology             <int> 5, 3, 2, 2, 3, 3, 2, 2, 3, 2, 4, 2, 5, ...
## $ Politics               <int> 1, 4, 1, 3, 4, 1, 3, 3, 3, 5, 4, 1, 1, ...
## $ Mathematics            <int> 3, 5, 5, 2, 2, 1, 1, 3, 2, 1, 1, 1, 1, ...
## $ Physics                <int> 3, 2, 2, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ Internet               <int> 5, 4, 4, 2, 4, 2, 5, 5, 4, 5, 3, 3, 4, ...
## $ PC                     <int> 3, 4, 2, 2, 4, 1, 4, 1, 5, 4, 2, 3, 2, ...
## $ Economy.Management     <int> 5, 5, 4, 2, 1, 3, 1, 4, 3, 1, 1, 3, 3, ...
## $ Biology                <int> 3, 1, 1, 3, 4, 5, 2, 2, 2, 1, 5, 1, 2, ...
## $ Chemistry              <int> 3, 1, 1, 3, 4, 5, 2, 1, 1, 1, 5, 1, 1, ...
## $ Reading                <int> 3, 4, 5, 5, 3, 3, 2, 4, 3, 3, 5, 4, 4, ...
## $ Geography              <int> 3, 4, 2, 2, 3, 3, 3, 4, 3, 5, 3, 1, 1, ...
## $ Foreign.languages      <int> 5, 5, 5, 3, 4, 4, 4, 5, 5, 2, 5, 5, 3, ...
## $ Medicine               <int> 3, 1, 2, 3, 4, 5, 1, 1, 2, 1, 5, 1, 1, ...
## $ Law                    <int> 1, 2, 3, 2, 3, 3, 2, 1, 4, 3, 2, 1, 1, ...
## $ Cars                   <int> 1, 2, 1, 3, 5, 4, 1, 1, 2, 1, 3, 1, 1, ...
## $ Art.exhibitions        <int> 1, 2, 5, 1, 2, 1, 1, 4, 2, 5, 1, 3, 4, ...
## $ Religion               <int> 1, 1, 5, 4, 2, 1, 2, 4, 2, 1, 1, 1, 2, ...
## $ Countryside..outdoors  <int> 5, 1, 5, 4, 5, 4, 2, 4, 4, 5, 5, 5, 3, ...
## $ Dancing                <int> 3, 1, 5, 1, 1, 3, 1, 5, 1, 1, 3, 3, 1, ...
## $ Musical.instruments    <int> 3, 1, 5, 3, 5, 2, 1, 3, 1, 1, 4, 3, 1, ...
## $ Writing                <int> 2, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, ...
## $ Passive.sport          <int> 1, 1, 5, 3, 5, 5, 4, 4, 5, 5, 5, 3, 3, ...
## $ Active.sport           <int> 5, 1, 2, 1, 4, 3, 5, 4, 1, 3, 3, 3, 1, ...
## $ Gardening              <int> 5, 1, 1, 4, 2, 3, 1, 1, 3, 1, 4, 1, 5, ...
## $ Celebrities            <int> 1, 2, 1, 3, 1, 1, 3, 2, 2, 2, 3, 5, 5, ...
## $ Shopping               <int> 4, 3, 4, 3, 2, 3, 3, 4, 5, 3, 2, 5, 5, ...
## $ Science.and.technology <int> 4, 3, 2, 3, 3, 4, 2, 3, 4, 3, 3, 2, 2, ...
## $ Theatre                <int> 2, 2, 5, 2, 1, 3, 2, 5, 2, 1, 2, 3, 4, ...
## $ Fun.with.friends       <int> 5, 4, 5, 4, 3, 5, 4, 5, 4, 3, 4, 5, 5, ...
## $ Adrenaline.sports      <int> 4, 2, 5, 2, 3, 1, 2, 2, 1, 1, 1, 4, 1, ...
## $ Pets                   <int> 4, 5, 5, 1, 2, 5, 5, 2, 5, 1, 2, 5, 5, ...
head(yr_pca)
## # A tibble: 6 x 32
##   History Psychology Politics Mathematics Physics Internet    PC
##     <int>      <int>    <int>       <int>   <int>    <int> <int>
## 1       1          5        1           3       3        5     3
## 2       1          3        4           5       2        4     4
## 3       1          2        1           5       2        4     2
## 4       3          2        3           2       2        2     2
## 5       5          3        4           2       3        4     4
## 6       3          3        1           1       1        2     1
## # ... with 25 more variables: Economy.Management <int>, Biology <int>,
## #   Chemistry <int>, Reading <int>, Geography <int>,
## #   Foreign.languages <int>, Medicine <int>, Law <int>, Cars <int>,
## #   Art.exhibitions <int>, Religion <int>, Countryside..outdoors <int>,
## #   Dancing <int>, Musical.instruments <int>, Writing <int>,
## #   Passive.sport <int>, Active.sport <int>, Gardening <int>,
## #   Celebrities <int>, Shopping <int>, Science.and.technology <int>,
## #   Theatre <int>, Fun.with.friends <int>, Adrenaline.sports <int>,
## #   Pets <int>
yr_pca_result<-princomp(yr_pca,cor=T)
summary(yr_pca_result)
## Importance of components:
##                           Comp.1    Comp.2     Comp.3     Comp.4
## Standard deviation     2.0374787 1.8233387 1.60327816 1.48530949
## Proportion of Variance 0.1297287 0.1038926 0.08032815 0.06894201
## Cumulative Proportion  0.1297287 0.2336214 0.31394951 0.38289152
##                            Comp.5     Comp.6     Comp.7     Comp.8
## Standard deviation     1.26023134 1.20132035 1.07959740 1.06930606
## Proportion of Variance 0.04963072 0.04509908 0.03642283 0.03573173
## Cumulative Proportion  0.43252224 0.47762132 0.51404415 0.54977588
##                            Comp.9    Comp.10    Comp.11    Comp.12
## Standard deviation     1.05342776 0.98848494 0.96843882 0.94342106
## Proportion of Variance 0.03467844 0.03053445 0.02930855 0.02781385
## Cumulative Proportion  0.58445432 0.61498878 0.64429733 0.67211118
##                           Comp.13    Comp.14    Comp.15    Comp.16
## Standard deviation     0.92688407 0.89350216 0.87546384 0.85981220
## Proportion of Variance 0.02684731 0.02494832 0.02395115 0.02310241
## Cumulative Proportion  0.69895850 0.72390681 0.74785797 0.77096037
##                          Comp.17    Comp.18   Comp.19    Comp.20
## Standard deviation     0.8379842 0.81377184 0.7619490 0.74936288
## Proportion of Variance 0.0219443 0.02069452 0.0181427 0.01754827
## Cumulative Proportion  0.7929047 0.81359919 0.8317419 0.84929016
##                           Comp.21    Comp.22    Comp.23    Comp.24
## Standard deviation     0.73524390 0.72622112 0.70400198 0.70113254
## Proportion of Variance 0.01689324 0.01648116 0.01548809 0.01536209
## Cumulative Proportion  0.86618340 0.88266456 0.89815265 0.91351474
##                           Comp.25    Comp.26    Comp.27    Comp.28
## Standard deviation     0.67137253 0.64706583 0.62748835 0.58745885
## Proportion of Variance 0.01408566 0.01308419 0.01230443 0.01078462
## Cumulative Proportion  0.92760039 0.94068459 0.95298901 0.96377363
##                           Comp.29     Comp.30    Comp.31     Comp.32
## Standard deviation     0.58160258 0.553117299 0.53278768 0.480812500
## Proportion of Variance 0.01057067 0.009560586 0.00887071 0.007224396
## Cumulative Proportion  0.97434431 0.983904895 0.99277560 1.000000000
ggplot(data=data.frame(hobby=1:length(yr_pca_result$sdev),var_exp=cumsum(yr_pca_result$sdev^2/sum(yr_pca_result$sdev^2))),
aes(x=hobby,y=var_exp)) + geom_line() + geom_point() + scale_y_continuous(labels = scales::percent,breaks=seq(0,1,length.out=11)) + scale_x_continuous(breaks=seq(0,135,by=5))

yr_mds_data <- yr_pca %>% select(History:Pets)
print(head(yr_mds_data))
## # A tibble: 6 x 32
##   History Psychology Politics Mathematics Physics Internet    PC
##     <int>      <int>    <int>       <int>   <int>    <int> <int>
## 1       1          5        1           3       3        5     3
## 2       1          3        4           5       2        4     4
## 3       1          2        1           5       2        4     2
## 4       3          2        3           2       2        2     2
## 5       5          3        4           2       3        4     4
## 6       3          3        1           1       1        2     1
## # ... with 25 more variables: Economy.Management <int>, Biology <int>,
## #   Chemistry <int>, Reading <int>, Geography <int>,
## #   Foreign.languages <int>, Medicine <int>, Law <int>, Cars <int>,
## #   Art.exhibitions <int>, Religion <int>, Countryside..outdoors <int>,
## #   Dancing <int>, Musical.instruments <int>, Writing <int>,
## #   Passive.sport <int>, Active.sport <int>, Gardening <int>,
## #   Celebrities <int>, Shopping <int>, Science.and.technology <int>,
## #   Theatre <int>, Fun.with.friends <int>, Adrenaline.sports <int>,
## #   Pets <int>
yr_dist <- 1 - cor(yr_mds_data)

yr_mds <- cmdscale(yr_dist,k=2)

colnames(yr_mds) <- c("x","y")
print(yr_mds)
##                                   x            y
## History                 0.031306226 -0.057585867
## Psychology              0.221673919 -0.034255291
## Politics               -0.210706395  0.084207670
## Mathematics            -0.366106787 -0.301011922
## Physics                -0.338602615 -0.488946202
## Internet               -0.435984609  0.193575251
## PC                     -0.602886441 -0.104205122
## Economy.Management     -0.313464728  0.369369369
## Biology                 0.249496019 -0.352229723
## Chemistry               0.111150203 -0.427188341
## Reading                 0.525002226 -0.036403993
## Geography              -0.082496760 -0.010735122
## Foreign.languages       0.197742370  0.164499851
## Medicine                0.190107055 -0.319538506
## Law                    -0.091720302  0.234211611
## Cars                   -0.593017958  0.091655704
## Art.exhibitions         0.380132671 -0.025937391
## Religion                0.177085206 -0.229380678
## Countryside..outdoors   0.159766147 -0.101417980
## Dancing                 0.326087243  0.151882648
## Musical.instruments     0.165516710 -0.156630640
## Writing                 0.291051922 -0.065898705
## Passive.sport          -0.288889494  0.153049302
## Active.sport           -0.180541543  0.077474365
## Gardening               0.203069385 -0.051532921
## Celebrities             0.059109759  0.515094714
## Shopping                0.248971459  0.505415685
## Science.and.technology -0.368579088 -0.307211932
## Theatre                 0.452232345  0.006680821
## Fun.with.friends        0.002109219  0.282818197
## Adrenaline.sports      -0.251016420  0.072054588
## Pets                    0.132403055  0.168120560
ggplot(data.frame(yr_mds),aes(x=x,y=y)) + geom_text(label=rownames(yr_mds),angle=45,size=2)

set.seed(58)

genre_cluster<-kmeans(yr_mds,centers=4)

mds_clusters<-data.frame(genre=names(genre_cluster$cluster),cluster_mds=genre_cluster$cluster) %>% arrange(cluster_mds,genre)
mds_clusters
##                     genre cluster_mds
## 1         Art.exhibitions           1
## 2                 Biology           1
## 3               Chemistry           1
## 4   Countryside..outdoors           1
## 5               Gardening           1
## 6                 History           1
## 7                Medicine           1
## 8     Musical.instruments           1
## 9              Psychology           1
## 10                Reading           1
## 11               Religion           1
## 12                Theatre           1
## 13                Writing           1
## 14            Mathematics           2
## 15                     PC           2
## 16                Physics           2
## 17 Science.and.technology           2
## 18            Celebrities           3
## 19                Dancing           3
## 20      Foreign.languages           3
## 21       Fun.with.friends           3
## 22                   Pets           3
## 23               Shopping           3
## 24           Active.sport           4
## 25      Adrenaline.sports           4
## 26                   Cars           4
## 27     Economy.Management           4
## 28              Geography           4
## 29               Internet           4
## 30                    Law           4
## 31          Passive.sport           4
## 32               Politics           4
ggplot(data.frame(yr_mds) %>% mutate(clusters=as.factor(genre_cluster$cluster),genres=rownames(yr_mds)),aes(x=x,y=y)) + geom_text(aes(label=genres,color=clusters),angle=45,size=2) + geom_point(data=as.data.frame(genre_cluster$centers),aes(x=x,y=y)
)

yr_hc<-hclust(as.dist(yr_dist),method="complete")
plot(yr_hc,hang=-1)

# References:
# https://mef-bda503.github.io/files/intro_to_ml.html