library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
yr_data <-
read.csv("responses.csv",sep=",") %>%
filter(complete.cases(.)) %>%
# mutate(id=row_number()) %>%
tbl_df()
yr_pca<-
yr_data[,sapply(yr_data,class)=="integer"] %>%
select(History:Pets)
glimpse (yr_pca)
## Observations: 686
## Variables: 32
## $ History <int> 1, 1, 1, 3, 5, 3, 5, 3, 3, 2, 4, 2, 2, ...
## $ Psychology <int> 5, 3, 2, 2, 3, 3, 2, 2, 3, 2, 4, 2, 5, ...
## $ Politics <int> 1, 4, 1, 3, 4, 1, 3, 3, 3, 5, 4, 1, 1, ...
## $ Mathematics <int> 3, 5, 5, 2, 2, 1, 1, 3, 2, 1, 1, 1, 1, ...
## $ Physics <int> 3, 2, 2, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ Internet <int> 5, 4, 4, 2, 4, 2, 5, 5, 4, 5, 3, 3, 4, ...
## $ PC <int> 3, 4, 2, 2, 4, 1, 4, 1, 5, 4, 2, 3, 2, ...
## $ Economy.Management <int> 5, 5, 4, 2, 1, 3, 1, 4, 3, 1, 1, 3, 3, ...
## $ Biology <int> 3, 1, 1, 3, 4, 5, 2, 2, 2, 1, 5, 1, 2, ...
## $ Chemistry <int> 3, 1, 1, 3, 4, 5, 2, 1, 1, 1, 5, 1, 1, ...
## $ Reading <int> 3, 4, 5, 5, 3, 3, 2, 4, 3, 3, 5, 4, 4, ...
## $ Geography <int> 3, 4, 2, 2, 3, 3, 3, 4, 3, 5, 3, 1, 1, ...
## $ Foreign.languages <int> 5, 5, 5, 3, 4, 4, 4, 5, 5, 2, 5, 5, 3, ...
## $ Medicine <int> 3, 1, 2, 3, 4, 5, 1, 1, 2, 1, 5, 1, 1, ...
## $ Law <int> 1, 2, 3, 2, 3, 3, 2, 1, 4, 3, 2, 1, 1, ...
## $ Cars <int> 1, 2, 1, 3, 5, 4, 1, 1, 2, 1, 3, 1, 1, ...
## $ Art.exhibitions <int> 1, 2, 5, 1, 2, 1, 1, 4, 2, 5, 1, 3, 4, ...
## $ Religion <int> 1, 1, 5, 4, 2, 1, 2, 4, 2, 1, 1, 1, 2, ...
## $ Countryside..outdoors <int> 5, 1, 5, 4, 5, 4, 2, 4, 4, 5, 5, 5, 3, ...
## $ Dancing <int> 3, 1, 5, 1, 1, 3, 1, 5, 1, 1, 3, 3, 1, ...
## $ Musical.instruments <int> 3, 1, 5, 3, 5, 2, 1, 3, 1, 1, 4, 3, 1, ...
## $ Writing <int> 2, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, ...
## $ Passive.sport <int> 1, 1, 5, 3, 5, 5, 4, 4, 5, 5, 5, 3, 3, ...
## $ Active.sport <int> 5, 1, 2, 1, 4, 3, 5, 4, 1, 3, 3, 3, 1, ...
## $ Gardening <int> 5, 1, 1, 4, 2, 3, 1, 1, 3, 1, 4, 1, 5, ...
## $ Celebrities <int> 1, 2, 1, 3, 1, 1, 3, 2, 2, 2, 3, 5, 5, ...
## $ Shopping <int> 4, 3, 4, 3, 2, 3, 3, 4, 5, 3, 2, 5, 5, ...
## $ Science.and.technology <int> 4, 3, 2, 3, 3, 4, 2, 3, 4, 3, 3, 2, 2, ...
## $ Theatre <int> 2, 2, 5, 2, 1, 3, 2, 5, 2, 1, 2, 3, 4, ...
## $ Fun.with.friends <int> 5, 4, 5, 4, 3, 5, 4, 5, 4, 3, 4, 5, 5, ...
## $ Adrenaline.sports <int> 4, 2, 5, 2, 3, 1, 2, 2, 1, 1, 1, 4, 1, ...
## $ Pets <int> 4, 5, 5, 1, 2, 5, 5, 2, 5, 1, 2, 5, 5, ...
head(yr_pca)
## # A tibble: 6 x 32
## History Psychology Politics Mathematics Physics Internet PC
## <int> <int> <int> <int> <int> <int> <int>
## 1 1 5 1 3 3 5 3
## 2 1 3 4 5 2 4 4
## 3 1 2 1 5 2 4 2
## 4 3 2 3 2 2 2 2
## 5 5 3 4 2 3 4 4
## 6 3 3 1 1 1 2 1
## # ... with 25 more variables: Economy.Management <int>, Biology <int>,
## # Chemistry <int>, Reading <int>, Geography <int>,
## # Foreign.languages <int>, Medicine <int>, Law <int>, Cars <int>,
## # Art.exhibitions <int>, Religion <int>, Countryside..outdoors <int>,
## # Dancing <int>, Musical.instruments <int>, Writing <int>,
## # Passive.sport <int>, Active.sport <int>, Gardening <int>,
## # Celebrities <int>, Shopping <int>, Science.and.technology <int>,
## # Theatre <int>, Fun.with.friends <int>, Adrenaline.sports <int>,
## # Pets <int>
yr_pca_result<-princomp(yr_pca,cor=T)
summary(yr_pca_result)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 2.0374787 1.8233387 1.60327816 1.48530949
## Proportion of Variance 0.1297287 0.1038926 0.08032815 0.06894201
## Cumulative Proportion 0.1297287 0.2336214 0.31394951 0.38289152
## Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 1.26023134 1.20132035 1.07959740 1.06930606
## Proportion of Variance 0.04963072 0.04509908 0.03642283 0.03573173
## Cumulative Proportion 0.43252224 0.47762132 0.51404415 0.54977588
## Comp.9 Comp.10 Comp.11 Comp.12
## Standard deviation 1.05342776 0.98848494 0.96843882 0.94342106
## Proportion of Variance 0.03467844 0.03053445 0.02930855 0.02781385
## Cumulative Proportion 0.58445432 0.61498878 0.64429733 0.67211118
## Comp.13 Comp.14 Comp.15 Comp.16
## Standard deviation 0.92688407 0.89350216 0.87546384 0.85981220
## Proportion of Variance 0.02684731 0.02494832 0.02395115 0.02310241
## Cumulative Proportion 0.69895850 0.72390681 0.74785797 0.77096037
## Comp.17 Comp.18 Comp.19 Comp.20
## Standard deviation 0.8379842 0.81377184 0.7619490 0.74936288
## Proportion of Variance 0.0219443 0.02069452 0.0181427 0.01754827
## Cumulative Proportion 0.7929047 0.81359919 0.8317419 0.84929016
## Comp.21 Comp.22 Comp.23 Comp.24
## Standard deviation 0.73524390 0.72622112 0.70400198 0.70113254
## Proportion of Variance 0.01689324 0.01648116 0.01548809 0.01536209
## Cumulative Proportion 0.86618340 0.88266456 0.89815265 0.91351474
## Comp.25 Comp.26 Comp.27 Comp.28
## Standard deviation 0.67137253 0.64706583 0.62748835 0.58745885
## Proportion of Variance 0.01408566 0.01308419 0.01230443 0.01078462
## Cumulative Proportion 0.92760039 0.94068459 0.95298901 0.96377363
## Comp.29 Comp.30 Comp.31 Comp.32
## Standard deviation 0.58160258 0.553117299 0.53278768 0.480812500
## Proportion of Variance 0.01057067 0.009560586 0.00887071 0.007224396
## Cumulative Proportion 0.97434431 0.983904895 0.99277560 1.000000000
ggplot(data=data.frame(hobby=1:length(yr_pca_result$sdev),var_exp=cumsum(yr_pca_result$sdev^2/sum(yr_pca_result$sdev^2))),
aes(x=hobby,y=var_exp)) + geom_line() + geom_point() + scale_y_continuous(labels = scales::percent,breaks=seq(0,1,length.out=11)) + scale_x_continuous(breaks=seq(0,135,by=5))
yr_mds_data <- yr_pca %>% select(History:Pets)
print(head(yr_mds_data))
## # A tibble: 6 x 32
## History Psychology Politics Mathematics Physics Internet PC
## <int> <int> <int> <int> <int> <int> <int>
## 1 1 5 1 3 3 5 3
## 2 1 3 4 5 2 4 4
## 3 1 2 1 5 2 4 2
## 4 3 2 3 2 2 2 2
## 5 5 3 4 2 3 4 4
## 6 3 3 1 1 1 2 1
## # ... with 25 more variables: Economy.Management <int>, Biology <int>,
## # Chemistry <int>, Reading <int>, Geography <int>,
## # Foreign.languages <int>, Medicine <int>, Law <int>, Cars <int>,
## # Art.exhibitions <int>, Religion <int>, Countryside..outdoors <int>,
## # Dancing <int>, Musical.instruments <int>, Writing <int>,
## # Passive.sport <int>, Active.sport <int>, Gardening <int>,
## # Celebrities <int>, Shopping <int>, Science.and.technology <int>,
## # Theatre <int>, Fun.with.friends <int>, Adrenaline.sports <int>,
## # Pets <int>
yr_dist <- 1 - cor(yr_mds_data)
yr_mds <- cmdscale(yr_dist,k=2)
colnames(yr_mds) <- c("x","y")
print(yr_mds)
## x y
## History 0.031306226 -0.057585867
## Psychology 0.221673919 -0.034255291
## Politics -0.210706395 0.084207670
## Mathematics -0.366106787 -0.301011922
## Physics -0.338602615 -0.488946202
## Internet -0.435984609 0.193575251
## PC -0.602886441 -0.104205122
## Economy.Management -0.313464728 0.369369369
## Biology 0.249496019 -0.352229723
## Chemistry 0.111150203 -0.427188341
## Reading 0.525002226 -0.036403993
## Geography -0.082496760 -0.010735122
## Foreign.languages 0.197742370 0.164499851
## Medicine 0.190107055 -0.319538506
## Law -0.091720302 0.234211611
## Cars -0.593017958 0.091655704
## Art.exhibitions 0.380132671 -0.025937391
## Religion 0.177085206 -0.229380678
## Countryside..outdoors 0.159766147 -0.101417980
## Dancing 0.326087243 0.151882648
## Musical.instruments 0.165516710 -0.156630640
## Writing 0.291051922 -0.065898705
## Passive.sport -0.288889494 0.153049302
## Active.sport -0.180541543 0.077474365
## Gardening 0.203069385 -0.051532921
## Celebrities 0.059109759 0.515094714
## Shopping 0.248971459 0.505415685
## Science.and.technology -0.368579088 -0.307211932
## Theatre 0.452232345 0.006680821
## Fun.with.friends 0.002109219 0.282818197
## Adrenaline.sports -0.251016420 0.072054588
## Pets 0.132403055 0.168120560
ggplot(data.frame(yr_mds),aes(x=x,y=y)) + geom_text(label=rownames(yr_mds),angle=45,size=2)
set.seed(58)
genre_cluster<-kmeans(yr_mds,centers=4)
mds_clusters<-data.frame(genre=names(genre_cluster$cluster),cluster_mds=genre_cluster$cluster) %>% arrange(cluster_mds,genre)
mds_clusters
## genre cluster_mds
## 1 Art.exhibitions 1
## 2 Biology 1
## 3 Chemistry 1
## 4 Countryside..outdoors 1
## 5 Gardening 1
## 6 History 1
## 7 Medicine 1
## 8 Musical.instruments 1
## 9 Psychology 1
## 10 Reading 1
## 11 Religion 1
## 12 Theatre 1
## 13 Writing 1
## 14 Mathematics 2
## 15 PC 2
## 16 Physics 2
## 17 Science.and.technology 2
## 18 Celebrities 3
## 19 Dancing 3
## 20 Foreign.languages 3
## 21 Fun.with.friends 3
## 22 Pets 3
## 23 Shopping 3
## 24 Active.sport 4
## 25 Adrenaline.sports 4
## 26 Cars 4
## 27 Economy.Management 4
## 28 Geography 4
## 29 Internet 4
## 30 Law 4
## 31 Passive.sport 4
## 32 Politics 4
ggplot(data.frame(yr_mds) %>% mutate(clusters=as.factor(genre_cluster$cluster),genres=rownames(yr_mds)),aes(x=x,y=y)) + geom_text(aes(label=genres,color=clusters),angle=45,size=2) + geom_point(data=as.data.frame(genre_cluster$centers),aes(x=x,y=y)
)
yr_hc<-hclust(as.dist(yr_dist),method="complete")
plot(yr_hc,hang=-1)
# References:
# https://mef-bda503.github.io/files/intro_to_ml.html