1 Introduction

1.1 What type of analysis can be done with this data set?

1.2 Load packages and data

# Download data file from source
#download.file("https://github.com/MEF-BDA503/pj-tektunalic/responses.csv")

# Install tidyverse only when it is not already available.
# system.file(package = ...) returns "" for a package that is not installed,
# which avoids building the full installed.packages() matrix (and matching
# against every cell of it) just to test for a single package name.
if (!nzchar(system.file(package = "tidyverse"))) {
  install.packages("tidyverse", repos = "https://cran.r-project.org")
}
# Load tidyverse package
#library(tidyverse)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#install.packages("ggplot2")
library(ggplot2)

#For correlogram plots
#install.packages("ggcorrplot")
library(ggcorrplot)
library("corrplot")
## corrplot 0.84 loaded
library(RColorBrewer)


# Read the survey responses (responses.csv in the working directory) into `d`
d <- read.csv(file = "responses.csv")

# Palette of 24 hex colours handed to the manual fill scales and correlogram
custom_colors <- c(
  "#E32800", "#FDB205", "#FDF505", "#009BDF", "#E3FD05", "#A7FD05",
  "#7CBE00", "#639700", "#972000", "#871D00", "#50FF95", "#00DEAF",
  "#00B891", "#00B5B8", "#0080B8", "#0063E8", "#0047A7", "#9F55FF",
  "#C69BFF", "#D69BFF", "#B956FE", "#DF56FE", "#FE5681", "#9BDF00"
)

1.3 General View Of Data And Assumptions

As seen below, numerical survey responses are mostly collected on a scale of 1 to 5. So we need a systematic approach that first groups these answers and then compares the resulting groups. I separated the answers into two main groups. These groups correspond to:

I ignored neutrals (answering 3 in each category) in the context of this analysis to catch stronger tendencies at both ends.

Also there are general categories to which each question belongs in this survey. These are:

So I will look for possible correlations within each of the categories I select, and also between two categories of interest.

#summary(d)
# Preview the first five rows: every column with its type and sample values
d %>%
  head(n = 5) %>%
  glimpse()
## Observations: 5
## Variables: 150
## $ Music                          <int> 5, 4, 5, 5, 5
## $ Slow.songs.or.fast.songs       <int> 3, 4, 5, 3, 3
## $ Dance                          <int> 2, 2, 2, 2, 4
## $ Folk                           <int> 1, 1, 2, 1, 3
## $ Country                        <int> 2, 1, 3, 1, 2
## $ Classical.music                <int> 2, 1, 4, 1, 4
## $ Musical                        <int> 1, 2, 5, 1, 3
## $ Pop                            <int> 5, 3, 3, 2, 5
## $ Rock                           <int> 5, 5, 5, 2, 3
## $ Metal.or.Hardrock              <int> 1, 4, 3, 1, 1
## $ Punk                           <int> 1, 4, 4, 4, 2
## $ Hiphop..Rap                    <int> 1, 1, 1, 2, 5
## $ Reggae..Ska                    <int> 1, 3, 4, 2, 3
## $ Swing..Jazz                    <int> 1, 1, 3, 1, 2
## $ Rock.n.roll                    <int> 3, 4, 5, 2, 1
## $ Alternative                    <int> 1, 4, 5, 5, 2
## $ Latino                         <int> 1, 2, 5, 1, 4
## $ Techno..Trance                 <int> 1, 1, 1, 2, 2
## $ Opera                          <int> 1, 1, 3, 1, 2
## $ Movies                         <int> 5, 5, 5, 5, 5
## $ Horror                         <int> 4, 2, 3, 4, 4
## $ Thriller                       <int> 2, 2, 4, 4, 4
## $ Comedy                         <int> 5, 4, 4, 3, 5
## $ Romantic                       <int> 4, 3, 2, 3, 2
## $ Sci.fi                         <int> 4, 4, 4, 4, 3
## $ War                            <int> 1, 1, 2, 3, 3
## $ Fantasy.Fairy.tales            <int> 5, 3, 5, 1, 4
## $ Animated                       <int> 5, 5, 5, 2, 4
## $ Documentary                    <int> 3, 4, 2, 5, 3
## $ Western                        <int> 1, 1, 2, 1, 1
## $ Action                         <int> 2, 4, 1, 2, 4
## $ History                        <int> 1, 1, 1, 4, 3
## $ Psychology                     <int> 5, 3, 2, 4, 2
## $ Politics                       <int> 1, 4, 1, 5, 3
## $ Mathematics                    <int> 3, 5, 5, 4, 2
## $ Physics                        <int> 3, 2, 2, 1, 2
## $ Internet                       <int> 5, 4, 4, 3, 2
## $ PC                             <int> 3, 4, 2, 1, 2
## $ Economy.Management             <int> 5, 5, 4, 2, 2
## $ Biology                        <int> 3, 1, 1, 3, 3
## $ Chemistry                      <int> 3, 1, 1, 3, 3
## $ Reading                        <int> 3, 4, 5, 5, 5
## $ Geography                      <int> 3, 4, 2, 4, 2
## $ Foreign.languages              <int> 5, 5, 5, 4, 3
## $ Medicine                       <int> 3, 1, 2, 2, 3
## $ Law                            <int> 1, 2, 3, 5, 2
## $ Cars                           <int> 1, 2, 1, 1, 3
## $ Art.exhibitions                <int> 1, 2, 5, 5, 1
## $ Religion                       <int> 1, 1, 5, 4, 4
## $ Countryside..outdoors          <int> 5, 1, 5, 1, 4
## $ Dancing                        <int> 3, 1, 5, 1, 1
## $ Musical.instruments            <int> 3, 1, 5, 1, 3
## $ Writing                        <int> 2, 1, 5, 3, 1
## $ Passive.sport                  <int> 1, 1, 5, 1, 3
## $ Active.sport                   <int> 5, 1, 2, 1, 1
## $ Gardening                      <int> 5, 1, 1, 1, 4
## $ Celebrities                    <int> 1, 2, 1, 2, 3
## $ Shopping                       <int> 4, 3, 4, 4, 3
## $ Science.and.technology         <int> 4, 3, 2, 3, 3
## $ Theatre                        <int> 2, 2, 5, 1, 2
## $ Fun.with.friends               <int> 5, 4, 5, 2, 4
## $ Adrenaline.sports              <int> 4, 2, 5, 1, 2
## $ Pets                           <int> 4, 5, 5, 1, 1
## $ Flying                         <int> 1, 1, 1, 2, 1
## $ Storm                          <int> 1, 1, 1, 1, 2
## $ Darkness                       <int> 1, 1, 1, 1, 1
## $ Heights                        <int> 1, 2, 1, 3, 1
## $ Spiders                        <int> 1, 1, 1, 5, 1
## $ Snakes                         <int> 5, 1, 1, 5, 1
## $ Rats                           <int> 3, 1, 1, 5, 2
## $ Ageing                         <int> 1, 3, 1, 4, 2
## $ Dangerous.dogs                 <int> 3, 1, 1, 5, 4
## $ Fear.of.public.speaking        <int> 2, 4, 2, 5, 3
## $ Smoking                        <fctr> never smoked, never smoked, tr...
## $ Alcohol                        <fctr> drink a lot, drink a lot, drin...
## $ Healthy.eating                 <int> 4, 3, 3, 3, 4
## $ Daily.events                   <int> 2, 3, 1, 4, 3
## $ Prioritising.workload          <int> 2, 2, 2, 4, 1
## $ Writing.notes                  <int> 5, 4, 5, 4, 2
## $ Workaholism                    <int> 4, 5, 3, 5, 3
## $ Thinking.ahead                 <int> 2, 4, 5, 3, 5
## $ Final.judgement                <int> 5, 1, 3, 1, 5
## $ Reliability                    <int> 4, 4, 4, 3, 5
## $ Keeping.promises               <int> 4, 4, 5, 4, 4
## $ Loss.of.interest               <int> 1, 3, 1, 5, 2
## $ Friends.versus.money           <int> 3, 4, 5, 2, 3
## $ Funniness                      <int> 5, 3, 2, 1, 3
## $ Fake                           <int> 1, 2, 4, 1, 2
## $ Criminal.damage                <int> 1, 1, 1, 5, 1
## $ Decision.making                <int> 3, 2, 3, 5, 3
## $ Elections                      <int> 4, 5, 5, 5, 5
## $ Self.criticism                 <int> 1, 4, 4, 5, 5
## $ Judgment.calls                 <int> 3, 4, 4, 4, 5
## $ Hypochondria                   <int> 1, 1, 1, 3, 1
## $ Empathy                        <int> 3, 2, 5, 3, 3
## $ Eating.to.survive              <int> 1, 1, 5, 1, 1
## $ Giving                         <int> 4, 2, 5, 1, 3
## $ Compassion.to.animals          <int> 5, 4, 4, 2, 3
## $ Borrowed.stuff                 <int> 4, 3, 2, 5, 4
## $ Loneliness                     <int> 3, 2, 5, 5, 3
## $ Cheating.in.school             <int> 2, 4, 3, 5, 5
## $ Health                         <int> 1, 4, 2, 1, 3
## $ Changing.the.past              <int> 1, 4, 5, 5, 4
## $ God                            <int> 1, 1, 5, 4, 5
## $ Dreams                         <int> 4, 3, 1, 3, 3
## $ Charity                        <int> 2, 1, 3, 3, 3
## $ Number.of.friends              <int> 3, 3, 3, 1, 3
## $ Punctuality                    <fctr> i am always on time, i am ofte...
## $ Lying                          <fctr> never, sometimes, sometimes, o...
## $ Waiting                        <int> 3, 3, 2, 1, 3
## $ New.environment                <int> 4, 4, 3, 1, 4
## $ Mood.swings                    <int> 3, 4, 4, 5, 2
## $ Appearence.and.gestures        <int> 4, 4, 3, 3, 3
## $ Socializing                    <int> 3, 4, 5, 1, 3
## $ Achievements                   <int> 4, 2, 3, 3, 3
## $ Responding.to.a.serious.letter <int> 3, 4, 4, 3, 3
## $ Children                       <int> 5, 2, 4, 2, 5
## $ Assertiveness                  <int> 1, 2, 3, 5, 4
## $ Getting.angry                  <int> 1, 5, 4, 5, 2
## $ Knowing.the.right.people       <int> 3, 4, 3, 4, 3
## $ Public.speaking                <int> 5, 4, 2, 5, 5
## $ Unpopularity                   <int> 5, 4, 4, 3, 5
## $ Life.struggles                 <int> 1, 1, 4, 3, 2
## $ Happiness.in.life              <int> 4, 4, 4, 2, 3
## $ Energy.levels                  <int> 5, 3, 4, 2, 5
## $ Small...big.dogs               <int> 1, 5, 3, 1, 3
## $ Personality                    <int> 4, 3, 3, 2, 3
## $ Finding.lost.valuables         <int> 3, 4, 3, 1, 2
## $ Getting.up                     <int> 2, 5, 4, 1, 4
## $ Interests.or.hobbies           <int> 3, 3, 5, NA, 3
## $ Parents..advice                <int> 4, 2, 3, 2, 3
## $ Questionnaires.or.polls        <int> 3, 3, 1, 4, 3
## $ Internet.usage                 <fctr> few hours a day, few hours a d...
## $ Finances                       <int> 3, 3, 2, 2, 4
## $ Shopping.centres               <int> 4, 4, 4, 4, 3
## $ Branded.clothing               <int> 5, 1, 1, 3, 4
## $ Entertainment.spending         <int> 3, 4, 4, 3, 3
## $ Spending.on.looks              <int> 3, 2, 3, 4, 3
## $ Spending.on.gadgets            <int> 1, 5, 4, 4, 2
## $ Spending.on.healthy.eating     <int> 3, 2, 2, 1, 4
## $ Age                            <int> 20, 19, 20, 22, 20
## $ Height                         <int> 163, 163, 176, 172, 170
## $ Weight                         <int> 48, 58, 67, 59, 59
## $ Number.of.siblings             <int> 1, 2, 2, 1, 1
## $ Gender                         <fctr> female, female, female, female...
## $ Left...right.handed            <fctr> right handed, right handed, ri...
## $ Education                      <fctr> college/bachelor degree, colle...
## $ Only.child                     <fctr> no, no, no, yes, no
## $ Village...town                 <fctr> village, city, city, city, vil...
## $ House...block.of.flats         <fctr> block of flats, block of flats...

1.4 Data Integrity Check

Let’s check whether we have rows with NA values.

# Count the missing values in each column.
# sum(is.na(col)) counts the TRUE entries directly, and vapply() guarantees an
# integer vector (named by column) whatever the shape of the input.
na_count <- vapply(d, function(col) sum(is.na(col)), integer(1))
# One-column data frame whose row names are the survey column names
na_count <- data.frame(na_count)

na_count
##                                na_count
## Music                                 3
## Slow.songs.or.fast.songs              2
## Dance                                 4
## Folk                                  5
## Country                               5
## Classical.music                       7
## Musical                               2
## Pop                                   3
## Rock                                  6
## Metal.or.Hardrock                     3
## Punk                                  8
## Hiphop..Rap                           4
## Reggae..Ska                           7
## Swing..Jazz                           6
## Rock.n.roll                           7
## Alternative                           7
## Latino                                8
## Techno..Trance                        7
## Opera                                 1
## Movies                                6
## Horror                                2
## Thriller                              1
## Comedy                                3
## Romantic                              3
## Sci.fi                                2
## War                                   2
## Fantasy.Fairy.tales                   3
## Animated                              3
## Documentary                           8
## Western                               4
## Action                                2
## History                               2
## Psychology                            5
## Politics                              1
## Mathematics                           3
## Physics                               3
## Internet                              4
## PC                                    6
## Economy.Management                    5
## Biology                               6
## Chemistry                            10
## Reading                               6
## Geography                             9
## Foreign.languages                     5
## Medicine                              5
## Law                                   1
## Cars                                  4
## Art.exhibitions                       6
## Religion                              3
## Countryside..outdoors                 7
## Dancing                               3
## Musical.instruments                   1
## Writing                               6
## Passive.sport                        15
## Active.sport                          4
## Gardening                             7
## Celebrities                           2
## Shopping                              2
## Science.and.technology                6
## Theatre                               8
## Fun.with.friends                      4
## Adrenaline.sports                     3
## Pets                                  4
## Flying                                3
## Storm                                 1
## Darkness                              2
## Heights                               3
## Spiders                               5
## Snakes                                0
## Rats                                  3
## Ageing                                1
## Dangerous.dogs                        1
## Fear.of.public.speaking               1
## Smoking                               0
## Alcohol                               0
## Healthy.eating                        3
## Daily.events                          7
## Prioritising.workload                 5
## Writing.notes                         3
## Workaholism                           5
## Thinking.ahead                        3
## Final.judgement                       7
## Reliability                           4
## Keeping.promises                      1
## Loss.of.interest                      4
## Friends.versus.money                  6
## Funniness                             4
## Fake                                  1
## Criminal.damage                       7
## Decision.making                       4
## Elections                             3
## Self.criticism                        5
## Judgment.calls                        4
## Hypochondria                          4
## Empathy                               5
## Eating.to.survive                     0
## Giving                                6
## Compassion.to.animals                 7
## Borrowed.stuff                        2
## Loneliness                            1
## Cheating.in.school                    4
## Health                                1
## Changing.the.past                     2
## God                                   2
## Dreams                                0
## Charity                               3
## Number.of.friends                     0
## Punctuality                           0
## Lying                                 0
## Waiting                               3
## New.environment                       2
## Mood.swings                           4
## Appearence.and.gestures               3
## Socializing                           5
## Achievements                          2
## Responding.to.a.serious.letter        6
## Children                              4
## Assertiveness                         2
## Getting.angry                         4
## Knowing.the.right.people              2
## Public.speaking                       2
## Unpopularity                          3
## Life.struggles                        3
## Happiness.in.life                     4
## Energy.levels                         5
## Small...big.dogs                      4
## Personality                           4
## Finding.lost.valuables                4
## Getting.up                            5
## Interests.or.hobbies                  3
## Parents..advice                       2
## Questionnaires.or.polls               4
## Internet.usage                        0
## Finances                              3
## Shopping.centres                      2
## Branded.clothing                      2
## Entertainment.spending                3
## Spending.on.looks                     3
## Spending.on.gadgets                   0
## Spending.on.healthy.eating            2
## Age                                   7
## Height                               20
## Weight                               20
## Number.of.siblings                    6
## Gender                                0
## Left...right.handed                   0
## Education                             0
## Only.child                            0
## Village...town                        0
## House...block.of.flats                0

Because there are many missing values relative to the total row count, removing every row that contains an NA is not preferred. Instead, rows will be dropped only for the columns involved in each visualization, where necessary.

2 Visualizations

Below are some sample visualizations to understand the data set. First, I want to see the respondents’ profile by age, gender and education level.

#See how many empty rows exists
#length(d$Gender[as.character(d$Gender)==""])

# Treat blank gender answers as missing
d$Gender[d$Gender %in% ""] <- NA

# Histogram of participant ages in one-year bins, bars dodged by gender
ggplot(data = d, mapping = aes(x = Age, fill = Gender)) +
  geom_histogram(binwidth = 1, alpha = 0.5, position = "dodge") +
  labs(
    title = "Participant Numbers By Age (Gender)",
    x = "Age",
    y = "Participant Count"
  ) +
  scale_fill_manual(
    name = "Gender\n",
    values = custom_colors,
    # breaks = c("Female", "Male", "NA"),
    labels = c("Female", "Male", "NA")
  )

#length(d$Education[as.character(d$Education)==""])

# Treat blank education answers as missing
d$Education[as.character(d$Education) == ""] <- NA

# Histogram of participant ages, bars dodged by education level.
# Age is stored in whole years, so the bin width is 1 (as in the gender plot):
# a width of 1.25 makes some bins cover two ages and others only one, which
# inflates those bars and shifts them off the integer axis ticks.
ggplot(d, aes(x = Age, fill = Education)) +
  geom_histogram(binwidth = 1, alpha = 1, position = "dodge") +
  scale_x_continuous(
    breaks = seq(from = 10, to = 32, by = 1),
    limits = c(15, 31)
  ) +
  scale_y_continuous(
    breaks = seq(from = 0, to = 300, by = 20),
    limits = c(0, 160)
  ) +
  labs(
    y = "Participant Count",
    x = "Age",
    title = "Participant Numbers By Age (Education Level)"
  ) +
  scale_fill_manual(values = custom_colors)

# music_pref_agreers <- d  %>% 
#   na.omit() %>% 
#   #select(names(d)[1:19]) %>% 
#   select(d,Music:Opera) %>%
#   filter()

#music_pref_disagreers


# Keep only the music-preference columns (Music through Opera, i.e. the first
# 19 columns) and then drop incomplete rows. Selecting first means a respondent
# is excluded only for a missing music answer, not for an NA in any of the
# other survey columns.
music_pref <- d %>%
  select(Music:Opera) %>%
  na.omit()

# Pairwise correlations between the music-preference columns, to 2 decimals
corr <- round(cor(music_pref), digits = 2)
#corr

# Lower-triangle correlogram drawn with circles and labelled coefficients;
# hc.order = TRUE asks ggcorrplot to reorder the variables by clustering
ggcorrplot(
  corr,
  method = "circle",
  type = "lower",
  hc.order = TRUE,
  lab = TRUE,
  lab_size = 2,
  # NOTE(review): ggcorrplot documents `colors` as 3 colours (low, mid, high);
  # custom_colors has 24 entries, so presumably only the first three are
  # used - confirm against the ggcorrplot reference.
  colors = custom_colors,
  title = "Correlogram of Music Preferences",
  ggtheme = theme_bw
)

# Unrounded correlation matrix for the corrplot version of the correlogram
M <- cor(music_pref)

# Upper triangle only, diagonal hidden, variables ordered with order = "FPC",
# coloured on a 200-step dark blue / white / orange ramp
corrplot(
  M,
  method = "color",
  type = "upper",
  order = "FPC",
  diag = FALSE,
  tl.pos = "td",
  tl.cex = 0.8,
  col = colorRampPalette(c("dark blue", "white", "orange"))(200)
)

According to the correlation matrix, the strongest negative correlation is between the Pop and Metal-Hardrock preferences. Positive correlations exist between Opera and Classical.music, and among Metal-Hardrock, Rock and Punk.

# Create plots for habits by age
# Histogram of participant ages, bars dodged by alcohol consumption.
# Age is stored in whole years, so the bin width is 1 (as in the gender plot):
# a width of 1.25 makes some bins cover two ages and others only one, which
# inflates those bars and shifts them off the integer axis ticks.
#p1 <-
ggplot(d, aes(x = Age, fill = Alcohol)) +
  geom_histogram(binwidth = 1, alpha = 1, position = "dodge") +
  scale_x_continuous(
    breaks = seq(from = 10, to = 32, by = 1),
    limits = c(15, 31)
  ) +
  scale_y_continuous(
    breaks = seq(from = 0, to = 300, by = 20),
    limits = c(0, 150)
  ) +
  labs(
    y = "Participant Count",
    x = "Age",
    title = "Participant Numbers By Age (Alcohol Consumption)"
  ) +
  scale_fill_manual(values = custom_colors)

# Histogram of participant ages, bars dodged by smoking habit.
# Age is stored in whole years, so the bin width is 1 (as in the gender plot):
# a width of 1.25 makes some bins cover two ages and others only one, which
# inflates those bars and shifts them off the integer axis ticks.
#p2 <-
ggplot(d, aes(x = Age, fill = Smoking)) +
  geom_histogram(binwidth = 1, alpha = 1, position = "dodge") +
  scale_x_continuous(
    breaks = seq(from = 10, to = 32, by = 1),
    limits = c(15, 31)
  ) +
  scale_y_continuous(
    breaks = seq(from = 0, to = 300, by = 20),
    limits = c(0, 100)
  ) +
  labs(
    y = "Participant Count",
    x = "Age",
    title = "Participant Numbers By Age (Smoking)"
  ) +
  scale_fill_manual(values = custom_colors)

#multiplot(p1,p2,cols=1)

The graphs show that harmful habits are also most common between ages 18 and 21.

References