Initial Exploratory Analysis
The TED Talks dataset consists of 2,550 observations and 17 variables. I will look at the view and comment counts of the most viewed and most commented talks. I will also analyse the speakers' occupations, the relationship between the number of languages and views, and the relationship between comments and views.
# Quick review of the dataset: one line per column with its type and first values
dplyr::glimpse(ted_talks)
## Observations: 2,550
## Variables: 17
## $ comments <int> 4553, 265, 124, 200, 593, 672, 919, 46, 852...
## $ description <chr> "Sir Ken Robinson makes an entertaining and...
## $ duration <int> 1164, 977, 1286, 1116, 1190, 1305, 992, 119...
## $ event <chr> "TED2006", "TED2006", "TED2006", "TED2006",...
## $ film_date <int> 1140825600, 1140825600, 1140739200, 1140912...
## $ languages <int> 60, 43, 26, 35, 48, 36, 31, 19, 32, 31, 27,...
## $ main_speaker <chr> "Ken Robinson", "Al Gore", "David Pogue", "...
## $ name <chr> "Ken Robinson: Do schools kill creativity?"...
## $ num_speaker <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ published_date <int> 1151367060, 1151367060, 1151367060, 1151367...
## $ ratings <chr> "[{'id': 7, 'name': 'Funny', 'count': 19645...
## $ related_talks <chr> "[{'id': 865, 'hero': 'https://pe.tedcdn.co...
## $ speaker_occupation <chr> "Author/educator", "Climate advocate", "Tec...
## $ tags <chr> "['children', 'creativity', 'culture', 'dan...
## $ title <chr> "Do schools kill creativity?", "Averting th...
## $ url <chr> "https://www.ted.com/talks/ken_robinson_say...
## $ views <int> 47227110, 3200520, 1636292, 1697550, 120058...
# Number of rows, then number of columns
c(nrow(ted_talks), ncol(ted_talks))
## [1] 2550 17
# How many views does the most viewed TED Talk have?
max(ted_talks[["views"]])
## [1] 47227110
# How many comments does the most commented TED Talk have?
# (largest value of the comments column; missing values are not removed)
max(ted_talks[["comments"]])
## [1] 6404
# How many TED Talks does each main speaker have?
# One row per main_speaker, most prolific speakers first.
d1 <- ted_talks %>%
  count(main_speaker, sort = TRUE) %>%
  rename(total_ted_talks = n)
d1
## # A tibble: 2,156 x 2
## main_speaker total_ted_talks
## <chr> <int>
## 1 Hans Rosling 9
## 2 Juan Enriquez 7
## 3 Marco Tempest 6
## 4 Rives 6
## 5 Bill Gates 5
## 6 Clay Shirky 5
## 7 Dan Ariely 5
## 8 Jacqueline Novogratz 5
## 9 Julian Treasure 5
## 10 Nicholas Negroponte 5
## # ... with 2,146 more rows
# What are the speakers' occupations?
# One row per speaker_occupation with its number of talks, largest first.
d2 <- ted_talks %>%
  count(speaker_occupation, sort = TRUE) %>%
  rename(total_ted_talks = n)
d2
## # A tibble: 1,449 x 2
## speaker_occupation total_ted_talks
## <chr> <int>
## 1 Writer 45
## 2 Artist 34
## 3 Designer 34
## 4 Journalist 33
## 5 Entrepreneur 31
## 6 Architect 30
## 7 Inventor 27
## 8 Psychologist 26
## 9 Photographer 25
## 10 Filmmaker 21
## # ... with 1,439 more rows
library(ggplot2)#visualisation
# Which speakers have given five or more TED Talks?
# Count talks per main_speaker, sorted descending, then keep counts >= 5.
d3 <- ted_talks %>%
  count(main_speaker, sort = TRUE) %>%
  rename(total_ted_talks = n) %>%
  filter(total_ted_talks >= 5)
d3
## # A tibble: 10 x 2
## main_speaker total_ted_talks
## <chr> <int>
## 1 Hans Rosling 9
## 2 Juan Enriquez 7
## 3 Marco Tempest 6
## 4 Rives 6
## 5 Bill Gates 5
## 6 Clay Shirky 5
## 7 Dan Ariely 5
## 8 Jacqueline Novogratz 5
## 9 Julian Treasure 5
## 10 Nicholas Negroponte 5
# Who are the top 10 TED speakers by number of talks?
# Bar chart of d3 (main_speaker, total_ted_talks).
# - No fill aesthetic is mapped: d3 has no `time` column, and the bars use a
#   fixed fill colour, so there is no legend to suppress either.
# - reorder() sorts the bars by descending talk count instead of
#   alphabetically, so the chart reads as a ranking.
# - hjust = 1 anchors the rotated labels at their tick marks.
v1 <- ggplot(
  data = d3,
  aes(x = reorder(main_speaker, -total_ted_talks), y = total_ted_talks)
) +
  geom_col(colour = "black", fill = "#bdc3c7", width = 0.8) +
  xlab("Names Of Speakers") +
  ylab("Number of Talks") +
  ggtitle("Top 10 TED Speakers") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1, size = 10))
v1
# Which occupations are most popular amongst TED speakers?
# Count talks per speaker_occupation, sorted descending, then keep the
# occupations with at least 20 talks.
d4 <- ted_talks %>%
  count(speaker_occupation, sort = TRUE) %>%
  rename(total_ted_talks = n) %>%
  filter(total_ted_talks >= 20)
d4
## # A tibble: 15 x 2
## speaker_occupation total_ted_talks
## <chr> <int>
## 1 Writer 45
## 2 Artist 34
## 3 Designer 34
## 4 Journalist 33
## 5 Entrepreneur 31
## 6 Architect 30
## 7 Inventor 27
## 8 Psychologist 26
## 9 Photographer 25
## 10 Filmmaker 21
## 11 Neuroscientist 21
## 12 Author 20
## 13 Economist 20
## 14 Educator 20
## 15 Roboticist 20
# What are the top 15 occupations of TED speakers?
# Bar chart of d4 (speaker_occupation, total_ted_talks).
# - No fill aesthetic is mapped: d4 has no `time` column, and the bars use a
#   fixed fill colour, so there is no legend to suppress either.
# - reorder() sorts the bars by descending talk count instead of
#   alphabetically, so the chart reads as a ranking.
# - The y axis shows total_ted_talks, i.e. talks per occupation, so it is
#   labelled as a number of talks.
# - hjust = 1 anchors the rotated labels at their tick marks.
v2 <- ggplot(
  data = d4,
  aes(x = reorder(speaker_occupation, -total_ted_talks), y = total_ted_talks)
) +
  geom_col(colour = "black", fill = "#bdc3c7", width = 0.8) +
  xlab("Occupations Of Speakers") +
  ylab("Number of Talks") +
  ggtitle("Top 15 Occupation of TED Speakers") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1, size = 10))
v2
# In how many languages is each talk available?
# Histogram of the languages column with one bin per integer value.
# The x range is set with coord_cartesian() rather than scale limits: scale
# limits censor anything outside them, so a bar whose edges extend past the
# limits (e.g. a bin centred on 0, spanning -0.5..0.5) would be dropped from
# the plot with only a warning. coord_cartesian() zooms without removing data.
v3 <- ggplot(data = ted_talks, aes(x = languages)) +
  geom_histogram(binwidth = 1) +
  coord_cartesian(xlim = c(0, 100))
v3
# Is there a relationship between the number of languages and views?
# Scatter plot: one point per talk, languages on x and views on y.
v4 <- ggplot(ted_talks, mapping = aes(languages, views)) +
  geom_point()
v4
# Is there a relationship between comments and views?
# Scatter plot: one point per talk, comments on x and views on y.
v5 <- ggplot(ted_talks, mapping = aes(comments, views)) +
  geom_point()
v5