# Load the aggregated car data, drop the "TOPLAM:" rows and any brand whose
# name contains "ODD", then normalise the Aston Martin brand spelling.
# NOTE(review): the path below is absolute and machine-specific; consider a
# project-relative path so the script runs on other machines.
# NOTE(review): the search pattern looks like a mis-decoded Turkish dotted
# capital I; confirm the file encoding matches the values stored in the data.
car_agg <- readRDS("C:\\Users\\Kafein\\Documents\\github2\\pj18-Leyla.Yigit\\AssignmentWeek3\\car_data_aggregate.rds") %>%
  filter(brand_name != "TOPLAM:", !str_detect(brand_name, "ODD")) %>%
  mutate(brand_name = str_replace(brand_name, "ASTON MARTÄ°N", "ASTON MARTIN"))
# as_tibble() replaces the deprecated tbl_df(); it prints the same preview.
as_tibble(car_agg)
## # A tibble: 1,477 x 12
## brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO 0 13 13 0 0 0
## 2 ASTON MAR~ 0 2 2 0 0 0
## 3 AUDI 0 350 350 0 0 0
## 4 BENTLEY 0 0 0 0 0 0
## 5 BMW 0 158 158 0 0 0
## 6 CITROEN 0 134 134 0 197 197
## 7 DACIA 0 1141 1141 0 319 319
## 8 DS 0 9 9 0 0 0
## 9 FERRARI 0 3 3 0 0 0
## 10 FIAT 632 57 689 789 199 988
## # ... with 1,467 more rows, and 5 more variables: total_dom <dbl>,
## # total_imp <dbl>, total_total <dbl>, year <dbl>, month <dbl>
#glimpse(car_agg) # gives structure of data
You can also embed plots, for example:
## function (data)
## {
## as_tibble(data, .name_repair = "check_unique")
## }
## <bytecode: 0x000000001cc5e670>
## <environment: namespace:dplyr>
## # A tibble: 33 x 1
## Date
## <date>
## 1 2018-09-30
## 2 2018-08-31
## 3 2018-07-31
## 4 2018-06-30
## 5 2018-05-31
## 6 2018-04-30
## 7 2018-03-31
## 8 2018-02-28
## 9 2018-01-31
## 10 2017-12-31
## # ... with 23 more rows
# Sum of auto_total per year, ordered by year.
# No row limit is applied: every year present in the data is kept.
yearly_sales <- car_agg %>%
  group_by(year) %>%
  summarise(total_auto = sum(auto_total)) %>%
  arrange(year)
yearly_sales
## # A tibble: 3 x 2
## year total_auto
## <dbl> <dbl>
## 1 2016 755368
## 2 2017 678539
## 3 2018 361913
# Sum of auto_total per Date, ordered by Date, keeping only the first 12 rows.
monthly_sales <- car_agg %>%
  group_by(Date) %>%
  summarise(total_auto = sum(auto_total)) %>%
  arrange(Date) %>%
  slice(1:12)
monthly_sales
## # A tibble: 12 x 2
## Date total_auto
## <date> <dbl>
## 1 2016-01-31 23278
## 2 2016-02-29 40588
## 3 2016-03-31 63629
## 4 2016-04-30 65618
## 5 2016-05-31 73832
## 6 2016-06-30 70567
## 7 2016-07-31 45566
## 8 2016-08-31 53977
## 9 2016-09-30 50777
## 10 2016-10-31 63709
## 11 2016-11-30 95783
## 12 2016-12-31 108044
# Monthly auto_total with a smoothed trend line.
# The colour is a fixed value, so it is set outside aes(); inside aes() the
# string would be treated as data and produce a default colour plus a legend.
ggplot(monthly_sales, aes(x = Date, y = total_auto)) +
  geom_point(color = "steelblue") +
  geom_smooth(color = "steelblue")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#Scatterplot
#You might want to add the plot’s main title and perhaps change the X and Y axis titles. This can be accomplished using the labs layer, which is meant for specifying the labels. However, manipulating the size and color of the labels is the job of the ‘Theme’.
library(ggplot2)
# Scatterplot of monthly auto_total with a main title and axis titles.
# The point colour is a constant set outside aes(), so no legend is drawn.
gg <- ggplot(monthly_sales, aes(x = Date, y = total_auto)) +
  geom_point(color = "steelblue") +
  labs(title = "Scatterplot", x = "Date", y = "amount")
print(gg)
# Adjust the theme: larger, bold plot title and larger axis text and titles.
gg1 <- gg +
  theme(
    plot.title = element_text(size = 30, face = "bold"),
    axis.text.x = element_text(size = 15),
    axis.text.y = element_text(size = 15),
    axis.title.x = element_text(size = 25),
    axis.title.y = element_text(size = 25)
  )
print(gg1) # print the plot
library(ggplot2)
# Scatterplot of total_total over Date, with one colour per brand_name.
# Points are semi-transparent so overlapping observations stay visible.
ggplot(data = car_agg, aes(x = Date, y = total_total)) +
  geom_point(alpha = 0.3, aes(color = brand_name))
#gg1 + facet_wrap( ~ brand_name, ncol=3) # columns defined by 'cut'
# Boxplot: distribution of total_imp for each brand_name.
car_agg %>%
  ggplot(aes(x = brand_name, y = total_imp)) +
  geom_boxplot()
# Overlaying the individual observations on the boxplot gives a better idea of
# the number of measurements and of their distribution.
ggplot(data = car_agg, aes(x = brand_name, y = total_imp)) +
  geom_boxplot(alpha = 0) +
  geom_jitter(alpha = 0.3, color = "tomato") +
  # Rotate the brand labels so they do not overlap on the x axis.
  theme(axis.text.x = element_text(angle = 90)) +
  ylab("Total Cars Imported")
# Count the number of rows in car_agg for each Date value.
count_by_month <- count(car_agg, Date)
count_by_month
## # A tibble: 33 x 2
## Date n
## <date> <int>
## 1 2016-01-31 46
## 2 2016-02-29 48
## 3 2016-03-31 46
## 4 2016-04-30 48
## 5 2016-05-31 48
## 6 2016-06-30 46
## 7 2016-07-31 48
## 8 2016-08-31 48
## 9 2016-09-30 46
## 10 2016-10-31 47
## # ... with 23 more rows
# Bar chart of the number of car_agg rows per Date, with each count printed
# inside its bar.
p <- ggplot(count_by_month, aes(x = Date, y = n))
p +
  # A single bar layer: geom_col() already draws bars from the y values.
  geom_col() +
  geom_text(aes(label = n), vjust = 1.6, color = "white", size = 3) +
  labs(y = "Number of Records", x = "Month") +
  theme_minimal()
#p + geom_line() + geom_point()
#p + geom_line() + geom_point(aes(color=Date))
#ggplot line
# Keep only the three selected brands.
luxury_data <- car_agg %>%
  filter(brand_name %in% c("BMW", "MERCEDES-BENZ", "AUDI"))
# Line chart of total_total per month, one line per brand. The date is built
# from year and month using the first day of each month.
luxury_data %>%
  mutate(date = as.Date(paste(year, month, 1, sep = "-"))) %>%
  ggplot(aes(x = date, y = total_total, color = brand_name)) +
  # The legend belongs to the colour aesthetic, so its title is set via color.
  labs(y = "The Sales of Each Brands", x = "Months", color = "Brands") +
  geom_line() +
  geom_point()
# the ggpairs function from the GGally package to plot all pairs of scatterplots for several variables
# Per-year sums of total_total, auto_total and comm_total, largest total first.
car_agg_summary <- car_agg %>%
  group_by(year) %>%
  summarise(
    total_total = sum(total_total),
    auto_total = sum(auto_total),
    comm_total = sum(comm_total)
  ) %>%
  arrange(desc(total_total))
car_agg_summary
## # A tibble: 3 x 4
## year total_total auto_total comm_total
## <dbl> <dbl> <dbl> <dbl>
## 1 2016 982150 755368 226782
## 2 2017 900859 678539 222320
## 3 2018 462904 361913 100991
#install.packages("GGally")
library("GGally")
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
# Pairwise plot matrix of the three yearly totals.
car_agg_summary %>%
  select(total_total, auto_total, comm_total) %>%
  ggpairs()
# Per-brand sums of total_total, auto_total and comm_total, largest total first.
car_agg_summary_brand <- car_agg %>%
  group_by(brand_name) %>%
  summarise(
    total_total = sum(total_total),
    auto_total = sum(auto_total),
    comm_total = sum(comm_total)
  ) %>%
  arrange(desc(total_total))
car_agg_summary_brand
## # A tibble: 49 x 4
## brand_name total_total auto_total comm_total
## <chr> <dbl> <dbl> <dbl>
## 1 RENAULT 318500 279831 38669
## 2 FIAT 275900 142022 133878
## 3 FORD 271157 100172 170985
## 4 VOLKSWAGEN 262041 198371 63670
## 5 HYUNDAI 131666 125446 6220
## 6 DACIA 117978 103206 14772
## 7 OPEL 117122 117122 0
## 8 TOYOTA 106950 95582 11368
## 9 PEUGEOT 98036 74042 23994
## 10 MERCEDES-BENZ 93944 70584 23360
## # ... with 39 more rows
# Per-brand auto_total against total_total, coloured by auto_total, with a
# semi-transparent jittered overlay, wide plot margins and a dark grey
# background.
car_agg_summary_brand %>%
  ggplot(aes(x = total_total, y = auto_total, color = auto_total)) +
  geom_point(size = 3) +
  labs(
    title = "Auto trend in the total cars number",
    x = "total_total",
    y = "auto_total"
  ) +
  geom_jitter(alpha = 0.3, color = "tomato") +
  theme(
    plot.margin = margin(2, .8, 2, .8, "cm"),
    plot.background = element_rect(fill = "darkgrey")
  )
library(ggplot2)
#install.packages("treemapify")
library(treemapify)
# Treemap of brands: tile area is total_total, and each tile is filled and
# labelled by brand_name. The legend is placed below the plot.
car_agg_summary_brand %>%
  ggplot(aes(area = total_total, fill = brand_name, label = brand_name)) +
  geom_treemap() +
  geom_treemap_text(
    fontface = "italic",
    colour = "white",
    place = "centre",
    grow = TRUE
  ) +
  theme(legend.position = "bottom")