# Load the Forbes top-2000 list (2017) from CSV into a data frame.
df <- read.csv(
  file = "C:/Users/serba/Desktop/Forbes2017.csv",
  header = TRUE,
  sep = ","
)
# Number of companies per country. The data frame has no `country.status`
# column (that lookup yields NULL, so the original call printed an empty
# table); the country information lives in the `Country` column.
table(df$Country)
# List the column names of the Forbes top-2000 data frame.
colnames(df)
## [1] "X" "Rank" "Company" "Country"
## [5] "Sales" "Profits" "Assets" "Market.Value"
## [9] "Sector" "Industry"
# Compact structural overview: type and leading values of every column.
str(df)
## 'data.frame': 2000 obs. of 10 variables:
## $ X : logi NA NA NA NA NA NA ...
## $ Rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Company : Factor w/ 1999 levels "3i Group","3M",..: 921 393 277 1021 1927 43 216 219 124 1824 ...
## $ Country : Factor w/ 61 levels "Argentina","Australia",..: 10 10 59 59 59 10 59 10 59 26 ...
## $ Sales : num 151.4 134.2 222.9 102.5 97.6 ...
## $ Profits : num 42 35 24.1 24.2 21.9 27.8 16.6 24.9 45.2 17.1 ...
## $ Assets : num 3473 3017 621 2513 1943 ...
## $ Market.Value: num 230 200 410 307 274 ...
## $ Sector : Factor w/ 11 levels "","Consumer Discretionary",..: 5 5 5 5 5 5 5 5 8 2 ...
## $ Industry : Factor w/ 81 levels "","Advertising",..: 53 69 50 53 53 69 53 53 19 9 ...
library(dplyr)#glimpse function
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Inspect the data frame: one line per column with its type and first values.
dplyr::glimpse(df)
## Observations: 2,000
## Variables: 10
## $ X <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ Rank <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15...
## $ Company <fctr> ICBC, China Construction Bank, Berkshire Hathawa...
## $ Country <fctr> China, China, United States, United States, Unit...
## $ Sales <dbl> 151.4, 134.2, 222.9, 102.5, 97.6, 115.7, 92.2, 11...
## $ Profits <dbl> 42.0, 35.0, 24.1, 24.2, 21.9, 27.8, 16.6, 24.9, 4...
## $ Assets <dbl> 3473.2, 3016.6, 620.9, 2513.0, 1943.4, 2816.0, 21...
## $ Market.Value <dbl> 229.8, 200.5, 409.9, 306.6, 274.4, 149.2, 231.9, ...
## $ Sector <fctr> Financials, Financials, Financials, Financials, ...
## $ Industry <fctr> Major Banks, Regional Banks, Investment Services...
# Number of rows and columns in the Forbes top-2000 data frame.
c(nrow(df), ncol(df))
## [1] 2000 10
# How much profit did the most profitable company make?
# na.rm = TRUE so that a single missing profit figure cannot turn the
# result into NA.
max(df$Profits, na.rm = TRUE)
## [1] 45.2
# How many companies does each country have in the top-2000 list?
# One row per country; `country_int` is the number of companies.
q1 <- df %>%
  group_by(Country) %>%
  summarise(country_int = n())
q1
## # A tibble: 61 x 2
## Country country_int
## <fctr> <int>
## 1 Argentina 3
## 2 Australia 39
## 3 Austria 8
## 4 Bahrain 2
## 5 Belgium 9
## 6 Bermuda 9
## 7 Brazil 20
## 8 Canada 58
## 9 Chile 7
## 10 China 200
## # ... with 51 more rows
library(ggplot2)
# Which sector is the most successful in which country?
# geom_count() sizes each point by the number of companies sharing that
# Sector/Country pair. With geom_point() those companies are drawn on top of
# each other, so a pair with one company looks the same as a pair with many.
ggplot(data = df, aes(x = Sector, y = Country)) +
  geom_count()
# Which countries have 10 or more companies in the list?
# Same per-country count as above, restricted to counts >= 10 and sorted
# from the largest count down.
q2 <- df %>%
  group_by(Country) %>%
  summarise(country_int = n()) %>%
  filter(country_int >= 10) %>%
  arrange(desc(country_int))
q2
## # A tibble: 29 x 2
## Country country_int
## <fctr> <int>
## 1 United States 564
## 2 Japan 229
## 3 China 200
## 4 United Kingdom 91
## 5 South Korea 64
## 6 Hong Kong 62
## 7 France 59
## 8 Canada 58
## 9 India 58
## 10 Germany 51
## # ... with 19 more rows