Download Raw Data

The raw data (excel file) has been uploaded to my github repository. First we download that file and put the data in a temp file. Then we can read that excel file into R and finally we will remove the temp file.

# Create a temporary file
tmp<-tempfile(fileext=".xlsx")
# Download file from repository to the temp file
download.file("https://github.com/MEF-BDA503/pj18-KadirKemal/blob/master/Week2/Odd_Retail_Sales_2017_08.xlsx?raw=true",destfile=tmp)
# Read that excel file using readxl package's read_excel function. You might need to adjust the parameters (skip, col_names) according to your raw file's format.
raw_data<-readxl::read_excel(tmp,skip=7,col_names=FALSE)
# Remove the temp file
file.remove(tmp)
## [1] TRUE

See The Raw Data

You can also embed plots, for example:

# Let's see our raw data
head(raw_data)
## # A tibble: 6 x 10
##   X__1          X__2  X__3  X__4  X__5  X__6  X__7  X__8  X__9 X__10
##   <chr>        <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO      NA    31    31    NA    NA     0     0    31    31
## 2 ASTON MARTIN    NA     3     3    NA    NA     0     0     3     3
## 3 AUDI            NA   903   903    NA    NA     0     0   903   903
## 4 BENTLEY         NA    NA     0    NA    NA     0     0     0     0
## 5 BMW             NA  2230  2230    NA    NA     0     0  2230  2230
## 6 CHERY           NA     1     1    NA    NA     0     0     1     1

Remove Total Colums and Last Two Rows

lastRowIndex = length(raw_data$X__1)-2
raw_data = raw_data[1:lastRowIndex, c(1:10)]

# Let's see our raw data
head(raw_data)
## # A tibble: 6 x 10
##   X__1          X__2  X__3  X__4  X__5  X__6  X__7  X__8  X__9 X__10
##   <chr>        <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO      NA    31    31    NA    NA     0     0    31    31
## 2 ASTON MARTIN    NA     3     3    NA    NA     0     0     3     3
## 3 AUDI            NA   903   903    NA    NA     0     0   903   903
## 4 BENTLEY         NA    NA     0    NA    NA     0     0     0     0
## 5 BMW             NA  2230  2230    NA    NA     0     0  2230  2230
## 6 CHERY           NA     1     1    NA    NA     0     0     1     1
tail(raw_data)
## # A tibble: 6 x 10
##   X__1        X__2  X__3  X__4  X__5  X__6  X__7  X__8  X__9 X__10
##   <chr>      <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SUBARU        NA   103   103    NA    NA     0     0   103   103
## 2 SUZUKI        NA   370   370    NA    NA     0     0   370   370
## 3 TATA          NA    NA     0    NA    NA     0     0     0     0
## 4 TOYOTA      2374  1104  3478    NA   549   549  2374  1653  4027
## 5 VOLKSWAGEN    NA  8438  8438    NA  2729  2729     0 11167 11167
## 6 VOLVO         NA   203   203    NA    NA     0     0   203   203

Set the column titles

colnames(raw_data) <- c("brand_name","auto_dom","auto_imp","auto_total","comm_dom","comm_imp","comm_total","total_dom","total_imp","total_total")

head(raw_data)
## # A tibble: 6 x 10
##   brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
##   <chr>         <dbl>    <dbl>      <dbl>    <dbl>    <dbl>      <dbl>
## 1 ALFA ROMEO       NA       31         31       NA       NA          0
## 2 ASTON MAR…       NA        3          3       NA       NA          0
## 3 AUDI             NA      903        903       NA       NA          0
## 4 BENTLEY          NA       NA          0       NA       NA          0
## 5 BMW              NA     2230       2230       NA       NA          0
## 6 CHERY            NA        1          1       NA       NA          0
## # ... with 3 more variables: total_dom <dbl>, total_imp <dbl>,
## #   total_total <dbl>

Replace NA values With 0

raw_data[is.na(raw_data)] = 0
head(raw_data)
## # A tibble: 6 x 10
##   brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
##   <chr>         <dbl>    <dbl>      <dbl>    <dbl>    <dbl>      <dbl>
## 1 ALFA ROMEO        0       31         31        0        0          0
## 2 ASTON MAR…        0        3          3        0        0          0
## 3 AUDI              0      903        903        0        0          0
## 4 BENTLEY           0        0          0        0        0          0
## 5 BMW               0     2230       2230        0        0          0
## 6 CHERY             0        1          1        0        0          0
## # ... with 3 more variables: total_dom <dbl>, total_imp <dbl>,
## #   total_total <dbl>
car_data_agu_16 = raw_data
car_data_agu_16 %>% mutate(year=2016,month=8)
## # A tibble: 48 x 12
##    brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
##    <chr>         <dbl>    <dbl>      <dbl>    <dbl>    <dbl>      <dbl>
##  1 ALFA ROMEO        0       31         31        0        0          0
##  2 ASTON MAR…        0        3          3        0        0          0
##  3 AUDI              0      903        903        0        0          0
##  4 BENTLEY           0        0          0        0        0          0
##  5 BMW               0     2230       2230        0        0          0
##  6 CHERY             0        1          1        0        0          0
##  7 CITROEN           0     1080       1080      131      592        723
##  8 DACIA             0     3120       3120        0      389        389
##  9 DS                0       34         34        0        0          0
## 10 FERRARI           0        0          0        0        0          0
## # ... with 38 more rows, and 5 more variables: total_dom <dbl>,
## #   total_imp <dbl>, total_total <dbl>, year <dbl>, month <dbl>

Save Data

saveRDS(car_data_agu_16,file="~/Documents/GitHub/pj18-KadirKemal/Week2/odd_car_sales_data_agu_16.rds")

Find the lines that auto_total and comm_total is bigger than 0

car_data_agu_16[car_data_agu_16$auto_total > 0 & car_data_agu_16$comm_total > 0,c(1,4,7)]
## # A tibble: 14 x 3
##    brand_name    auto_total comm_total
##    <chr>              <dbl>      <dbl>
##  1 CITROEN             1080        723
##  2 DACIA               3120        389
##  3 FIAT                3325       4395
##  4 FORD                2679       4936
##  5 HYUNDAI             3382        240
##  6 KIA                 1378        354
##  7 MERCEDES-BENZ       2456        485
##  8 MITSUBISHI            61        387
##  9 NISSAN              2183         91
## 10 PEUGEOT             1601        973
## 11 RENAULT             6589        864
## 12 SSANGYONG             55         22
## 13 TOYOTA              3478        549
## 14 VOLKSWAGEN          8438       2729

Find average of auto_total

mean(car_data_agu_16$auto_total)
## [1] 1124.521

Summary Data

summary(car_data_agu_16)
##   brand_name           auto_dom         auto_imp        auto_total    
##  Length:48          Min.   :   0.0   Min.   :   0.0   Min.   :   0.0  
##  Class :character   1st Qu.:   0.0   1st Qu.:   3.0   1st Qu.:   3.0  
##  Mode  :character   Median :   0.0   Median :  93.5   Median :  93.5  
##                     Mean   : 212.7   Mean   : 911.8   Mean   :1124.5  
##                     3rd Qu.:   0.0   3rd Qu.:1433.8   3rd Qu.:2003.0  
##                     Max.   :3063.0   Max.   :8438.0   Max.   :8438.0  
##     comm_dom         comm_imp        comm_total       total_dom     
##  Min.   :   0.0   Min.   :   0.0   Min.   :   0.0   Min.   :   0.0  
##  1st Qu.:   0.0   1st Qu.:   0.0   1st Qu.:   0.0   1st Qu.:   0.0  
##  Median :   0.0   Median :   0.0   Median :   0.0   Median :   0.0  
##  Mean   : 186.8   Mean   : 179.4   Mean   : 366.2   Mean   : 399.5  
##  3rd Qu.:   0.0   3rd Qu.: 166.5   3rd Qu.: 193.5   3rd Qu.:   0.0  
##  Max.   :4449.0   Max.   :2729.0   Max.   :4936.0   Max.   :7014.0  
##    total_imp        total_total      
##  Min.   :    0.0   Min.   :    0.00  
##  1st Qu.:    5.0   1st Qu.:    5.75  
##  Median :  132.5   Median :  147.00  
##  Mean   : 1091.2   Mean   : 1490.75  
##  3rd Qu.: 1784.8   3rd Qu.: 2241.00  
##  Max.   :11167.0   Max.   :11167.00