Analyzing Credit Card Limits

BDA503: Default Of Credit Card Clients

================================================================
Group Name: Paranormal Distribution
Instructor: Associate Prof. Özgür Özlük
Data Set: https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients
================================================================

Load Required Modules

#if you dont use these libraries before, please install them first!
#install.packages('ggplot2')
#install.packages('RColorBrewer')
#install.packages('gridExtra')
#install.packages('corrplot')
library(ggplot2)
library(RColorBrewer)
library(gridExtra)
library(corrplot)

Load and Investigate Data

ccdata <- read.csv("default of credit card clients.csv")
#Before tidying data set
dim(ccdata)
[1] 30000    25
#Before tidying data set
str(ccdata)
'data.frame':   30000 obs. of  25 variables:
 $ ID         : int  1 2 3 4 5 6 7 8 9 10 ...
 $ LIMIT_BAL  : int  20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
 $ SEX        : int  2 2 2 2 1 1 1 2 2 1 ...
 $ EDUCATION  : int  2 2 2 2 2 1 1 2 3 3 ...
 $ MARRIAGE   : int  1 2 2 1 1 2 2 2 1 2 ...
 $ AGE        : int  24 26 34 37 57 37 29 23 28 35 ...
 $ PAY_0      : int  2 -1 0 0 -1 0 0 0 0 -2 ...
 $ PAY_2      : int  2 2 0 0 0 0 0 -1 0 -2 ...
 $ PAY_3      : int  -1 0 0 0 -1 0 0 -1 2 -2 ...
 $ PAY_4      : int  -1 0 0 0 0 0 0 0 0 -2 ...
 $ PAY_5      : int  -2 0 0 0 0 0 0 0 0 -1 ...
 $ PAY_6      : int  -2 2 0 0 0 0 0 -1 0 -1 ...
 $ BILL_AMT1  : int  3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
 $ BILL_AMT2  : int  3102 1725 14027 48233 5670 57069 412023 380 14096 0 ...
 $ BILL_AMT3  : int  689 2682 13559 49291 35835 57608 445007 601 12108 0 ...
 $ BILL_AMT4  : int  0 3272 14331 28314 20940 19394 542653 221 12211 0 ...
 $ BILL_AMT5  : int  0 3455 14948 28959 19146 19619 483003 -159 11793 13007 ...
 $ BILL_AMT6  : int  0 3261 15549 29547 19131 20024 473944 567 3719 13912 ...
 $ PAY_AMT1   : int  0 0 1518 2000 2000 2500 55000 380 3329 0 ...
 $ PAY_AMT2   : int  689 1000 1500 2019 36681 1815 40000 601 0 0 ...
 $ PAY_AMT3   : int  0 1000 1000 1200 10000 657 38000 0 432 0 ...
 $ PAY_AMT4   : int  0 1000 1000 1100 9000 1000 20239 581 1000 13007 ...
 $ PAY_AMT5   : int  0 0 1000 1069 689 1000 13750 1687 1000 1122 ...
 $ PAY_AMT6   : int  0 2000 5000 1000 679 800 13770 1542 1000 0 ...
 $ DEF_PAYMENT: int  1 1 0 0 0 0 0 0 0 0 ...
#Before tidying data set
summary(ccdata)
       ID          LIMIT_BAL            SEX          EDUCATION    
 Min.   :    1   Min.   :  10000   Min.   :1.000   Min.   :0.000  
 1st Qu.: 7501   1st Qu.:  50000   1st Qu.:1.000   1st Qu.:1.000  
 Median :15000   Median : 140000   Median :2.000   Median :2.000  
 Mean   :15000   Mean   : 167484   Mean   :1.604   Mean   :1.853  
 3rd Qu.:22500   3rd Qu.: 240000   3rd Qu.:2.000   3rd Qu.:2.000  
 Max.   :30000   Max.   :1000000   Max.   :2.000   Max.   :6.000  
    MARRIAGE          AGE            PAY_0             PAY_2        
 Min.   :0.000   Min.   :21.00   Min.   :-2.0000   Min.   :-2.0000  
 1st Qu.:1.000   1st Qu.:28.00   1st Qu.:-1.0000   1st Qu.:-1.0000  
 Median :2.000   Median :34.00   Median : 0.0000   Median : 0.0000  
 Mean   :1.552   Mean   :35.49   Mean   :-0.0167   Mean   :-0.1338  
 3rd Qu.:2.000   3rd Qu.:41.00   3rd Qu.: 0.0000   3rd Qu.: 0.0000  
 Max.   :3.000   Max.   :79.00   Max.   : 8.0000   Max.   : 8.0000  
     PAY_3             PAY_4             PAY_5             PAY_6        
 Min.   :-2.0000   Min.   :-2.0000   Min.   :-2.0000   Min.   :-2.0000  
 1st Qu.:-1.0000   1st Qu.:-1.0000   1st Qu.:-1.0000   1st Qu.:-1.0000  
 Median : 0.0000   Median : 0.0000   Median : 0.0000   Median : 0.0000  
 Mean   :-0.1662   Mean   :-0.2207   Mean   :-0.2662   Mean   :-0.2911  
 3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.: 0.0000  
 Max.   : 8.0000   Max.   : 8.0000   Max.   : 8.0000   Max.   : 8.0000  
   BILL_AMT1         BILL_AMT2        BILL_AMT3         BILL_AMT4      
 Min.   :-165580   Min.   :-69777   Min.   :-157264   Min.   :-170000  
 1st Qu.:   3559   1st Qu.:  2985   1st Qu.:   2666   1st Qu.:   2327  
 Median :  22382   Median : 21200   Median :  20088   Median :  19052  
 Mean   :  51223   Mean   : 49179   Mean   :  47013   Mean   :  43263  
 3rd Qu.:  67091   3rd Qu.: 64006   3rd Qu.:  60165   3rd Qu.:  54506  
 Max.   : 964511   Max.   :983931   Max.   :1664089   Max.   : 891586  
   BILL_AMT5        BILL_AMT6          PAY_AMT1         PAY_AMT2      
 Min.   :-81334   Min.   :-339603   Min.   :     0   Min.   :      0  
 1st Qu.:  1763   1st Qu.:   1256   1st Qu.:  1000   1st Qu.:    833  
 Median : 18104   Median :  17071   Median :  2100   Median :   2009  
 Mean   : 40311   Mean   :  38872   Mean   :  5664   Mean   :   5921  
 3rd Qu.: 50190   3rd Qu.:  49198   3rd Qu.:  5006   3rd Qu.:   5000  
 Max.   :927171   Max.   : 961664   Max.   :873552   Max.   :1684259  
    PAY_AMT3         PAY_AMT4         PAY_AMT5           PAY_AMT6       
 Min.   :     0   Min.   :     0   Min.   :     0.0   Min.   :     0.0  
 1st Qu.:   390   1st Qu.:   296   1st Qu.:   252.5   1st Qu.:   117.8  
 Median :  1800   Median :  1500   Median :  1500.0   Median :  1500.0  
 Mean   :  5226   Mean   :  4826   Mean   :  4799.4   Mean   :  5215.5  
 3rd Qu.:  4505   3rd Qu.:  4013   3rd Qu.:  4031.5   3rd Qu.:  4000.0  
 Max.   :896040   Max.   :621000   Max.   :426529.0   Max.   :528666.0  
  DEF_PAYMENT    
 Min.   :0.0000  
 1st Qu.:0.0000  
 Median :0.0000  
 Mean   :0.2212  
 3rd Qu.:0.0000  
 Max.   :1.0000  

Reshaping Data 1: Creating New Variables

ccdata$workstate <- ""
ccdata$genderH <- ""
ccdata$educationH <- ""
ccdata$maritalH <- ""

for (i in 1:nrow(ccdata)) {
  if ((ccdata[i,7] + ccdata[i,8] +ccdata[i,9]+ccdata[i,10] +ccdata[i,11]+ccdata[i,12]) <= 0){
    ccdata[i,26] <- "YES"  
  }
  else {
    ccdata[i,26] <- "NO"         
  }
}

for (i in 1:nrow(ccdata)) {
  if (ccdata[i,3] == 1) {
    ccdata[i,27] <- "Male"  
  }
  else {
    ccdata[i,27] <- "Female"         
  }
}

for (i in 1:nrow(ccdata)) {
  if (ccdata[i,4] == 1) {
    ccdata[i,28] <- "Graduate"
  } else if (ccdata [i,4] == 2) {
    ccdata[i,28] <- "University" 
  } else if (ccdata [i,4] == 3) {
    ccdata[i,28] <- "High School" 
  } else {
    ccdata[i,28] <- "Unknown" 
  }
}

for (i in 1:nrow(ccdata)) {
  if(ccdata[i,5] == 1) {
    ccdata[i,29] <- "Married"
  } else if (ccdata[i,5] == 2) {
    ccdata[i,29] <- "Single"
  } else {
    ccdata[i,29] <- "Other"
  }
}

Cut a Variable

ccdata$AGE.bucket<-cut(ccdata$AGE,c(10,20,30,40,50,60,70))

Reshaping Data 2: Converting Quantitative Variables To New Factorial Variables

ccdata$workstate <-factor(ccdata$workstate)
ccdata$SEX <-factor(ccdata$SEX)
ccdata$EDUCATION <- factor(ccdata$EDUCATION)
ccdata$MARRIAGE <- factor(ccdata$MARRIAGE)
ccdata$AGEf <- factor(ccdata$AGE)
ccdata$DEF_PAYMENT<-factor(ccdata$DEF_PAYMENT)
ccdata$genderH <- factor(ccdata$genderH)
ccdata$educationH <- factor(ccdata$educationH)
ccdata$maritalH <- factor(ccdata$maritalH)

Reshaping Data 3: Removing Variables After Reshaping 1

ccdata = subset(ccdata, select = -c(PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6))

Investigate Data Again

#After tidying data set
dim(ccdata)
[1] 30000    25
#After tidying data set
str(ccdata)
'data.frame':   30000 obs. of  25 variables:
 $ ID         : int  1 2 3 4 5 6 7 8 9 10 ...
 $ LIMIT_BAL  : int  20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
 $ SEX        : Factor w/ 2 levels "1","2": 2 2 2 2 1 1 1 2 2 1 ...
 $ EDUCATION  : Factor w/ 7 levels "0","1","2","3",..: 3 3 3 3 3 2 2 3 4 4 ...
 $ MARRIAGE   : Factor w/ 4 levels "0","1","2","3": 2 3 3 2 2 3 3 3 2 3 ...
 $ AGE        : int  24 26 34 37 57 37 29 23 28 35 ...
 $ BILL_AMT1  : int  3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
 $ BILL_AMT2  : int  3102 1725 14027 48233 5670 57069 412023 380 14096 0 ...
 $ BILL_AMT3  : int  689 2682 13559 49291 35835 57608 445007 601 12108 0 ...
 $ BILL_AMT4  : int  0 3272 14331 28314 20940 19394 542653 221 12211 0 ...
 $ BILL_AMT5  : int  0 3455 14948 28959 19146 19619 483003 -159 11793 13007 ...
 $ BILL_AMT6  : int  0 3261 15549 29547 19131 20024 473944 567 3719 13912 ...
 $ PAY_AMT1   : int  0 0 1518 2000 2000 2500 55000 380 3329 0 ...
 $ PAY_AMT2   : int  689 1000 1500 2019 36681 1815 40000 601 0 0 ...
 $ PAY_AMT3   : int  0 1000 1000 1200 10000 657 38000 0 432 0 ...
 $ PAY_AMT4   : int  0 1000 1000 1100 9000 1000 20239 581 1000 13007 ...
 $ PAY_AMT5   : int  0 0 1000 1069 689 1000 13750 1687 1000 1122 ...
 $ PAY_AMT6   : int  0 2000 5000 1000 679 800 13770 1542 1000 0 ...
 $ DEF_PAYMENT: Factor w/ 2 levels "0","1": 2 2 1 1 1 1 1 1 1 1 ...
 $ workstate  : Factor w/ 2 levels "NO","YES": 2 1 2 2 2 2 2 2 1 2 ...
 $ genderH    : Factor w/ 2 levels "Female","Male": 1 1 1 1 2 2 2 1 1 2 ...
 $ educationH : Factor w/ 4 levels "Graduate","High School",..: 3 3 3 3 3 1 1 3 2 2 ...
 $ maritalH   : Factor w/ 3 levels "Married","Other",..: 1 3 3 1 1 3 3 3 1 3 ...
 $ AGE.bucket : Factor w/ 6 levels "(10,20]","(20,30]",..: 2 2 3 3 5 3 2 2 2 3 ...
 $ AGEf       : Factor w/ 56 levels "21","22","23",..: 4 6 14 17 37 17 9 3 8 15 ...
#After tidying data set
summary(ccdata)
       ID          LIMIT_BAL       SEX       EDUCATION MARRIAGE 
 Min.   :    1   Min.   :  10000   1:11888   0:   14   0:   54  
 1st Qu.: 7501   1st Qu.:  50000   2:18112   1:10585   1:13659  
 Median :15000   Median : 140000             2:14030   2:15964  
 Mean   :15000   Mean   : 167484             3: 4917   3:  323  
 3rd Qu.:22500   3rd Qu.: 240000             4:  123            
 Max.   :30000   Max.   :1000000             5:  280            
                                             6:   51            
      AGE          BILL_AMT1         BILL_AMT2        BILL_AMT3      
 Min.   :21.00   Min.   :-165580   Min.   :-69777   Min.   :-157264  
 1st Qu.:28.00   1st Qu.:   3559   1st Qu.:  2985   1st Qu.:   2666  
 Median :34.00   Median :  22382   Median : 21200   Median :  20088  
 Mean   :35.49   Mean   :  51223   Mean   : 49179   Mean   :  47013  
 3rd Qu.:41.00   3rd Qu.:  67091   3rd Qu.: 64006   3rd Qu.:  60165  
 Max.   :79.00   Max.   : 964511   Max.   :983931   Max.   :1664089  
                                                                     
   BILL_AMT4         BILL_AMT5        BILL_AMT6          PAY_AMT1     
 Min.   :-170000   Min.   :-81334   Min.   :-339603   Min.   :     0  
 1st Qu.:   2327   1st Qu.:  1763   1st Qu.:   1256   1st Qu.:  1000  
 Median :  19052   Median : 18104   Median :  17071   Median :  2100  
 Mean   :  43263   Mean   : 40311   Mean   :  38872   Mean   :  5664  
 3rd Qu.:  54506   3rd Qu.: 50190   3rd Qu.:  49198   3rd Qu.:  5006  
 Max.   : 891586   Max.   :927171   Max.   : 961664   Max.   :873552  
                                                                      
    PAY_AMT2          PAY_AMT3         PAY_AMT4         PAY_AMT5       
 Min.   :      0   Min.   :     0   Min.   :     0   Min.   :     0.0  
 1st Qu.:    833   1st Qu.:   390   1st Qu.:   296   1st Qu.:   252.5  
 Median :   2009   Median :  1800   Median :  1500   Median :  1500.0  
 Mean   :   5921   Mean   :  5226   Mean   :  4826   Mean   :  4799.4  
 3rd Qu.:   5000   3rd Qu.:  4505   3rd Qu.:  4013   3rd Qu.:  4031.5  
 Max.   :1684259   Max.   :896040   Max.   :621000   Max.   :426529.0  
                                                                       
    PAY_AMT6        DEF_PAYMENT workstate     genderH     
 Min.   :     0.0   0:23364     NO : 7133   Female:18112  
 1st Qu.:   117.8   1: 6636     YES:22867   Male  :11888  
 Median :  1500.0                                         
 Mean   :  5215.5                                         
 3rd Qu.:  4000.0                                         
 Max.   :528666.0                                         
                                                          
       educationH       maritalH       AGE.bucket         AGEf      
 Graduate   :10585   Married:13659   (10,20]:    0   29     : 1605  
 High School: 4917   Other  :  377   (20,30]:11013   27     : 1477  
 University :14030   Single :15964   (30,40]:10713   28     : 1409  
 Unknown    :  468                   (40,50]: 6005   30     : 1395  
                                     (50,60]: 1997   26     : 1256  
                                     (60,70]:  257   31     : 1217  
                                     NA's   :   15   (Other):21641  

Balance Limit by Gender, Education, Work State

# Balance limits by gender and education
d1 <- ggplot(ccdata, aes(factor(genderH), (LIMIT_BAL/1000), fill=educationH)) + 
  geom_boxplot() +
  xlab("Gender") + 
  ylab("BLimit(x1000 NT$)") + 
  scale_fill_brewer(palette = "Accent")

# Balance limits by education and gender
d2 <- ggplot(ccdata, aes(factor(educationH), (LIMIT_BAL/1000), fill=genderH)) + 
  geom_boxplot() +
  xlab("Education") + 
  ylab("BLimit(x1000 NT$)") + 
  scale_fill_brewer(palette = "Paired")

# Balance limits by workstate and education
d3 <-ggplot(ccdata, aes(factor(educationH), (LIMIT_BAL/1000), fill=workstate)) + 
  geom_boxplot() +
  xlab("Education") + 
  ylab("BLimit(x1000 NT$)") 

grid.arrange(d1, d2, d3)

#Result: When we compare the balance limits with gender, education and work status, we saw that gender has no effects on balance limit determination process of bank while the education level is has a positive effect on this process. Also it can be seen that work status is a very important factor at balance limit determination.

Relation Between Marital Status & Balance Limits By Gender

ggplot(ccdata, aes(factor(maritalH), (LIMIT_BAL/1000), fill=genderH)) + 
  geom_boxplot() +
  xlab("Marital Status") + 
  ylab("Balance Limit ( x 1000 NT$)") + 
  coord_cartesian(ylim = c(0,350)) +
  scale_fill_brewer(palette = "Paired")

#Result: By this graph, we can see it again that, there is no change at females side such as balance limits depending on their marital status, however it changes a lot of things on males’ side starting with the expenditures which is the reason on increased balance limits. 

Histogram of Limit Balance & Default Payment

ggplot(aes(x = ccdata$LIMIT_BAL/1000), data = ccdata) +
geom_histogram(aes(fill = ccdata$DEF_PAYMENT)) +
xlab("Balance Limit x 1000") +
ylab("Count") +
scale_fill_discrete(name="Default Payment Next Month",
                      breaks=c(0, 1),
                      labels=c("No", "Yes")) +
xlim(c(0,750)) +
facet_wrap(~educationH)