================================================================
Group Name: Paranormal Distribution
Instructor: Associate Prof. Özgür Özlük
Data Set: https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients
================================================================
#if you dont use these libraries before, please install them first!
#install.packages('ggplot2')
#install.packages('RColorBrewer')
#install.packages('gridExtra')
#install.packages('corrplot')
library(ggplot2)
library(RColorBrewer)
library(gridExtra)
library(corrplot)
ccdata <- read.csv("default of credit card clients.csv")
#Before tidying data set
dim(ccdata)
[1] 30000 25
#Before tidying data set
str(ccdata)
'data.frame': 30000 obs. of 25 variables:
$ ID : int 1 2 3 4 5 6 7 8 9 10 ...
$ LIMIT_BAL : int 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
$ SEX : int 2 2 2 2 1 1 1 2 2 1 ...
$ EDUCATION : int 2 2 2 2 2 1 1 2 3 3 ...
$ MARRIAGE : int 1 2 2 1 1 2 2 2 1 2 ...
$ AGE : int 24 26 34 37 57 37 29 23 28 35 ...
$ PAY_0 : int 2 -1 0 0 -1 0 0 0 0 -2 ...
$ PAY_2 : int 2 2 0 0 0 0 0 -1 0 -2 ...
$ PAY_3 : int -1 0 0 0 -1 0 0 -1 2 -2 ...
$ PAY_4 : int -1 0 0 0 0 0 0 0 0 -2 ...
$ PAY_5 : int -2 0 0 0 0 0 0 0 0 -1 ...
$ PAY_6 : int -2 2 0 0 0 0 0 -1 0 -1 ...
$ BILL_AMT1 : int 3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
$ BILL_AMT2 : int 3102 1725 14027 48233 5670 57069 412023 380 14096 0 ...
$ BILL_AMT3 : int 689 2682 13559 49291 35835 57608 445007 601 12108 0 ...
$ BILL_AMT4 : int 0 3272 14331 28314 20940 19394 542653 221 12211 0 ...
$ BILL_AMT5 : int 0 3455 14948 28959 19146 19619 483003 -159 11793 13007 ...
$ BILL_AMT6 : int 0 3261 15549 29547 19131 20024 473944 567 3719 13912 ...
$ PAY_AMT1 : int 0 0 1518 2000 2000 2500 55000 380 3329 0 ...
$ PAY_AMT2 : int 689 1000 1500 2019 36681 1815 40000 601 0 0 ...
$ PAY_AMT3 : int 0 1000 1000 1200 10000 657 38000 0 432 0 ...
$ PAY_AMT4 : int 0 1000 1000 1100 9000 1000 20239 581 1000 13007 ...
$ PAY_AMT5 : int 0 0 1000 1069 689 1000 13750 1687 1000 1122 ...
$ PAY_AMT6 : int 0 2000 5000 1000 679 800 13770 1542 1000 0 ...
$ DEF_PAYMENT: int 1 1 0 0 0 0 0 0 0 0 ...
#Before tidying data set
summary(ccdata)
ID LIMIT_BAL SEX EDUCATION
Min. : 1 Min. : 10000 Min. :1.000 Min. :0.000
1st Qu.: 7501 1st Qu.: 50000 1st Qu.:1.000 1st Qu.:1.000
Median :15000 Median : 140000 Median :2.000 Median :2.000
Mean :15000 Mean : 167484 Mean :1.604 Mean :1.853
3rd Qu.:22500 3rd Qu.: 240000 3rd Qu.:2.000 3rd Qu.:2.000
Max. :30000 Max. :1000000 Max. :2.000 Max. :6.000
MARRIAGE AGE PAY_0 PAY_2
Min. :0.000 Min. :21.00 Min. :-2.0000 Min. :-2.0000
1st Qu.:1.000 1st Qu.:28.00 1st Qu.:-1.0000 1st Qu.:-1.0000
Median :2.000 Median :34.00 Median : 0.0000 Median : 0.0000
Mean :1.552 Mean :35.49 Mean :-0.0167 Mean :-0.1338
3rd Qu.:2.000 3rd Qu.:41.00 3rd Qu.: 0.0000 3rd Qu.: 0.0000
Max. :3.000 Max. :79.00 Max. : 8.0000 Max. : 8.0000
PAY_3 PAY_4 PAY_5 PAY_6
Min. :-2.0000 Min. :-2.0000 Min. :-2.0000 Min. :-2.0000
1st Qu.:-1.0000 1st Qu.:-1.0000 1st Qu.:-1.0000 1st Qu.:-1.0000
Median : 0.0000 Median : 0.0000 Median : 0.0000 Median : 0.0000
Mean :-0.1662 Mean :-0.2207 Mean :-0.2662 Mean :-0.2911
3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.0000
Max. : 8.0000 Max. : 8.0000 Max. : 8.0000 Max. : 8.0000
BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4
Min. :-165580 Min. :-69777 Min. :-157264 Min. :-170000
1st Qu.: 3559 1st Qu.: 2985 1st Qu.: 2666 1st Qu.: 2327
Median : 22382 Median : 21200 Median : 20088 Median : 19052
Mean : 51223 Mean : 49179 Mean : 47013 Mean : 43263
3rd Qu.: 67091 3rd Qu.: 64006 3rd Qu.: 60165 3rd Qu.: 54506
Max. : 964511 Max. :983931 Max. :1664089 Max. : 891586
BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2
Min. :-81334 Min. :-339603 Min. : 0 Min. : 0
1st Qu.: 1763 1st Qu.: 1256 1st Qu.: 1000 1st Qu.: 833
Median : 18104 Median : 17071 Median : 2100 Median : 2009
Mean : 40311 Mean : 38872 Mean : 5664 Mean : 5921
3rd Qu.: 50190 3rd Qu.: 49198 3rd Qu.: 5006 3rd Qu.: 5000
Max. :927171 Max. : 961664 Max. :873552 Max. :1684259
PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6
Min. : 0 Min. : 0 Min. : 0.0 Min. : 0.0
1st Qu.: 390 1st Qu.: 296 1st Qu.: 252.5 1st Qu.: 117.8
Median : 1800 Median : 1500 Median : 1500.0 Median : 1500.0
Mean : 5226 Mean : 4826 Mean : 4799.4 Mean : 5215.5
3rd Qu.: 4505 3rd Qu.: 4013 3rd Qu.: 4031.5 3rd Qu.: 4000.0
Max. :896040 Max. :621000 Max. :426529.0 Max. :528666.0
DEF_PAYMENT
Min. :0.0000
1st Qu.:0.0000
Median :0.0000
Mean :0.2212
3rd Qu.:0.0000
Max. :1.0000
ccdata$workstate <- ""
ccdata$genderH <- ""
ccdata$educationH <- ""
ccdata$maritalH <- ""
for (i in 1:nrow(ccdata)) {
if ((ccdata[i,7] + ccdata[i,8] +ccdata[i,9]+ccdata[i,10] +ccdata[i,11]+ccdata[i,12]) <= 0){
ccdata[i,26] <- "YES"
}
else {
ccdata[i,26] <- "NO"
}
}
for (i in 1:nrow(ccdata)) {
if (ccdata[i,3] == 1) {
ccdata[i,27] <- "Male"
}
else {
ccdata[i,27] <- "Female"
}
}
for (i in 1:nrow(ccdata)) {
if (ccdata[i,4] == 1) {
ccdata[i,28] <- "Graduate"
} else if (ccdata [i,4] == 2) {
ccdata[i,28] <- "University"
} else if (ccdata [i,4] == 3) {
ccdata[i,28] <- "High School"
} else {
ccdata[i,28] <- "Unknown"
}
}
for (i in 1:nrow(ccdata)) {
if(ccdata[i,5] == 1) {
ccdata[i,29] <- "Married"
} else if (ccdata[i,5] == 2) {
ccdata[i,29] <- "Single"
} else {
ccdata[i,29] <- "Other"
}
}
ccdata$AGE.bucket<-cut(ccdata$AGE,c(10,20,30,40,50,60,70))
ccdata$workstate <-factor(ccdata$workstate)
ccdata$SEX <-factor(ccdata$SEX)
ccdata$EDUCATION <- factor(ccdata$EDUCATION)
ccdata$MARRIAGE <- factor(ccdata$MARRIAGE)
ccdata$AGEf <- factor(ccdata$AGE)
ccdata$DEF_PAYMENT<-factor(ccdata$DEF_PAYMENT)
ccdata$genderH <- factor(ccdata$genderH)
ccdata$educationH <- factor(ccdata$educationH)
ccdata$maritalH <- factor(ccdata$maritalH)
ccdata = subset(ccdata, select = -c(PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6))
#After tidying data set
dim(ccdata)
[1] 30000 25
#After tidying data set
str(ccdata)
'data.frame': 30000 obs. of 25 variables:
$ ID : int 1 2 3 4 5 6 7 8 9 10 ...
$ LIMIT_BAL : int 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
$ SEX : Factor w/ 2 levels "1","2": 2 2 2 2 1 1 1 2 2 1 ...
$ EDUCATION : Factor w/ 7 levels "0","1","2","3",..: 3 3 3 3 3 2 2 3 4 4 ...
$ MARRIAGE : Factor w/ 4 levels "0","1","2","3": 2 3 3 2 2 3 3 3 2 3 ...
$ AGE : int 24 26 34 37 57 37 29 23 28 35 ...
$ BILL_AMT1 : int 3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
$ BILL_AMT2 : int 3102 1725 14027 48233 5670 57069 412023 380 14096 0 ...
$ BILL_AMT3 : int 689 2682 13559 49291 35835 57608 445007 601 12108 0 ...
$ BILL_AMT4 : int 0 3272 14331 28314 20940 19394 542653 221 12211 0 ...
$ BILL_AMT5 : int 0 3455 14948 28959 19146 19619 483003 -159 11793 13007 ...
$ BILL_AMT6 : int 0 3261 15549 29547 19131 20024 473944 567 3719 13912 ...
$ PAY_AMT1 : int 0 0 1518 2000 2000 2500 55000 380 3329 0 ...
$ PAY_AMT2 : int 689 1000 1500 2019 36681 1815 40000 601 0 0 ...
$ PAY_AMT3 : int 0 1000 1000 1200 10000 657 38000 0 432 0 ...
$ PAY_AMT4 : int 0 1000 1000 1100 9000 1000 20239 581 1000 13007 ...
$ PAY_AMT5 : int 0 0 1000 1069 689 1000 13750 1687 1000 1122 ...
$ PAY_AMT6 : int 0 2000 5000 1000 679 800 13770 1542 1000 0 ...
$ DEF_PAYMENT: Factor w/ 2 levels "0","1": 2 2 1 1 1 1 1 1 1 1 ...
$ workstate : Factor w/ 2 levels "NO","YES": 2 1 2 2 2 2 2 2 1 2 ...
$ genderH : Factor w/ 2 levels "Female","Male": 1 1 1 1 2 2 2 1 1 2 ...
$ educationH : Factor w/ 4 levels "Graduate","High School",..: 3 3 3 3 3 1 1 3 2 2 ...
$ maritalH : Factor w/ 3 levels "Married","Other",..: 1 3 3 1 1 3 3 3 1 3 ...
$ AGE.bucket : Factor w/ 6 levels "(10,20]","(20,30]",..: 2 2 3 3 5 3 2 2 2 3 ...
$ AGEf : Factor w/ 56 levels "21","22","23",..: 4 6 14 17 37 17 9 3 8 15 ...
#After tidying data set
summary(ccdata)
ID LIMIT_BAL SEX EDUCATION MARRIAGE
Min. : 1 Min. : 10000 1:11888 0: 14 0: 54
1st Qu.: 7501 1st Qu.: 50000 2:18112 1:10585 1:13659
Median :15000 Median : 140000 2:14030 2:15964
Mean :15000 Mean : 167484 3: 4917 3: 323
3rd Qu.:22500 3rd Qu.: 240000 4: 123
Max. :30000 Max. :1000000 5: 280
6: 51
AGE BILL_AMT1 BILL_AMT2 BILL_AMT3
Min. :21.00 Min. :-165580 Min. :-69777 Min. :-157264
1st Qu.:28.00 1st Qu.: 3559 1st Qu.: 2985 1st Qu.: 2666
Median :34.00 Median : 22382 Median : 21200 Median : 20088
Mean :35.49 Mean : 51223 Mean : 49179 Mean : 47013
3rd Qu.:41.00 3rd Qu.: 67091 3rd Qu.: 64006 3rd Qu.: 60165
Max. :79.00 Max. : 964511 Max. :983931 Max. :1664089
BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1
Min. :-170000 Min. :-81334 Min. :-339603 Min. : 0
1st Qu.: 2327 1st Qu.: 1763 1st Qu.: 1256 1st Qu.: 1000
Median : 19052 Median : 18104 Median : 17071 Median : 2100
Mean : 43263 Mean : 40311 Mean : 38872 Mean : 5664
3rd Qu.: 54506 3rd Qu.: 50190 3rd Qu.: 49198 3rd Qu.: 5006
Max. : 891586 Max. :927171 Max. : 961664 Max. :873552
PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5
Min. : 0 Min. : 0 Min. : 0 Min. : 0.0
1st Qu.: 833 1st Qu.: 390 1st Qu.: 296 1st Qu.: 252.5
Median : 2009 Median : 1800 Median : 1500 Median : 1500.0
Mean : 5921 Mean : 5226 Mean : 4826 Mean : 4799.4
3rd Qu.: 5000 3rd Qu.: 4505 3rd Qu.: 4013 3rd Qu.: 4031.5
Max. :1684259 Max. :896040 Max. :621000 Max. :426529.0
PAY_AMT6 DEF_PAYMENT workstate genderH
Min. : 0.0 0:23364 NO : 7133 Female:18112
1st Qu.: 117.8 1: 6636 YES:22867 Male :11888
Median : 1500.0
Mean : 5215.5
3rd Qu.: 4000.0
Max. :528666.0
educationH maritalH AGE.bucket AGEf
Graduate :10585 Married:13659 (10,20]: 0 29 : 1605
High School: 4917 Other : 377 (20,30]:11013 27 : 1477
University :14030 Single :15964 (30,40]:10713 28 : 1409
Unknown : 468 (40,50]: 6005 30 : 1395
(50,60]: 1997 26 : 1256
(60,70]: 257 31 : 1217
NA's : 15 (Other):21641
# Balance limits by gender and education
d1 <- ggplot(ccdata, aes(factor(genderH), (LIMIT_BAL/1000), fill=educationH)) +
geom_boxplot() +
xlab("Gender") +
ylab("BLimit(x1000 NT$)") +
scale_fill_brewer(palette = "Accent")
# Balance limits by education and gender
d2 <- ggplot(ccdata, aes(factor(educationH), (LIMIT_BAL/1000), fill=genderH)) +
geom_boxplot() +
xlab("Education") +
ylab("BLimit(x1000 NT$)") +
scale_fill_brewer(palette = "Paired")
# Balance limits by workstate and education
d3 <-ggplot(ccdata, aes(factor(educationH), (LIMIT_BAL/1000), fill=workstate)) +
geom_boxplot() +
xlab("Education") +
ylab("BLimit(x1000 NT$)")
grid.arrange(d1, d2, d3)
#Result: When we compare the balance limits with gender, education and work status, we saw that gender has no effects on balance limit determination process of bank while the education level is has a positive effect on this process. Also it can be seen that work status is a very important factor at balance limit determination.
ggplot(ccdata, aes(factor(maritalH), (LIMIT_BAL/1000), fill=genderH)) +
geom_boxplot() +
xlab("Marital Status") +
ylab("Balance Limit ( x 1000 NT$)") +
coord_cartesian(ylim = c(0,350)) +
scale_fill_brewer(palette = "Paired")
#Result: By this graph, we can see it again that, there is no change at females side such as balance limits depending on their marital status, however it changes a lot of things on males’ side starting with the expenditures which is the reason on increased balance limits.
ggplot(aes(x = ccdata$LIMIT_BAL/1000), data = ccdata) +
geom_histogram(aes(fill = ccdata$DEF_PAYMENT)) +
xlab("Balance Limit x 1000") +
ylab("Count") +
scale_fill_discrete(name="Default Payment Next Month",
breaks=c(0, 1),
labels=c("No", "Yes")) +
xlim(c(0,750)) +
facet_wrap(~educationH)
#Result: Balance limits and count of defaulted clients are almost same for University and Graduate Level. Additionally, the ratio of defaulted clients at high school level seems almost the same as the university and graduate levels.
d1 <- ggplot(ccdata, aes(x=DEF_PAYMENT)) +
geom_histogram(stat="count",color='red',fill='orange') +
xlab("Default Payment Status") + ylab("Customer Count") +
facet_wrap(~educationH)
d2 <- ggplot(ccdata, aes(x=DEF_PAYMENT),aes(y=stat_count(gender))) +
geom_bar(aes(fill=factor(ccdata$educationH))) +
xlab("Default Payment Status")+ylab("Customer Count") +
facet_wrap(~genderH)+
scale_fill_discrete(name="Education")
grid.arrange(d1, d2, ncol=1)
#Result: There is no effect of education level on default event occurrence.
ggplot(data = subset(ccdata,!is.na(AGE.bucket)), aes(factor(educationH), (LIMIT_BAL/1000), fill=AGE.bucket)) +
geom_boxplot() +
xlab("Education") +
ylab("Balance Limit ( x 1000 NT$)") +
coord_cartesian(ylim = c(0,500)) +
scale_fill_brewer(palette = "Accent")
#Result: Reflection of the education levels on determination of balance limits for clients is increasing by later ages when we compare the averages of each age buckets with each other grouped by their education levels.
ccdata$SEP2005<-ccdata$BILL_AMT1
ccdata$AUG2005<-ccdata$BILL_AMT2
ccdata$JUL2005<-ccdata$BILL_AMT3
ccdata$JUN2005<-ccdata$BILL_AMT4
ccdata$MAY2005<-ccdata$BILL_AMT5
ccdata$APR2005<-ccdata$BILL_AMT6
apr <- ggplot(aes(x=AGE,y=APR2005/1000),data=ccdata) +
xlab("Age") +
ylab("Amount of Bill in April") +
coord_cartesian(xlim = c(21,60),ylim = c(0,700))+
geom_jitter(alpha=0.3, color="orange") + geom_smooth(stat='summary', fun.y=mean)
may <- ggplot(aes(x=AGE,y=MAY2005/1000),data=ccdata) +
xlab("Age") +
ylab("Amount of Bill in May") +
coord_cartesian(xlim = c(21,60),ylim = c(0,700))+
geom_jitter(alpha=0.3, color="blue") + geom_smooth(stat='summary', fun.y=mean)
jun <- ggplot(aes(x=AGE,y=JUN2005/1000),data=ccdata) +
xlab("Age") +
ylab("Amount of Bill in June") +
coord_cartesian(xlim = c(21,60),ylim = c(0,700))+
geom_jitter(alpha=0.3, color="green") + geom_smooth(stat='summary', fun.y=mean)
jul <- ggplot(aes(x=AGE,y=JUL2005/1000),data=ccdata) +
xlab("Age") +
ylab("Amount of Bill in July") +
coord_cartesian(xlim = c(21,60),ylim = c(0,700))+
geom_jitter(alpha=0.3, color="orange") + geom_smooth(stat='summary', fun.y=mean)
aug <- ggplot(aes(x=AGE,y=AUG2005/1000),data=ccdata) +
xlab("Age") +
ylab("Amount of Bill in August") +
coord_cartesian(xlim = c(21,60),ylim = c(0,700))+
geom_jitter(alpha=0.3, color="blue") + geom_smooth(stat='summary', fun.y=mean)
sep <- ggplot(aes(x=AGE,y=SEP2005/1000),data=ccdata) +
xlab("Age") +
ylab("Amount of Bill in September") +
coord_cartesian(xlim = c(21,60),ylim = c(0,700))+
geom_jitter(alpha=0.3, color="green") + geom_smooth(stat='summary', fun.y=mean)
grid.arrange(apr,may,jun,jul,aug,sep,ncol=3)
#Result: Distribution of expenditures on ages doesn’t show any differentiation between months and the average expenditure by age is almost same for each month.
M <- cor(subset(ccdata, select = c(LIMIT_BAL,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6)))
corrplot(M, method="number")
#Result: When we reflect the correlations between limit balances, bill amounts and payments amounts; it presents us that there’s a low correlation between the limit balances and payments and bill amounts. Howevet it can be seen that bill amounts has high correlation between each other as expected since the bills a reflecting the cumulative amounts.
ggplot(aes(x=AGE,y=LIMIT_BAL/1000),data=subset(ccdata,!is.na(AGE.bucket)))+
xlab("Age") +
ylab("Balance Limit (x1000 NT$)") +
coord_cartesian(xlim = c(21,60),ylim = c(0,700))+
scale_color_brewer(palette = "Pastel1")+
geom_jitter(alpha=0.5, position = position_jitter(h=0), aes(color=AGE.bucket)) +
geom_smooth(stat='summary', fun.y=mean) +
geom_smooth(stat='summary', fun.y=quantile, fun.args = list(probs = 0.1), color = 'black', linetype=2) +
geom_smooth(stat='summary', fun.y=quantile, fun.args = list(probs = 0.5), color = 'red', linetype=2) +
geom_smooth(stat='summary', fun.y=quantile, fun.args = list(probs = 0.9), color = 'black', linetype=2)
#Result: When we plot the average balance limit per age bucket, we realized there are some limit levels given to clients such as 500,000 and 200,000 and these levels are belong to the probabilities of 95% and 50% respectively. So we can say that %95 of clients has balance limits equal or less than TL500k and %50 of clients has balance limits equal or less than TL200k.
# Additionally, when we investigate the blue line which represents the average limit balance of all clients, we think that we can interpret the spread between blue line and discrete red line as the bank prefers to give more balance limit than the average for the 50% probability.
# On the other hand, I can interpret the probabilities as the confidence intervals of exposures of bank to clients and I can say that bank implies that it can lose more than TL500k only by 5% if there is a client default occurs.
ggplot(aes(x=AGE,y=LIMIT_BAL/1000),data=subset(ccdata,!is.na(AGE.bucket)))+
xlab("Age") +
ylab("Balance Limit (x1000 NT$)") +
coord_cartesian(xlim = c(21,60),ylim = c(0,700))+
scale_color_brewer(palette = "Pastel1")+
geom_jitter(alpha=0.5, position = position_jitter(h=0), aes(color=AGE.bucket)) +
geom_smooth(stat='summary', fun.y=mean) +
geom_smooth(stat='summary', fun.y=quantile, fun.args = list(probs = 0.1), color = 'black', linetype=2) +
geom_smooth(stat='summary', fun.y=quantile, fun.args = list(probs = 0.5), color = 'red', linetype=2) +
geom_smooth(stat='summary', fun.y=quantile, fun.args = list(probs = 0.9), color = 'black', linetype=2) +
geom_smooth(stat='summary', fun.y=quantile, fun.args = list(probs = 0.95), color = 'black', linetype=2) +
facet_wrap(~DEF_PAYMENT)
#Result: Within the number of observations having a limit of 500,000 or less, it is not necessary to be more than 5% of the number of observations of the number of defaults when the default probability is related.
#Otherwise,
# a) A change in the default prediction model is required.
# b) A change in the bank's limit allocation model is required.
# We can also say that when defining a limit, a limit of over 50% is defined, it is one of the reasons why people fall into default payment.