R bank credit card risk control score data analysis
1 Initial environment preparation
Read data and preview
rm(list=ls())
#setwd("./case")
#install.packages("xlsx")
library(openxlsx)
dat<-read.xlsx("credit.xlsx",1)
View(dat)
2 Data preview and processing
Data preview, find outliers with a maximum value of 999, deviating from the average
class(dat)
#describe data
summary(dat)
sum(is.na(dat))
Outlier handling, fill with na
#Outlier filling
dat[,1:6]<-sapply(dat[,1:6],function(x) {x[x==999]<-NA;return(x)} )
nrow(dat)
ncol(dat)
summary(dat[,11])
#Understand data deletion invalid variables
dat<-dat[,-11]
Convert character variable character to factor variable factor
#Change string variable type to classification variable
dat1<-dat
sapply(dat1,class)
ch=names(which(sapply(dat1,is.character)))#find the character type variance
dat1[,ch]=as.data.frame(lapply(dat1[,ch], as.factor))
Observing the data, it is found that the number of family members and the number of children in the family are closely related, and there is multicollinearity, so the identification variable is generated to replace these two variables
dat1[,4]<-dat1[,4]-dat1[,3]
table(dat1[,4])
dat1[,4]<-factor(dat1[,4],levels=c(1,2),labels=c("其他","已婚"))
colnames(dat1)<-c("age","income","child","marital","dur_live",
"dur_work","housetype","nation","cardtype","loan")
summary(dat1)
3 Descriptive Statistics
Related package preparation
#install.packages("smbinning")
#install.packages("prettyR")
library(smbinning)
library(prettyR)
library(mvtnorm)
library(kernlab)
cap method
Outliers can be handled using the cap method, using the 1% and 99% quantiles to replace outliers
##Capping method function removes outliers and replaces outliers with 99% and 1% points respectively
block<-function(x,lower=T,upper=T){
if(lower){
q1<-quantile(x,0.01)
x[x<=q1]<-q1
}
if(upper){
q99<-quantile(x,0.99)
x[x>q99]<-q99
}
return(x)
}
In the data set, 1 is non-default, 0 is default, and the reverse setting is performed, so that 1 becomes default and 0 is non-default.
#Odds ratio conversion for later IV calculation
dat1$loan<-as.numeric(!as.logical(dat1$loan))
Descriptive data classification statistics
The difference between defaulters and non-defaulters
#data classification ,discretization of continuous variables
##age
boxplot(age~loan,data=dat1,horizontal=T, frame=F,
col="lightgray",main="Distribution")
age<-smbinning(dat1,"loan","age")
age$ivtable
Age boxplot for default or not
IV plot after binning
age<-smbinning(dat1,"loan","age")
age$ivtable
After binning the age, check the percentage, weight, good, and bad rate. The following descriptive statistics are roughly the same. Qin Chu can see the difference between different age levels.
par(mfrow=c(2,2))
smbinning.plot(age,option="dist",sub="年龄")
smbinning.plot(age,option="WoE",sub="年龄")
smbinning.plot(age,option="goodrate",sub="年龄")
smbinning.plot(age,option="badrate",sub="年龄"
Add IV result to a vector
par(mfrow=c(1,1))
age$iv
#Add Iv value to vector
cred_iv<-c("年龄"=age$iv)
Regarding revenue, there are clearly outliers, using the cap method
##income
boxplot(income~loan,data=dat1,horizontal=T, frame=F,
col="lightgray",main="Distribution")
cap filling
dat1$income<-block(dat1$income)
After filling, it became normal
boxplot(income~loan,data=dat1,horizontal=T, frame=F,
col="lightgray",main="Distribution")
```
IV value measurement, same as above age
income<-smbinning(dat1,"loan","income")
income$ivtable
smbinning.plot(income,option="WoE",sub="收入")
income$iv
cred_iv<-c(cred_iv,"收入"=income$iv)
child statistics
##child
boxplot(child~loan,data=dat1,horizontal=T, frame=F,
col="lightgray",main="Distribution")
child<-smbinning(dat1,"loan","child")
child$ivtable
smbinning.plot(child,option="WoE",sub="孩子数量")
child$iv
cred_iv<-c(cred_iv,"孩子数量"=child$iv)
##marital
xtab(~marital+loan,data=dat1,chisq=T)
marital<-smbinning.factor(dat1,"loan","marital")
marital$ivtable
smbinning.plot(marital,option="WoE",sub="婚姻状态")
marital$iv
cred_iv<-c(cred_iv,"婚姻状态"=marital$iv)
##dur_live
boxplot(dur_live~loan,data=dat1,horizontal=T,
frame=F, col="lightgray",main="Distribution")
t.test(dur_live~loan,data=dat1)
dur_live<-smbinning(dat1,"loan","dur_live")
dur_live
It is observed that the dur_live variable has little difference to the default distribution. Using the t test, the same distribution of the two cannot be rejected.
dur_work variable statistics
##dur_work
boxplot(dur_work~loan,data=dat1,horizontal=T,
frame=F, col="lightgray",main="Distribution")
t.test(dur_work~loan,data=dat1)
dur_work<-smbinning(dat1,"loan","dur_work")
dur_work$ivtable
smbinning.plot(dur_work,option="WoE",sub="在现工作时间")
dur_work$iv
cred_iv<-c(cred_iv,"在现工作时间"=dur_work$iv)
housetype descriptive statistics
##housetype
xtab(~housetype+loan,data=dat1,chisq=T)
housetype<-smbinning.factor(dat1,"loan","housetype")
housetype$ivtable
smbinning.plot(housetype,option="WoE",sub="住房类型")
housetype$iv
cred_iv<-c(cred_iv,"住房种类"=housetype$iv)
##nation
xtab(~nation+loan,data=dat1,chisq=T)
nation<-smbinning.factor(dat1,"loan","nation")
nation$ivtable
smbinning.plot(nation,option="WoE",sub="国籍")
nation$iv
cred_iv<-c(cred_iv,"国籍"=nation$iv)
cardtype descriptive statistics
##cardtype
xtab(~cardtype+loan,data=dat1,chisq=T)
cardtype<-smbinning.factor(dat1,"loan","cardtype")
cardtype$ivtable
smbinning.plot(cardtype,option="WoE",sub="信用卡类型")
cardtype$iv
cred_iv<-c(cred_iv,"信用卡类型"=cardtype$iv)
Overall variable IV value degree
#Drawing shows the amount of information
barplot(cred_iv,main="各变量信息值")
4 attribute binning
#quantity after adding classification
dat2<-dat1
dat2<-smbinning.gen(dat2,age,"glage")
dat2<-smbinning.gen(dat2,income,"glincome")
dat2<-smbinning.gen(dat2,child,"glchild")
dat2<-smbinning.factor.gen(dat2,marital,"glmarital")
dat2<-smbinning.gen(dat2,dur_work,"gldur_work")
dat2<-smbinning.factor.gen(dat2,housetype,"glhousetype")
dat2<-smbinning.factor.gen(dat2,nation,"glnation")
dat2<-smbinning.factor.gen(dat2,cardtype,"glcardtype")
View(dat2)
dat3<-dat2[,c(11:18,10)]
View(dat3)
Generate binned data levels and convert user attributes into interval data
5 Logistic modeling
Specific scoring theory reference link: link .
model generation
#Creat logistic regression
cred_mod<-glm(loan~. ,data=dat3,family=binomial())
summary(cred_mod)
6 points system
According to the scoring formula, the highest and lowest credit scores are 797,362 points respectively.
#Scoring card system
cre_scal<-smbinning.scaling(cred_mod,pdo=45,score=800,odds=50)
cre_scal$logitscaled
cre_scal$minmaxscore
Box plot display of default or not
#Score each item
dat4<-smbinning.scoring.gen(smbscaled=cre_scal, dataset=dat3)
boxplot(Score~loan,data=dat4,horizontal=T, frame=F,
col="lightgray",main="Distribution")
Generate scoring metrics
#Standardized scoring table
scaledcard<-cre_scal$logitscaled[[1]][-1,c(1,2,6)]
scaledcard[,1]<-c(rep("年龄",5),rep("收入",3),rep("孩子数量",2),
rep("婚否",2),rep("在现工作时间",5),
rep("住房类型",3),rep("国籍",8),rep("信用卡类型",7))
scaledcard
7 Write to csv file
ncol(dat4)
dat5=dat4[,10:18]
#write the results
write.table(scaledcard,"card.csv",row.names = F)
write.table(dat4,"card.csv",row.names = F,append = T)
?write.csv
Output files to business personnel