85 R Bank Credit Card Risk Control Score Data Analysis

1 Initial environment preparation

Read data and preview

rm(list=ls())
#setwd("./case")
#install.packages("xlsx")
library(openxlsx)
dat<-read.xlsx("credit.xlsx",1)
View(dat)

insert image description here

2 Data preview and processing

Data preview, find outliers with a maximum value of 999, deviating from the average

class(dat)
#describe data
summary(dat)
sum(is.na(dat))

Outlier handling, fill with na

#Outlier filling
dat[,1:6]<-sapply(dat[,1:6],function(x) {x[x==999]<-NA;return(x)} )
nrow(dat)
ncol(dat)
summary(dat[,11])
#Understand data deletion invalid variables
dat<-dat[,-11]

Convert character variable character to factor variable factor
#Change string variable type to classification variable

dat1<-dat
sapply(dat1,class)
ch=names(which(sapply(dat1,is.character)))#find the character type variance
dat1[,ch]=as.data.frame(lapply(dat1[,ch], as.factor))

insert image description here
Observing the data, it is found that the number of family members and the number of children in the family are closely related, and there is multicollinearity, so the identification variable is generated to replace these two variables

dat1[,4]<-dat1[,4]-dat1[,3]

table(dat1[,4])
dat1[,4]<-factor(dat1[,4],levels=c(1,2),labels=c("其他","已婚"))

colnames(dat1)<-c("age","income","child","marital","dur_live",
                  "dur_work","housetype","nation","cardtype","loan")
summary(dat1)

insert image description here

3 Descriptive Statistics

Related package preparation

#install.packages("smbinning")
#install.packages("prettyR")
library(smbinning)
library(prettyR)

library(mvtnorm)
library(kernlab)

cap method

Outliers can be handled using the cap method, using the 1% and 99% quantiles to replace outliers

##Capping method function removes outliers and replaces outliers with 99% and 1% points respectively

block<-function(x,lower=T,upper=T){
  if(lower){
    q1<-quantile(x,0.01)
    x[x<=q1]<-q1
  }
  if(upper){
    q99<-quantile(x,0.99)
    x[x>q99]<-q99
  }
  return(x)
}

In the data set, 1 is non-default, 0 is default, and the reverse setting is performed, so that 1 becomes default and 0 is non-default.

#Odds ratio conversion for later IV calculation
dat1$loan<-as.numeric(!as.logical(dat1$loan))

Descriptive data classification statistics

The difference between defaulters and non-defaulters

#data classification ,discretization of continuous variables
##age
boxplot(age~loan,data=dat1,horizontal=T, frame=F, 
        col="lightgray",main="Distribution")
age<-smbinning(dat1,"loan","age")
age$ivtable

Age boxplot for default or not
insert image description here
IV plot after binning

age<-smbinning(dat1,"loan","age")
age$ivtable

insert image description here

After binning the age, check the percentage, weight, good, and bad rate. The following descriptive statistics are roughly the same. Qin Chu can see the difference between different age levels.

par(mfrow=c(2,2))
smbinning.plot(age,option="dist",sub="年龄")
smbinning.plot(age,option="WoE",sub="年龄")
smbinning.plot(age,option="goodrate",sub="年龄")
smbinning.plot(age,option="badrate",sub="年龄"

insert image description here
Add IV result to a vector

par(mfrow=c(1,1))
age$iv
#Add Iv value to vector
cred_iv<-c("年龄"=age$iv)

insert image description here
Regarding revenue, there are clearly outliers, using the cap method

##income

boxplot(income~loan,data=dat1,horizontal=T, frame=F, 
        col="lightgray",main="Distribution")
   

insert image description here
cap filling

dat1$income<-block(dat1$income)

After filling, it became normal

boxplot(income~loan,data=dat1,horizontal=T, frame=F, 
        col="lightgray",main="Distribution")
        ```

insert image description here

IV value measurement, same as above age

income<-smbinning(dat1,"loan","income")
income$ivtable
smbinning.plot(income,option="WoE",sub="收入")
income$iv
cred_iv<-c(cred_iv,"收入"=income$iv)

child statistics

##child
boxplot(child~loan,data=dat1,horizontal=T, frame=F, 
        col="lightgray",main="Distribution")


child<-smbinning(dat1,"loan","child")
child$ivtable
smbinning.plot(child,option="WoE",sub="孩子数量")
child$iv
cred_iv<-c(cred_iv,"孩子数量"=child$iv)

insert image description here

##marital
xtab(~marital+loan,data=dat1,chisq=T)
marital<-smbinning.factor(dat1,"loan","marital")
marital$ivtable
smbinning.plot(marital,option="WoE",sub="婚姻状态")
marital$iv
cred_iv<-c(cred_iv,"婚姻状态"=marital$iv)

insert image description here

##dur_live
boxplot(dur_live~loan,data=dat1,horizontal=T, 
        frame=F, col="lightgray",main="Distribution")
t.test(dur_live~loan,data=dat1)
dur_live<-smbinning(dat1,"loan","dur_live")
dur_live

It is observed that the dur_live variable has little difference to the default distribution. Using the t test, the same distribution of the two cannot be rejected.

insert image description here

insert image description here
dur_work variable statistics

##dur_work
boxplot(dur_work~loan,data=dat1,horizontal=T, 
        frame=F, col="lightgray",main="Distribution")
t.test(dur_work~loan,data=dat1)
dur_work<-smbinning(dat1,"loan","dur_work")
dur_work$ivtable
smbinning.plot(dur_work,option="WoE",sub="在现工作时间")
dur_work$iv
cred_iv<-c(cred_iv,"在现工作时间"=dur_work$iv)

insert image description here
housetype descriptive statistics

##housetype
xtab(~housetype+loan,data=dat1,chisq=T)
housetype<-smbinning.factor(dat1,"loan","housetype")
housetype$ivtable
smbinning.plot(housetype,option="WoE",sub="住房类型")
housetype$iv
cred_iv<-c(cred_iv,"住房种类"=housetype$iv)

insert image description here

##nation
xtab(~nation+loan,data=dat1,chisq=T)
nation<-smbinning.factor(dat1,"loan","nation")
nation$ivtable
smbinning.plot(nation,option="WoE",sub="国籍")
nation$iv
cred_iv<-c(cred_iv,"国籍"=nation$iv)

insert image description here
cardtype descriptive statistics

##cardtype
xtab(~cardtype+loan,data=dat1,chisq=T)
cardtype<-smbinning.factor(dat1,"loan","cardtype")
cardtype$ivtable
smbinning.plot(cardtype,option="WoE",sub="信用卡类型")
cardtype$iv
cred_iv<-c(cred_iv,"信用卡类型"=cardtype$iv)

insert image description here
Overall variable IV value degree


#Drawing shows the amount of information
barplot(cred_iv,main="各变量信息值")

insert image description here

4 attribute binning

#quantity after adding classification
dat2<-dat1
dat2<-smbinning.gen(dat2,age,"glage")
dat2<-smbinning.gen(dat2,income,"glincome")
dat2<-smbinning.gen(dat2,child,"glchild")
dat2<-smbinning.factor.gen(dat2,marital,"glmarital")
dat2<-smbinning.gen(dat2,dur_work,"gldur_work")
dat2<-smbinning.factor.gen(dat2,housetype,"glhousetype")
dat2<-smbinning.factor.gen(dat2,nation,"glnation")
dat2<-smbinning.factor.gen(dat2,cardtype,"glcardtype")
View(dat2)

insert image description here

dat3<-dat2[,c(11:18,10)]

View(dat3)

Generate binned data levels and convert user attributes into interval data
insert image description here

5 Logistic modeling

Specific scoring theory reference link: link .

model generation

#Creat logistic regression
cred_mod<-glm(loan~. ,data=dat3,family=binomial())
summary(cred_mod)

insert image description here

6 points system

According to the scoring formula, the highest and lowest credit scores are 797,362 points respectively.

#Scoring card system
cre_scal<-smbinning.scaling(cred_mod,pdo=45,score=800,odds=50)
cre_scal$logitscaled
cre_scal$minmaxscore

insert image description here
Box plot display of default or not

#Score each item
dat4<-smbinning.scoring.gen(smbscaled=cre_scal, dataset=dat3)
boxplot(Score~loan,data=dat4,horizontal=T, frame=F, 
        col="lightgray",main="Distribution")

insert image description here
Generate scoring metrics

#Standardized scoring table
scaledcard<-cre_scal$logitscaled[[1]][-1,c(1,2,6)]
scaledcard[,1]<-c(rep("年龄",5),rep("收入",3),rep("孩子数量",2),
                  rep("婚否",2),rep("在现工作时间",5),
                  rep("住房类型",3),rep("国籍",8),rep("信用卡类型",7))
scaledcard

7 Write to csv file

ncol(dat4)
dat5=dat4[,10:18]

#write the results
write.table(scaledcard,"card.csv",row.names = F)
write.table(dat4,"card.csv",row.names = F,append = T)
?write.csv

Output files to business personnel
insert image description here

Guess you like

Origin blog.csdn.net/weixin_44498127/article/details/124300356