[Python] Using Python to develop a credit score card

1. Import Python packages and data overview

# Import packages
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False
import seaborn as sns
import copy

from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
import warnings
warnings.filterwarnings('ignore')

# Load the data source (file path omitted in the original)
train_data = pd.read_csv('')
train_data.describe().T

f, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='SeriousDlqin2yrs', data=train_data)
plt.show()

badnum = train_data['SeriousDlqin2yrs'].sum()
goodnum = train_data['SeriousDlqin2yrs'].count() - train_data['SeriousDlqin2yrs'].sum()

print('In the training data, good customers: %i, bad customers: %i, share of bad customers: %.2f%%'
      % (goodnum, badnum, (badnum / train_data['SeriousDlqin2yrs'].count()) * 100))
# The class labels are extremely imbalanced; a balancing parameter (e.g. class_weight) will be needed later

2. Data EDA

2.1 Ratio of available credit line to total line

f,[ax1,ax2]=plt.subplots(1,2,figsize=(12,5)) 
sns.distplot(train_data['RevolvingUtilizationOfUnsecuredLines'],ax=ax1) 
sns.boxplot(y='RevolvingUtilizationOfUnsecuredLines',data=train_data,ax=ax2)    
plt.show()  

2.2 Age at the time of borrowing

f,[ax1,ax2]=plt.subplots(1,2,figsize=(12,5))    
sns.distplot(train_data['age'],ax=ax1)  
sns.boxplot(y='age',data=train_data,ax=ax2) 
plt.show()  
print(train_data['age'].describe())

2.3 Number of times 30-59 / 60-89 / 90+ days past due

f,[ax1,ax2,ax3] = plt.subplots(1,3,figsize=(12,5))  
sns.boxplot(y='NumberOfTime30-59DaysPastDueNotWorse',data=train_data,ax=ax1)    
sns.boxplot(y='NumberOfTime60-89DaysPastDueNotWorse',data=train_data,ax=ax2)    
sns.boxplot(y='NumberOfTimes90DaysLate',data=train_data,ax=ax3) 
plt.show()

2.4 Debt Ratio

f,[ax1,ax2] = plt.subplots(1,2,figsize=(12,5))  
sns.distplot(train_data['DebtRatio'],ax=ax1)    
sns.boxplot(y='DebtRatio',data=train_data,ax=ax2)   
plt.show()

2.5 Number of open credit lines and loans

f,[ax1,ax2] = plt.subplots(1,2,figsize=(12,5))  
sns.distplot(train_data['NumberOfOpenCreditLinesAndLoans'],ax=ax1)  
sns.boxplot(y='NumberOfOpenCreditLinesAndLoans',data=train_data,ax=ax2) 
plt.show()

2.6 Number of dependents

f,[ax1,ax2] = plt.subplots(1,2,figsize=(12,5))  
sns.kdeplot(train_data['NumberOfDependents'],ax=ax1)    
sns.boxplot(y='NumberOfDependents',data=train_data,ax=ax2)  
plt.show()

2.7 Number of real estate loans or lines

f,[ax1,ax2] = plt.subplots(1,2,figsize=(12,5))  
sns.distplot(train_data['NumberRealEstateLoansOrLines'],ax=ax1) 
sns.boxplot(y='NumberRealEstateLoansOrLines',data=train_data,ax=ax2)    
plt.show()

2.8 Monthly income

train_data.describe().T
# MonthlyIncome has a large share of missing values, so those rows cannot simply be dropped
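Before deciding how to handle them, it is worth counting the missing values per column; a minimal check using the train_data frame loaded above:

# Number and percentage of missing values per column
missing = train_data.isnull().sum()
print(missing[missing > 0])
print((missing[missing > 0] / len(train_data) * 100).round(2))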

3. Data processing

3.1 Missing value processing

# NumberOfDependents has only a few missing values; dropping those rows barely affects the model
train_data = train_data.dropna(subset=['NumberOfDependents'])
# MonthlyIncome has a high missing rate, so fill it with a random-forest regression
def set_missing(df):
    # Reorder the columns so that MonthlyIncome comes first (it is the regression target)
    process_df = df.iloc[:, [5, 0, 1, 2, 3, 4, 6, 7, 8, 9]]
    known = process_df[process_df['MonthlyIncome'].notnull()].values
    unknown = process_df[process_df['MonthlyIncome'].isnull()].values
    X = known[:, 1:]   # features of the rows where MonthlyIncome is known
    y = known[:, 0]    # the known MonthlyIncome values
    rfr = RandomForestRegressor(random_state=0, n_estimators=200, max_depth=3, n_jobs=-1)
    rfr.fit(X, y)
    predicted = rfr.predict(unknown[:, 1:]).round(0)
    print(predicted)
    # Write the predictions back into the rows with missing MonthlyIncome
    df.loc[df.MonthlyIncome.isnull(), 'MonthlyIncome'] = predicted
    return df
train_data = set_missing(train_data)
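A quick sanity check that the random-forest imputation filled every gap in MonthlyIncome:

# Confirm no MonthlyIncome values remain missing and inspect the imputed column
print(train_data['MonthlyIncome'].isnull().sum())
print(train_data['MonthlyIncome'].describe())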

3.2 Outlier processing

train_data=train_data[train_data['RevolvingUtilizationOfUnsecuredLines']<1] 
train_data=train_data[train_data['age']>18] 
train_data=train_data[train_data['NumberOfTime30-59DaysPastDueNotWorse']<80]    
train_data=train_data[train_data['NumberOfTime60-89DaysPastDueNotWorse']<80]    
train_data=train_data[train_data['NumberOfTimes90DaysLate']<80] 
train_data=train_data[train_data['NumberRealEstateLoansOrLines']<50]
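After these filters it is worth confirming how much data remains and that the extreme delinquency counts removed above are really gone; a small check:

# Remaining sample size and the maximum of each cleaned delinquency counter
print(train_data.shape)
print(train_data[['NumberOfTime30-59DaysPastDueNotWorse',
                  'NumberOfTime60-89DaysPastDueNotWorse',
                  'NumberOfTimes90DaysLate']].max())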

3.3 Check for multicollinearity

corr = train_data.corr()  # correlation matrix of all variables
xticks = ['x0','x1','x2','x3','x4','x5','x6','x7','x8','x9','x10']  # x-axis labels
yticks = list(corr.index)  # y-axis labels
fig = plt.figure()
ax1 = fig.add_subplot(1, 1, 1)
sns.heatmap(corr, annot=True, cmap='rainbow', ax=ax1, annot_kws={'size': 9, 'weight': 'bold', 'color': 'blue'})  # correlation heatmap
ax1.set_xticklabels(xticks, rotation=0, fontsize=10)
ax1.set_yticklabels(yticks, rotation=0, fontsize=10)
plt.show()
# Correlations between the variables are low, so no variables need to be removed
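The heatmap only checks pairwise correlations. If a more formal multicollinearity check is wanted, variance inflation factors can be computed with statsmodels (an optional sketch, not part of the original post; it assumes statsmodels is installed):

# VIF per feature; values well below 10 support the "low correlation" conclusion above
from statsmodels.stats.outliers_influence import variance_inflation_factor
features = train_data.drop(columns=['SeriousDlqin2yrs'])
vif = pd.Series([variance_inflation_factor(features.values, i) for i in range(features.shape[1])],
                index=features.columns)
print(vif.round(2))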

4. Feature processing

4.1 Binning & WOE encoding

# Define an automatic (monotonic) binning function ---------------------------
def mono_bin(Y, X, n=20):
    r = 0
    good = Y.sum()            # count of the positive class (SeriousDlqin2yrs == 1)
    bad = Y.count() - good    # count of the negative class
    # Reduce the number of quantile bins until the bucket means are monotonic in X
    while np.abs(r) < 1:
        d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.qcut(X, n, duplicates='drop')})
        d2 = d1.groupby('Bucket', as_index=True)
        r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
        n = n - 1
    d3 = pd.DataFrame(d2.X.min(), columns=['min'])
    d3['min'] = d2.min().X
    d3['max'] = d2.max().X
    d3['sum'] = d2.sum().Y
    d3['total'] = d2.count().Y
    d3['rate'] = d2.mean().Y
    d3['woe'] = np.log((d3['rate'] / (1 - d3['rate'])) / (good / bad))
    d3['goodattribute'] = d3['sum'] / good
    d3['badattribute'] = (d3['total'] - d3['sum']) / bad
    iv = ((d3['goodattribute'] - d3['badattribute']) * d3['woe']).sum()
    d4 = d3.sort_values(by='min')
    print("=" * 60)
    print(d4)
    # Rebuild the cut points from quantiles, padded with -inf / +inf
    cut = []
    cut.append(float('-inf'))
    for i in range(1, n + 1):
        qua = X.quantile(i / (n + 1))
        cut.append(round(qua, 4))
    cut.append(float('inf'))
    woe = list(d4['woe'].round(3))
    return d4, iv, cut, woe
    
# Optimal (monotonic) binning for the continuous variables
dfx1, ivx1, cutx1, woex1 = mono_bin(train_data.SeriousDlqin2yrs, train_data.RevolvingUtilizationOfUnsecuredLines, n=10)
dfx2, ivx2, cutx2, woex2 = mono_bin(train_data.SeriousDlqin2yrs, train_data.age, n=10)
dfx4, ivx4, cutx4, woex4 = mono_bin(train_data.SeriousDlqin2yrs, train_data.DebtRatio, n=20)
dfx5, ivx5, cutx5, woex5 = mono_bin(train_data.SeriousDlqin2yrs, train_data.MonthlyIncome, n=10)
​
# Custom binning function with user-supplied cut points -----------------------
def self_bin(Y,X,cut):      
    badnum=Y.sum()   
    goodnum=Y.count()-badnum   
    d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.cut(X, cut)})
    d2 = d1.groupby('Bucket', as_index = True)
    d3 = pd.DataFrame(d2.X.min(), columns = ['min'])    
    d3['min']=d2.min().X        
    d3['max'] = d2.max().X      
    d3['bad'] = d2.sum().Y      
    d3['total'] = d2.count().Y      
    d3['rate'] = d2.mean().Y    
    d3['woe']=np.log((d3['bad']/badnum)/((d3['total'] - d3['bad'])/goodnum))
    d3['badattr'] = d3['bad']/badnum 
    d3['goodattr'] = (d3['total'] - d3['bad'])/goodnum      
    iv = ((d3['badattr']-d3['goodattr'])*d3['woe']).sum()  
    d4 = (d3.sort_values(by = 'min')).reset_index(drop=True)   
    print('Binning result:')
    print(d4)
    print('IV value:')
    print(iv)
    woe = list(d4['woe'].round(3))
    return d4, iv, woe


ninf = float('-inf')  # negative infinity
pinf = float('inf')   # positive infinity
cutx3 = [ninf, 0, 1, 3, 5, pinf]    
cutx6 = [ninf, 1, 2, 3, 5, pinf]    
cutx7 = [ninf, 0, 1, 3, 5, pinf]    
cutx8 = [ninf, 0,1,2, 3, pinf]  
cutx9 = [ninf, 0, 1, 3, pinf]   
cutx10 = [ninf, 0, 1, 2, 3, 5, pinf]    
dfx3,ivx3,woex3 = self_bin(train_data.SeriousDlqin2yrs,train_data['NumberOfTime30-59DaysPastDueNotWorse'], cutx3)   
dfx6,ivx6 ,woex6= self_bin(train_data.SeriousDlqin2yrs, train_data['NumberOfOpenCreditLinesAndLoans'], cutx6)   
dfx7,ivx7,woex7 = self_bin(train_data.SeriousDlqin2yrs, train_data['NumberOfTimes90DaysLate'], cutx7)   
dfx8, ivx8,woex8 = self_bin(train_data.SeriousDlqin2yrs, train_data['NumberRealEstateLoansOrLines'], cutx8) 
dfx9, ivx9,woex9 = self_bin(train_data.SeriousDlqin2yrs, train_data['NumberOfTime60-89DaysPastDueNotWorse'], cutx9) 
dfx10,ivx10,woex10 = self_bin(train_data.SeriousDlqin2yrs, train_data['NumberOfDependents'], cutx10)
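Both binning functions use the same weight-of-evidence definition: for each bucket, WOE = ln( (bucket's share of all bad customers) / (bucket's share of all good customers) ), and IV sums (bad share - good share) * WOE over all buckets. A tiny illustration with made-up counts (not data from this project):

# Toy example: two buckets with invented bad/good counts
toy = pd.DataFrame({'bad': [30, 70], 'good': [700, 300]})
total_bad, total_good = toy['bad'].sum(), toy['good'].sum()
toy['woe'] = np.log((toy['bad'] / total_bad) / (toy['good'] / total_good))
toy['iv_part'] = (toy['bad'] / total_bad - toy['good'] / total_good) * toy['woe']
print(toy)
print('IV =', round(toy['iv_part'].sum(), 3))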

4.2 Screening variables by IV value

ivlist = [ivx1, ivx2, ivx3, ivx4, ivx5, ivx6, ivx7, ivx8, ivx9, ivx10]
index=['x1','x2','x3','x4','x5','x6','x7','x8','x9','x10']
f,ax1=plt.subplots(figsize=(10,5))
ss=pd.DataFrame({"X":index,"Y":ivlist})
sns.barplot(x="X",y="Y",data=ss)
plt.xlabel('var_name',fontsize=16)
plt.ylabel('iv',fontsize=16)
plt.show()
The IV values of DebtRatio (x4), MonthlyIncome (x5), NumberOfOpenCreditLinesAndLoans (x6), NumberRealEstateLoansOrLines (x8), and NumberOfDependents (x10) are below 0.1, so these variables are dropped.
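To make the screening explicit, the IV values can be paired with their labels and compared against the 0.1 cutoff; a short sketch reusing ivlist and index from above:

# Variables at or above the 0.1 IV threshold are kept
iv_series = pd.Series(ivlist, index=index)
print(iv_series.round(3).sort_values(ascending=False))
print('Kept variables:', list(iv_series[iv_series >= 0.1].index))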

5. Training the model

train_data=train_data.drop(['DebtRatio','MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans','NumberRealEstateLoansOrLines','NumberOfDependents'],axis=1) 
    
training,testing=train_test_split(train_data,test_size=0.3,random_state=1)  
Y=train_data['SeriousDlqin2yrs']  
X = train_data.iloc[:, -5:]

x_train = training.iloc[:, -5:]
y_train=training['SeriousDlqin2yrs']    
x_test=testing.iloc[:,-5:]  
y_test=testing['SeriousDlqin2yrs']  
clf = LogisticRegression()  
clf.fit(x_train, y_train)

score_proba = clf.predict_proba(x_test)
y_predproba=score_proba[:,1]    
coe = clf.coef_ 
print(coe)
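Section 1 noted that the labels are heavily imbalanced; one variant worth trying (not in the original code) is passing class_weight='balanced' to LogisticRegression and comparing the AUC:

# Re-weight the classes to compensate for the label imbalance
clf_balanced = LogisticRegression(class_weight='balanced')
clf_balanced.fit(x_train, y_train)
y_predproba_bal = clf_balanced.predict_proba(x_test)[:, 1]
fpr_b, tpr_b, _ = roc_curve(y_test, y_predproba_bal)
print('AUC with class_weight=balanced: %.3f' % auc(fpr_b, tpr_b))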

6. Model Evaluation

fpr, tpr, threshold = roc_curve(y_test, y_predproba)
auc_score = auc(fpr, tpr)
plt.figure(figsize=(8, 5))
plt.plot(fpr, tpr, 'b', label='AUC=%0.2f' % auc_score)
plt.legend(loc='lower right', fontsize=14)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.ylabel('TPR (true positive rate)', fontsize=16)
plt.xlabel('FPR (false positive rate)', fontsize=16)
plt.show()

fig, ax = plt.subplots()
ax.plot(1 - threshold, tpr, label='tpr')
ax.plot(1 - threshold, fpr, label='fpr')
ax.plot(1 - threshold, tpr - fpr, label='KS')
plt.xlabel('score')
plt.title('KS curve')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
legend = ax.legend(loc='upper left')
plt.show()
max(tpr - fpr)  # KS statistic
# AUC = 0.84, so the model performs reasonably well
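In scorecard work the Gini coefficient is often reported next to AUC and KS; it follows directly from the AUC already computed (a one-line follow-up):

# Gini = 2 * AUC - 1 for a binary classifier
print('KS = %.3f, Gini = %.3f' % (max(tpr - fpr), 2 * auc_score - 1))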

7. Output model score

p = 20 / np.log(2)                     # factor: 20 points to double the odds (PDO = 20)
q = 600 - 20 * np.log(20) / np.log(2)  # offset
x_coe = [-2.7340, 0.6526, 0.5201, 0.5581, 0.5943, 0.4329]  # regression coefficients (intercept first), hard-coded in the original post
baseScore = round(q + p * x_coe[0], 0)
# Total personal score = base score + the score of each component
def get_score(coe, woe, factor):
    scores = []
    for w in woe:
        score = round(coe * w * factor, 0)
        scores.append(score)
    return scores
# Score for each retained feature
x1_score = get_score(x_coe[1], woex1, p)
x2_score = get_score(x_coe[2], woex2, p)
x3_score = get_score(x_coe[3], woex3, p)
x7_score = get_score(x_coe[4], woex7, p)
x9_score = get_score(x_coe[5], woex9, p)
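The constants p and q implement the standard score scaling Score = offset + factor * ln(odds): p is the factor for 20 points to double the odds (PDO = 20), and q is the offset that anchors odds of 20 at a score of 600. A small arithmetic check of the base score (no new data, just the formula):

# factor = PDO / ln(2); offset = target_score - factor * ln(target_odds)
PDO, target_score, target_odds = 20, 600, 20
factor = PDO / np.log(2)
offset = target_score - factor * np.log(target_odds)
print(factor, offset)                        # should equal p and q above
print(round(offset + factor * x_coe[0], 0))  # should equal baseScore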
​
def compute_score(series, cut, score):
    # Map each value in `series` to the score of the bin it falls into
    scores = []
    i = 0
    while i < len(series):
        value = series.iloc[i]
        j = len(cut) - 2
        m = len(cut) - 2
        # Walk the cut points from the top down until the bin containing `value` is found
        while j >= 0:
            if value >= cut[j]:
                j = -1
            else:
                j -= 1
                m -= 1
        scores.append(score[m])
        i += 1
    return scores
    
# NOTE: test_data is not defined earlier in the post; here it is assumed to be the
# hold-out split from train_test_split (an assumption, not part of the original)
test_data = testing.copy()
test_data['BaseScore'] = np.zeros(len(test_data)) + baseScore
test_data['x1'] = compute_score(test_data['RevolvingUtilizationOfUnsecuredLines'], cutx1, x1_score)
test_data['x2'] = compute_score(test_data['age'], cutx2, x2_score)
test_data['x3'] = compute_score(test_data['NumberOfTime30-59DaysPastDueNotWorse'], cutx3, x3_score)
test_data['x7'] = compute_score(test_data['NumberOfTimes90DaysLate'], cutx7, x7_score)
test_data['x9'] = compute_score(test_data['NumberOfTime60-89DaysPastDueNotWorse'], cutx9, x9_score)
test_data['Score'] = test_data['x1'] + test_data['x2'] + test_data['x3'] + test_data['x7'] + test_data['x9'] + baseScore
    
scoretable2 = test_data.iloc[:, [0, -8, -7, -6, -5, -4, -3, -2, -1]]  # select the needed columns (the score columns)
print(scoretable2.head())
    
colNameDict={'x1': 'RevolvingUtilizationOfUnsecuredLines' ,'x2':'age','x3':'NumberOfTime30-59DaysPastDueNotWorse',  
             'x7':'NumberOfTimes90DaysLate', 'x9':'NumberOfTime60-89DaysPastDueNotWorse'}   
scoretable2=scoretable2.rename(columns=colNameDict,inplace=False)
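A quick look at the distribution of the resulting scores is a useful final check (an optional sketch using the Score column just created):

# Histogram and summary statistics of the final credit scores
plt.figure(figsize=(8, 5))
plt.hist(test_data['Score'], bins=30)
plt.xlabel('Score')
plt.ylabel('Count')
plt.show()
print(test_data['Score'].describe())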

Reprinted: [Python] Using Python to develop a credit score card - Zhihu (zhihu.com)

That concludes this introduction to using Python to develop a credit score card. More practical cases from "Python Financial Risk Control Score Card Model and Data Analysis (Enhanced Edition)" will be updated regularly for bank training and paper reference; remember to bookmark the course.


Origin blog.csdn.net/fulk6667g78o8/article/details/132305877