【Python】利用Python开发信用评分卡

1、导入Python包及数据概览

#导入包
import pandas as pd
import numpy as np
​
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline  
matplotlib.rcParams['font.sans-serif']=['SimHei'] 
matplotlib.rcParams['axes.unicode_minus']=False
import seaborn as sns   
import copy 
​
from scipy import stats 
from sklearn.ensemble import RandomForestRegressor  
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve,auc   
import warnings 
warnings.filterwarnings('ignore')
​
# Load the data source.
# NOTE(review): the CSV path is blank in the article; fill it in before running.
data = pd.read_csv('')
# Every later cell works on `train_data`, which was never assigned in the
# original; bind it to a copy so the raw `data` frame stays untouched.
train_data = data.copy()
data.describe().T

# Class distribution of the target label.
f, ax = plt.subplots(figsize=(10, 5))
# The column must be passed by keyword: seaborn >= 0.12 no longer accepts it
# positionally, while older releases accept `x=` as well.
sns.countplot(x='SeriousDlqin2yrs', data=train_data)
plt.show()

# The label is 0/1, so its sum is the number of bad (delinquent) customers.
badnum = train_data['SeriousDlqin2yrs'].sum()
goodnum = train_data['SeriousDlqin2yrs'].count() - badnum

print('训练集数据中,好客户数量为:%i,坏客户数量为:%i,坏客户所占比例为:%.2f%%' % (goodnum, badnum, (badnum / train_data['SeriousDlqin2yrs'].count()) * 100))
# The labels are extremely imbalanced; a class-balancing option is needed later.

2、数据EDA

2.1 信贷可用额度与总额度比

# Revolving utilisation: distribution on the left, box plot on the right.
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
sns.distplot(train_data['RevolvingUtilizationOfUnsecuredLines'], ax=ax1)
sns.boxplot(data=train_data, y='RevolvingUtilizationOfUnsecuredLines', ax=ax2)
plt.show()

2.2 借款时的年龄

# Age: distribution, box plot and summary statistics.
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
sns.distplot(train_data['age'], ax=ax1)
sns.boxplot(data=train_data, y='age', ax=ax2)
plt.show()
print(train_data['age'].describe())

2.3 逾期30-59天/ 60-89天 /90天笔数

# Box plots of the three past-due counters, side by side.
f, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(12, 5))
sns.boxplot(data=train_data, y='NumberOfTime30-59DaysPastDueNotWorse', ax=ax1)
sns.boxplot(data=train_data, y='NumberOfTime60-89DaysPastDueNotWorse', ax=ax2)
sns.boxplot(data=train_data, y='NumberOfTimes90DaysLate', ax=ax3)
plt.show()

2.4 负债比率

# Debt ratio: distribution and box plot.
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
sns.distplot(train_data['DebtRatio'], ax=ax1)
sns.boxplot(data=train_data, y='DebtRatio', ax=ax2)
plt.show()

2.5 信贷数量

# Number of open credit lines and loans: distribution and box plot.
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
sns.distplot(train_data['NumberOfOpenCreditLinesAndLoans'], ax=ax1)
sns.boxplot(data=train_data, y='NumberOfOpenCreditLinesAndLoans', ax=ax2)
plt.show()

2.6 家属数量

# Number of dependents: kernel density estimate and box plot.
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
sns.kdeplot(train_data['NumberOfDependents'], ax=ax1)
sns.boxplot(data=train_data, y='NumberOfDependents', ax=ax2)
plt.show()

2.7 不动产贷款或额度数量

# Number of real-estate loans or lines: distribution and box plot.
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
sns.distplot(train_data['NumberRealEstateLoansOrLines'], ax=ax1)
sns.boxplot(data=train_data, y='NumberRealEstateLoansOrLines', ax=ax2)
plt.show()

2.8 月收入

# Summary statistics, one row per variable.
train_data.describe().transpose()
# MonthlyIncome has many missing values, so those rows cannot simply be dropped.

3、数据处理

3.1 缺失值处理

# NumberOfDependents has few missing values, so those rows are dropped; this
# should not affect the model much.
# Restrict the drop to that column: a bare dropna() would also discard every
# row with a missing MonthlyIncome, leaving nothing for the imputation below.
train_data = train_data.dropna(subset=['NumberOfDependents'])
# MonthlyIncome has a high missing rate, so it is imputed with a random forest.
def set_missing(df):
    """Fill missing ``MonthlyIncome`` values with random-forest predictions.

    The columns at positions 0-4 and 6-9 of ``df`` are used as predictors and
    the column at position 5 (expected to be ``MonthlyIncome``) as the target.
    The regressor is fitted on the rows where the income is known and used to
    predict the rows where it is missing; predictions are rounded to integers.

    Args:
        df: DataFrame with at least 10 columns, ``MonthlyIncome`` at position 5.

    Returns:
        The same ``df`` object, modified in place. It is returned unchanged
        when no income value is missing.
    """
    # Put the target first so it can be split off as column 0 below.
    process_df = df.iloc[:, [5, 0, 1, 2, 3, 4, 6, 7, 8, 9]]
    missing = process_df['MonthlyIncome'].isnull()
    if not missing.any():
        # Nothing to impute; predicting on zero rows would raise.
        return df
    known = process_df[~missing].values
    unknown = process_df[missing].values
    X = known[:, 1:]
    # The target is column 0 (MonthlyIncome); the original read column 1,
    # which is one of the predictors.
    y = known[:, 0]
    rfr = RandomForestRegressor(random_state=0, n_estimators=200, max_depth=3, n_jobs=-1)
    rfr.fit(X, y)
    predicted = rfr.predict(unknown[:, 1:]).round(0)
    print(predicted)

    df.loc[missing, 'MonthlyIncome'] = predicted
    return df
# Replace train_data with the frame returned by set_missing (MonthlyIncome imputed).
train_data = set_missing(train_data)

3.2 异常值处理

# Keep only the rows that pass every outlier rule at once.
outlier_free = (
    (train_data['RevolvingUtilizationOfUnsecuredLines'] < 1)
    & (train_data['age'] > 18)
    & (train_data['NumberOfTime30-59DaysPastDueNotWorse'] < 80)
    & (train_data['NumberOfTime60-89DaysPastDueNotWorse'] < 80)
    & (train_data['NumberOfTimes90DaysLate'] < 80)
    & (train_data['NumberRealEstateLoansOrLines'] < 50)
)
train_data = train_data[outlier_free]

3.3 校验多重共线性

# Pairwise correlation coefficients between all variables.
corr = train_data.corr()
# Short x-axis labels x0..x10; the y axis keeps the full column names.
xticks = ['x%d' % position for position in range(11)]
yticks = corr.index.tolist()
fig = plt.figure()
ax1 = fig.add_subplot(1, 1, 1)
# Draw the correlation heat map with the coefficient printed in every cell.
annotation_style = {'size': 9, 'weight': 'bold', 'color': 'blue'}
sns.heatmap(corr, annot=True, cmap='rainbow', ax=ax1, annot_kws=annotation_style)
ax1.set_xticklabels(xticks, rotation=0, fontsize=10)
ax1.set_yticklabels(yticks, rotation=0, fontsize=10)
plt.show()
# The correlations between variables are weak, so no variable is removed.

4、特征加工

4.1 分箱&woe编码

# Automatic (monotonic) binning -------------------------------------------------
def mono_bin(Y, X, n=20):
    """Bin ``X`` into quantile buckets whose mean ``Y`` is monotonic in ``X``.

    Starting from ``n`` quantile buckets, the bucket count is reduced by one
    until the Spearman correlation between the per-bucket mean of ``X`` and the
    per-bucket mean of ``Y`` reaches +/-1 (or cannot be computed), or until a
    single bucket is left.

    Args:
        Y: binary 0/1 Series; ``Y.sum()`` is used as the event count.
        X: numeric Series to bin (same index as ``Y``).
        n: initial number of quantile buckets.

    Returns:
        A tuple ``(d4, iv, cut, woe)``:
        ``d4`` is the per-bucket table sorted by ``min`` with columns ``min``,
        ``max``, ``sum`` (events), ``total``, ``rate``, ``woe``,
        ``goodattribute`` (bucket share of all ``Y == 1`` rows) and
        ``badattribute`` (bucket share of all ``Y == 0`` rows);
        ``iv`` is the information value; ``cut`` is the list of bucket edges
        with ``-inf``/``inf`` as outer bounds; ``woe`` is the list of WOE
        values rounded to 3 decimals.
    """
    event_total = Y.sum()
    nonevent_total = Y.count() - event_total
    while True:
        # retbins gives the edges actually used, also after duplicate edges
        # have been dropped, so `cut` below always matches the buckets.
        buckets, edges = pd.qcut(X, n, duplicates='drop', retbins=True)
        d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": buckets})
        d2 = d1.groupby('Bucket', as_index=True)
        if len(edges) <= 2:
            # A single bucket is left; it cannot be coarsened any further.
            break
        means = d2.mean()
        r, _ = stats.spearmanr(means.X, means.Y)
        # Stop on a perfectly monotonic relation (with a float tolerance) or
        # when r is NaN, which the original `abs(r) < 1` test also ended on.
        if not np.abs(r) < 1 - 1e-12:
            break
        n = n - 1
    d3 = pd.DataFrame({
        'min': d2['X'].min(),
        'max': d2['X'].max(),
        'sum': d2['Y'].sum(),
        'total': d2['Y'].count(),
        'rate': d2['Y'].mean(),
    })
    d3['woe'] = np.log((d3['rate'] / (1 - d3['rate'])) / (event_total / nonevent_total))
    d3['goodattribute'] = d3['sum'] / event_total
    d3['badattribute'] = (d3['total'] - d3['sum']) / nonevent_total
    iv = ((d3['goodattribute'] - d3['badattribute']) * d3['woe']).sum()
    # sort_index(by=...) was removed from pandas; sort_values is the replacement.
    d4 = d3.sort_values(by='min')
    print("=" * 60)
    print(d4)
    # Interior edges only: the outer bounds are opened to +/- infinity.
    cut = [float('-inf')]
    cut.extend(round(float(edge), 4) for edge in edges[1:-1])
    cut.append(float('inf'))
    woe = list(d4['woe'].round(3))
    return d4, iv, cut, woe
    
    # 最优分箱
# Monotonic binning of the continuous variables.
# x1 must come from the cleaned train_data like the label; the original read
# it from the raw `data` frame, whose rows no longer match after filtering.
dfx1, ivx1, cutx1, woex1 = mono_bin(train_data.SeriousDlqin2yrs, train_data.RevolvingUtilizationOfUnsecuredLines, n=10)
dfx2, ivx2, cutx2, woex2 = mono_bin(train_data.SeriousDlqin2yrs, train_data.age, n=10)
dfx4, ivx4, cutx4, woex4 = mono_bin(train_data.SeriousDlqin2yrs, train_data.DebtRatio, n=20)
dfx5, ivx5, cutx5, woex5 = mono_bin(train_data.SeriousDlqin2yrs, train_data.MonthlyIncome, n=10)

# Manual binning with caller-supplied cut points --------------------------------
def self_bin(Y,X,cut):
    """Bin ``X`` on the given ``cut`` edges and compute WOE and IV per bin.

    ``Y`` is a 0/1 Series whose sum is taken as the number of bad samples.
    Prints the bin table and the IV, then returns ``(d4, iv, woe)`` where
    ``d4`` is the bin table sorted by ``min`` with a fresh integer index and
    ``woe`` is the list of WOE values rounded to 3 decimals.
    """
    bad_total = Y.sum()
    good_total = Y.count() - bad_total
    frame = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.cut(X, cut)})
    grouped = frame.groupby('Bucket', as_index = True)
    summary = pd.DataFrame({
        'min': grouped['X'].min(),
        'max': grouped['X'].max(),
        'bad': grouped['Y'].sum(),
        'total': grouped['Y'].count(),
        'rate': grouped['Y'].mean(),
    })
    good_in_bin = summary['total'] - summary['bad']
    summary['woe'] = np.log((summary['bad'] / bad_total) / (good_in_bin / good_total))
    summary['badattr'] = summary['bad'] / bad_total
    summary['goodattr'] = good_in_bin / good_total
    iv = ((summary['badattr'] - summary['goodattr']) * summary['woe']).sum()
    d4 = summary.sort_values(by = 'min').reset_index(drop=True)
    print('分箱结果:')
    print(d4)
    print('IV值为:')
    print(iv)
    woe = list(d4['woe'].round(3))
    return d4, iv, woe
​
​
# Open-ended outer bounds for the hand-picked cut points.
ninf = float('-inf')  # negative infinity
pinf = float('inf')  # positive infinity
cutx3 = [ninf, 0, 1, 3, 5, pinf]
cutx6 = [ninf, 1, 2, 3, 5, pinf]
cutx7 = [ninf, 0, 1, 3, 5, pinf]
cutx8 = [ninf, 0, 1, 2, 3, pinf]
cutx9 = [ninf, 0, 1, 3, pinf]
cutx10 = [ninf, 0, 1, 2, 3, 5, pinf]
# Bin the discrete count variables on those cut points.
dfx3, ivx3, woex3 = self_bin(
    train_data.SeriousDlqin2yrs, train_data['NumberOfTime30-59DaysPastDueNotWorse'], cutx3)
dfx6, ivx6, woex6 = self_bin(
    train_data.SeriousDlqin2yrs, train_data['NumberOfOpenCreditLinesAndLoans'], cutx6)
dfx7, ivx7, woex7 = self_bin(
    train_data.SeriousDlqin2yrs, train_data['NumberOfTimes90DaysLate'], cutx7)
dfx8, ivx8, woex8 = self_bin(
    train_data.SeriousDlqin2yrs, train_data['NumberRealEstateLoansOrLines'], cutx8)
dfx9, ivx9, woex9 = self_bin(
    train_data.SeriousDlqin2yrs, train_data['NumberOfTime60-89DaysPastDueNotWorse'], cutx9)
dfx10, ivx10, woex10 = self_bin(
    train_data.SeriousDlqin2yrs, train_data['NumberOfDependents'], cutx10)

4.2 IV值筛选变量

# Information value of every variable, in x1..x10 order.
# The binning cells bind these values as ivx1..ivx10; the x1_iv-style names
# used originally are never defined.
ivlist = [ivx1, ivx2, ivx3, ivx4, ivx5, ivx6, ivx7, ivx8, ivx9, ivx10]
index = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10']
f, ax1 = plt.subplots(figsize=(10, 5))
ss = pd.DataFrame({"X": index, "Y": ivlist})
sns.barplot(x="X", y="Y", data=ss)
plt.xlabel('var_name', fontsize=16)
plt.ylabel('iv', fontsize=16)
plt.show()
DebtRatio (x4)、MonthlyIncome(x5)、NumberOfOpenCreditLinesAndLoans(x6)、NumberRealEstateLoansOrLines(x8)和NumberOfDependents(x10)变量的IV值低于0.1,故删除。

5、训练模型

# Drop the variables rejected by the IV screening.
train_data = train_data.drop(['DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberRealEstateLoansOrLines', 'NumberOfDependents'], axis=1)

# 70/30 train/test split with a fixed seed.
training, testing = train_test_split(train_data, test_size=0.3, random_state=1)
Y = train_data['SeriousDlqin2yrs']
X = train_data.iloc[:, -5:]

# The last five columns are the features; the label is selected by name.
x_train = training.iloc[:, -5:]
y_train = training['SeriousDlqin2yrs']
x_test = testing.iloc[:, -5:]
y_test = testing['SeriousDlqin2yrs']
# NOTE(review): the model is fitted on the raw feature values, while the
# scorecard section multiplies coefficients by WOE values; confirm whether a
# WOE-encoding step is missing here.
# NOTE(review): the label is imbalanced but no class weighting is set; confirm.
clf = LogisticRegression()
clf.fit(x_train, y_train)

# Column 1 of predict_proba is the probability of the positive class.
score_proba = clf.predict_proba(x_test)
y_predproba = score_proba[:, 1]
coe = clf.coef_
print(coe)

6、模型评估

# ROC curve and AUC on the hold-out set.
fpr, tpr, threshold = roc_curve(y_test, y_predproba)
auc_score = auc(fpr, tpr)
plt.figure(figsize=(8, 5))
plt.plot(fpr, tpr, 'b', label='AUC=%0.2f' % auc_score)
plt.legend(loc='lower right', fontsize=14)
plt.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
# Call the limit setters; assigning to plt.xlim / plt.ylim (as the original
# did) replaces the pyplot functions and leaves the axis limits unchanged.
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.ylabel('TPR-真正率', fontsize=16)
plt.xlabel('FPR-假正率', fontsize=16)
plt.show()

# KS curve: TPR, FPR and their gap plotted against 1 - threshold.
fig, ax = plt.subplots()
ax.plot(1 - threshold, tpr, label='tpr')
ax.plot(1 - threshold, fpr, label='fpr')
ax.plot(1 - threshold, tpr - fpr, label='KS')
ax.set_xlabel('score')
ax.set_title('KS curve')
# Set the limits on the axes object. The original assigned to plt.xlim /
# plt.ylim, which overwrote the pyplot functions instead of setting limits.
ax.set_xlim(0.0, 1.0)
ax.set_ylim(0.0, 1.0)
# The original called plt.figure(figsize=(20,20)) here, which only opened an
# extra empty figure; a figure size has to be given when the figure is created.
legend = ax.legend(loc='upper left')
plt.show()
# KS statistic: the largest gap between TPR and FPR.
max(tpr - fpr)
# AUC = 0.84, so the model performs reasonably well.

7、输出模型分

# Scorecard scaling constants: q equals 600 - p*ln(20).
# NOTE(review): presumably 20 points double the odds and 600 points correspond
# to odds of 20 -- confirm against the intended scorecard specification.
p=20/np.log(2)# scaling factor 
q=600-20*np.log(20)/np.log(2)# the offset  
# Hard-coded values: entry 0 is used as the intercept for baseScore below,
# entries 1-5 as per-variable coefficients in the get_score calls.
# NOTE(review): these are not read from clf.coef_ -- confirm their origin.
x_coe=[-2.7340,0.6526,0.5201,0.5581,0.5943,0.4329]# regression coefficients (marked "???" in the original)    
baseScore=round(q+p*x_coe[0],0) 
# Total personal score = base score + the score of every part.
def get_score(coe,woe,factor):
    """Return ``coe * w * factor`` rounded to 0 decimals for every ``w`` in ``woe``."""
    return [round(coe * bin_woe * factor, 0) for bin_woe in woe]
# Score of every bin for each retained variable.
# The WOE lists are bound as woex1 / woex2 by the binning cells; the
# x1_woe / x2_woe names used originally are never defined.
x1_score = get_score(x_coe[1], woex1, p)
x2_score = get_score(x_coe[2], woex2, p)
x3_score = get_score(x_coe[3], woex3, p)
x7_score = get_score(x_coe[4], woex7, p)
x9_score = get_score(x_coe[5], woex9, p)

def compute_score(series,cut,score):
    """Map every value in ``series`` to the score of the bin it falls into.

    Bins are right-closed, ``(cut[k], cut[k + 1]]``, matching the intervals
    that ``pd.cut`` / ``pd.qcut`` produce by default, so a value equal to an
    edge belongs to the bin on its left. The original compared with ``>=``,
    which pushed such values (for example 0 with an edge at 0) into the next
    bin and gave them the wrong score.

    Args:
        series: numeric values to score (Series, list or array).
        cut: ascending bin edges, normally starting at ``-inf`` and ending
            at ``inf``.
        score: one score per bin, i.e. ``len(cut) - 1`` entries.

    Returns:
        A list with one score per input value. Values outside the edges, and
        NaN, are clamped to the first or last bin.
    """
    edges = np.asarray(cut, dtype=float)
    values = np.asarray(series, dtype=float)
    # side='left' returns i with edges[i - 1] < value <= edges[i]; the bin
    # index is therefore i - 1.
    positions = np.searchsorted(edges, values, side='left') - 1
    positions = np.clip(positions, 0, len(score) - 1)
    return [score[position] for position in positions]
    
# NOTE(review): test_data is not created anywhere in the visible code; it has
# to be loaded and cleaned like train_data before this cell runs.
test_data['BaseScore'] = np.zeros(len(test_data)) + baseScore
# The cut points are bound as cutx1 / cutx2 by the binning cells; the
# x1_cut / x2_cut names used originally are never defined.
test_data['x1'] = compute_score(test_data['RevolvingUtilizationOfUnsecuredLines'], cutx1, x1_score)
test_data['x2'] = compute_score(test_data['age'], cutx2, x2_score)
test_data['x3'] = compute_score(test_data['NumberOfTime30-59DaysPastDueNotWorse'], cutx3, x3_score)
test_data['x7'] = compute_score(test_data['NumberOfTimes90DaysLate'], cutx7, x7_score)
test_data['x9'] = compute_score(test_data['NumberOfTime60-89DaysPastDueNotWorse'], cutx9, x9_score)
# Total score = sum of the per-variable scores plus the base score.
test_data['Score'] = test_data['x1'] + test_data['x2'] + test_data['x3'] + test_data['x7'] + test_data['x9'] + baseScore

# Keep the first column and the last eight columns (these include the score columns).
scoretable2 = test_data.iloc[:, [0, -8, -7, -6, -5, -4, -3, -2, -1]]
print(scoretable2.head())

# Give the per-variable score columns their full variable names.
colNameDict = {'x1': 'RevolvingUtilizationOfUnsecuredLines', 'x2': 'age', 'x3': 'NumberOfTime30-59DaysPastDueNotWorse',
               'x7': 'NumberOfTimes90DaysLate', 'x9': 'NumberOfTime60-89DaysPastDueNotWorse'}
scoretable2 = scoretable2.rename(columns=colNameDict, inplace=False)

转载:【Python】利用Python开发信用评分卡 - 知乎 (zhihu.com)

【Python】利用Python开发信用评分卡就为大家介绍到这里,《python金融风控评分卡模型和数据分析(加强版)》更多相关实战案例会定期更新,用于银行培训,论文参考,大家记得收藏课程。

猜你喜欢

转载自blog.csdn.net/fulk6667g78o8/article/details/132305877