1、导入Python包及数据概览
#导入包
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
matplotlib.rcParams['font.sans-serif']=['SimHei']
matplotlib.rcParams['axes.unicode_minus']=False
import seaborn as sns
import copy
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve,auc
import warnings
warnings.filterwarnings('ignore')
# Load the data source.
# NOTE(review): the CSV path is an empty string here; point it at the real
# training file before running.
data = pd.read_csv('')
# Everything below works on `train_data`, which was never assigned. Use an
# independent copy so later filtering does not modify `data`.
train_data = data.copy()
data.describe().T
# Bar chart of the target label counts.
f, ax = plt.subplots(figsize=(10, 5))
# The column is passed by keyword: current seaborn no longer accepts it as the
# first positional argument.
sns.countplot(x='SeriousDlqin2yrs', data=train_data)
plt.show()
badnum = train_data['SeriousDlqin2yrs'].sum()
goodnum = train_data['SeriousDlqin2yrs'].count() - train_data['SeriousDlqin2yrs'].sum()
print('训练集数据中,好客户数量为:%i,坏客户数量为:%i,坏客户所占比例为:%.2f%%' %(goodnum,badnum,(badnum/train_data['SeriousDlqin2yrs'].count())*100))
# The labels are extremely imbalanced; a class-balancing option is needed later.
2、数据EDA
2.1 信贷可用额度与总额度比
# Distribution (left) and box plot (right) of the revolving utilization ratio.
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
sns.distplot(train_data['RevolvingUtilizationOfUnsecuredLines'], ax=ax1)
sns.boxplot(data=train_data, y='RevolvingUtilizationOfUnsecuredLines', ax=ax2)
plt.show()
2.2 借款时的年龄
# Distribution (left) and box plot (right) of the borrower's age, followed by
# its summary statistics.
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
sns.distplot(train_data['age'], ax=ax1)
sns.boxplot(data=train_data, y='age', ax=ax2)
plt.show()
print(train_data['age'].describe())
2.3 逾期30-59天/ 60-89天 /90天笔数
# One box plot per past-due counter: 30-59 days, 60-89 days, 90+ days.
f, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(12, 5))
past_due_columns = (
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
)
for axis, column in zip((ax1, ax2, ax3), past_due_columns):
    sns.boxplot(y=column, data=train_data, ax=axis)
plt.show()
2.4 负债比率
# Distribution (left) and box plot (right) of the debt ratio.
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
sns.distplot(train_data['DebtRatio'], ax=ax1)
sns.boxplot(data=train_data, y='DebtRatio', ax=ax2)
plt.show()
2.5 信贷数量
# Distribution (left) and box plot (right) of the number of open credit lines
# and loans.
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
sns.distplot(train_data['NumberOfOpenCreditLinesAndLoans'], ax=ax1)
sns.boxplot(data=train_data, y='NumberOfOpenCreditLinesAndLoans', ax=ax2)
plt.show()
2.6 家属数量
# Kernel density estimate (left) and box plot (right) of the number of
# dependents.
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
sns.kdeplot(train_data['NumberOfDependents'], ax=ax1)
sns.boxplot(data=train_data, y='NumberOfDependents', ax=ax2)
plt.show()
2.7 不动产贷款或额度数量
# Distribution (left) and box plot (right) of the number of real-estate loans
# or lines.
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
sns.distplot(train_data['NumberRealEstateLoansOrLines'], ax=ax1)
sns.boxplot(data=train_data, y='NumberRealEstateLoansOrLines', ax=ax2)
plt.show()
2.8 月收入
# Summary statistics with one row per column.
train_data.describe().transpose()
# Monthly income has a large amount of missing data, so those rows cannot
# simply be deleted.
3、数据处理
3.1 缺失值处理
# NumberOfDependents has few missing values, so rows lacking it are deleted;
# this should not affect the overall model much.
# dropna is restricted to that column: a bare dropna() would also discard every
# row with a missing MonthlyIncome, leaving nothing for the random-forest
# imputation performed next.
train_data = train_data.dropna(subset=['NumberOfDependents'])
# MonthlyIncome has a high missing rate, so it is imputed with a random forest.
def set_missing(df):
    """Fill missing ``MonthlyIncome`` values with random-forest predictions.

    The regressor is fitted on the rows where ``MonthlyIncome`` is present and
    then predicts the rows where it is missing. ``df`` is modified in place
    and also returned.

    Args:
        df: DataFrame with a ``MonthlyIncome`` column. Columns are picked by
            position (5 first, then 0-4 and 6-9), which assumes column 5 is
            ``MonthlyIncome`` -- TODO confirm against the CSV layout.

    Returns:
        The same DataFrame, with missing ``MonthlyIncome`` entries replaced by
        predictions rounded to whole numbers.
    """
    missing = df['MonthlyIncome'].isnull()
    if not missing.any():
        # Nothing to impute; predicting on an empty matrix would raise.
        return df

    # Reorder so the target sits in column 0 and the features in columns 1+.
    # .iloc / .values replace the removed .ix / .as_matrix() APIs.
    process_df = df.iloc[:, [5, 0, 1, 2, 3, 4, 6, 7, 8, 9]]
    known = process_df[process_df['MonthlyIncome'].notnull()].values
    unknown = process_df[process_df['MonthlyIncome'].isnull()].values

    X = known[:, 1:]
    # The target is column 0 (MonthlyIncome); column 1 is one of the features.
    y = known[:, 0]

    rfr = RandomForestRegressor(random_state=0, n_estimators=200, max_depth=3, n_jobs=-1)
    rfr.fit(X, y)
    predicted = rfr.predict(unknown[:, 1:]).round(0)
    print(predicted)
    df.loc[missing, 'MonthlyIncome'] = predicted
    return df
# Impute the missing MonthlyIncome values and keep the returned frame.
train_data = set_missing(train_data)
3.2 异常值处理
# Keep only the rows that fall inside the accepted range of every checked
# column; all conditions must hold at once.
keep_rows = (
    (train_data['RevolvingUtilizationOfUnsecuredLines'] < 1)
    & (train_data['age'] > 18)
    & (train_data['NumberOfTime30-59DaysPastDueNotWorse'] < 80)
    & (train_data['NumberOfTime60-89DaysPastDueNotWorse'] < 80)
    & (train_data['NumberOfTimes90DaysLate'] < 80)
    & (train_data['NumberRealEstateLoansOrLines'] < 50)
)
train_data = train_data[keep_rows]
3.3 校验多重共线性
# Pairwise correlation coefficients between all columns.
corr = train_data.corr()
# Short x-axis labels x0, x1, ...: one per column of the matrix, so the number
# of tick labels always matches the number of ticks.
xticks = ['x%d' % i for i in range(len(corr.columns))]
# The y axis keeps the full column names.
yticks = list(corr.index)
fig = plt.figure()
ax1 = fig.add_subplot(1, 1, 1)
# Annotated heat map of the correlation matrix.
sns.heatmap(corr, annot=True, cmap='rainbow', ax=ax1, annot_kws={'size': 9, 'weight': 'bold', 'color': 'blue'})
ax1.set_xticklabels(xticks, rotation=0, fontsize=10)
ax1.set_yticklabels(yticks, rotation=0, fontsize=10)
plt.show()
# The correlations between variables are weak, so no variable is removed.
4、特征加工
4.1 分箱&woe编码
# Automatic (monotonic) binning function ---------------------------------------
def mono_bin(Y, X, n=20):
    """Bin ``X`` into quantile buckets and compute WOE / IV per bucket.

    Starting with ``n`` quantile buckets, the bucket count is lowered by one
    per iteration until the Spearman correlation between the per-bucket mean
    of ``X`` and the per-bucket mean of ``Y`` reaches an absolute value of 1
    (or becomes NaN, which also ends the search).

    Args:
        Y: Series holding the target; ``Y.sum()`` is used as the number of
            positive rows, so it is presumably a 0/1 label.
        X: numeric Series with the variable to bin, index-aligned with ``Y``.
        n: number of quantile buckets to try first.

    Returns:
        A tuple ``(d4, iv, cut, woe)``:
        ``d4`` is the per-bucket table (min, max, sum, total, rate, woe,
        goodattribute, badattribute) sorted by ``min``; ``iv`` is the
        information value; ``cut`` lists the bucket edges, opening with -inf
        and closing with +inf; ``woe`` is the list of per-bucket WOE values
        rounded to 3 decimals.
    """
    # The column names 'goodattribute' / 'badattribute' are kept for
    # compatibility: 'goodattribute' is the share of Y == 1 rows per bucket.
    positives = Y.sum()
    negatives = Y.count() - positives

    r = 0
    # A small tolerance is used because a perfectly monotonic relation can
    # come back as 0.9999999999999999 instead of exactly 1.
    while np.abs(r) < 1 - 1e-9:
        buckets, edges = pd.qcut(X, n, duplicates='drop', retbins=True)
        d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": buckets})
        d2 = d1.groupby('Bucket', as_index=True)
        bucket_means = d2.mean()
        # `stats` is scipy.stats; the bare name `scipy` is not imported.
        r, _ = stats.spearmanr(bucket_means.X, bucket_means.Y)
        n = n - 1

    d3 = pd.DataFrame({'min': d2.X.min()})
    d3['max'] = d2.X.max()
    d3['sum'] = d2.Y.sum()
    d3['total'] = d2.Y.count()
    d3['rate'] = d2.Y.mean()
    d3['woe'] = np.log((d3['rate'] / (1 - d3['rate'])) / (positives / negatives))
    d3['goodattribute'] = d3['sum'] / positives
    d3['badattribute'] = (d3['total'] - d3['sum']) / negatives
    iv = ((d3['goodattribute'] - d3['badattribute']) * d3['woe']).sum()
    # sort_values replaces the removed sort_index(by=...).
    d4 = d3.sort_values(by='min')
    print("=" * 60)
    print(d4)

    # Use the interior edges of the buckets that were actually built, so that
    # `cut` and `woe` stay aligned even when duplicates='drop' merged edges.
    cut = [float('-inf')]
    cut.extend(round(float(edge), 4) for edge in edges[1:-1])
    cut.append(float('inf'))
    woe = list(d4['woe'].round(3))
    return d4, iv, cut, woe
# Monotonic binning of the continuous variables.
# x1 is read from train_data like the label: taking it from the unfiltered
# `data` frame gives a series whose index no longer matches the label.
dfx1, ivx1, cutx1, woex1 = mono_bin(train_data.SeriousDlqin2yrs, train_data.RevolvingUtilizationOfUnsecuredLines, n=10)
dfx2, ivx2, cutx2, woex2 = mono_bin(train_data.SeriousDlqin2yrs, train_data.age, n=10)
dfx4, ivx4, cutx4, woex4 = mono_bin(train_data.SeriousDlqin2yrs, train_data.DebtRatio, n=20)
dfx5, ivx5, cutx5, woex5 = mono_bin(train_data.SeriousDlqin2yrs, train_data.MonthlyIncome, n=10)
# Custom binning function ------------------------------------------------------
def self_bin(Y, X, cut):
    """Bin ``X`` at the given edges and compute WOE / IV per bin.

    Args:
        Y: Series holding the target; ``Y.sum()`` is used as the bad count.
        X: Series with the variable to bin, index-aligned with ``Y``.
        cut: bin edges handed to ``pd.cut``.

    Returns:
        A tuple ``(d4, iv, woe)``: the per-bin table sorted by ``min`` with a
        fresh integer index, the information value, and the list of per-bin
        WOE values rounded to 3 decimals. The table and the IV are also
        printed.
    """
    total_bad = Y.sum()
    total_good = Y.count() - total_bad

    frame = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.cut(X, cut)})
    grouped = frame.groupby('Bucket', as_index=True)
    x_by_bin = grouped.X
    y_by_bin = grouped.Y

    table = pd.DataFrame({
        'min': x_by_bin.min(),
        'max': x_by_bin.max(),
        'bad': y_by_bin.sum(),
        'total': y_by_bin.count(),
        'rate': y_by_bin.mean(),
    })
    # Share of all bad / all good rows that fall into each bin.
    bad_share = table['bad'] / total_bad
    good_share = (table['total'] - table['bad']) / total_good
    table['woe'] = np.log(bad_share / good_share)
    table['badattr'] = bad_share
    table['goodattr'] = good_share
    iv = ((table['badattr'] - table['goodattr']) * table['woe']).sum()

    d4 = table.sort_values(by='min').reset_index(drop=True)
    print('分箱结果:')
    print(d4)
    print('IV值为:')
    print(iv)
    woe = list(d4['woe'].round(3))
    return d4, iv, woe
ninf = -np.inf  # negative infinity
pinf = np.inf  # positive infinity

# Hand-picked bin edges for the discrete count variables.
cutx3 = [ninf, 0, 1, 3, 5, pinf]
cutx6 = [ninf, 1, 2, 3, 5, pinf]
cutx7 = [ninf, 0, 1, 3, 5, pinf]
cutx8 = [ninf, 0, 1, 2, 3, pinf]
cutx9 = [ninf, 0, 1, 3, pinf]
cutx10 = [ninf, 0, 1, 2, 3, 5, pinf]

# Bin each variable against the label with its own edges.
dfx3, ivx3, woex3 = self_bin(
    train_data.SeriousDlqin2yrs,
    train_data['NumberOfTime30-59DaysPastDueNotWorse'],
    cutx3,
)
dfx6, ivx6, woex6 = self_bin(
    train_data.SeriousDlqin2yrs,
    train_data['NumberOfOpenCreditLinesAndLoans'],
    cutx6,
)
dfx7, ivx7, woex7 = self_bin(
    train_data.SeriousDlqin2yrs,
    train_data['NumberOfTimes90DaysLate'],
    cutx7,
)
dfx8, ivx8, woex8 = self_bin(
    train_data.SeriousDlqin2yrs,
    train_data['NumberRealEstateLoansOrLines'],
    cutx8,
)
dfx9, ivx9, woex9 = self_bin(
    train_data.SeriousDlqin2yrs,
    train_data['NumberOfTime60-89DaysPastDueNotWorse'],
    cutx9,
)
dfx10, ivx10, woex10 = self_bin(
    train_data.SeriousDlqin2yrs,
    train_data['NumberOfDependents'],
    cutx10,
)
4.2 IV值筛选变量
# Information values of x1..x10 in order. The mono_bin results are named
# ivx1 / ivx2 / ivx4 / ivx5; the x1_iv-style names were never defined.
ivlist = [ivx1, ivx2, ivx3, ivx4, ivx5, ivx6, ivx7, ivx8, ivx9, ivx10]
index = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10']
# Bar chart of the IV of every variable.
f, ax1 = plt.subplots(figsize=(10, 5))
ss = pd.DataFrame({"X": index, "Y": ivlist})
sns.barplot(x="X", y="Y", data=ss)
plt.xlabel('var_name', fontsize=16)
plt.ylabel('iv', fontsize=16)
plt.show()
DebtRatio(x4)、MonthlyIncome(x5)、NumberOfOpenCreditLinesAndLoans(x6)、NumberRealEstateLoansOrLines(x8)和NumberOfDependents(x10)这五个变量的IV值均低于0.1,预测能力较弱,故删除。
5、训练模型
# Remove the variables rejected by the IV screening.
low_iv_columns = [
    'DebtRatio',
    'MonthlyIncome',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberRealEstateLoansOrLines',
    'NumberOfDependents',
]
train_data = train_data.drop(low_iv_columns, axis=1)

# 70/30 split with a fixed seed.
training, testing = train_test_split(train_data, test_size=0.3, random_state=1)

# The last five columns are used as features, the label column as target.
# NOTE(review): the features are the raw column values -- no WOE substitution
# is visible before this point; confirm that is intended.
Y = train_data['SeriousDlqin2yrs']
X = train_data.iloc[:, -5:]
x_train, y_train = training.iloc[:, -5:], training['SeriousDlqin2yrs']
x_test, y_test = testing.iloc[:, -5:], testing['SeriousDlqin2yrs']

clf = LogisticRegression()
clf.fit(x_train, y_train)

# The second column of predict_proba is taken as the score for the ROC / KS
# evaluation.
score_proba = clf.predict_proba(x_test)
y_predproba = score_proba[:, 1]
coe = clf.coef_
print(coe)
6、模型评估
# ROC curve and AUC on the held-out split.
fpr, tpr, threshold = roc_curve(y_test, y_predproba)
auc_score = auc(fpr, tpr)
plt.figure(figsize=(8, 5))
plt.plot(fpr, tpr, 'b', label='AUC=%0.2f' % auc_score)
plt.legend(loc='lower right', fontsize=14)
# Diagonal reference line.
plt.plot([0, 1], [0, 1], 'r--')
# Call the limit setters. Assigning to plt.xlim / plt.ylim sets no limits and
# replaces the functions with a list for the rest of the session.
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.ylabel('TPR-真正率', fontsize=16)
plt.xlabel('FPR-假正率', fontsize=16)
plt.show()
# KS curve: TPR, FPR and their gap plotted against 1 - threshold.
fig, ax = plt.subplots()
ax.plot(1 - threshold, tpr, label='tpr')
ax.plot(1 - threshold, fpr, label='fpr')
ax.plot(1 - threshold, tpr - fpr, label='KS')
plt.xlabel('score')
plt.title('KS curve')
# Set the limits on the axes object. Assigning to plt.xlim / plt.ylim sets
# nothing, and the Axes methods keep working even if those pyplot names were
# overwritten earlier in the session.
ax.set_xlim(0.0, 1.0)
ax.set_ylim(0.0, 1.0)
# The plt.figure(figsize=(20, 20)) call that used to sit here was removed: it
# ran after the curves were drawn, so it only opened a second, blank figure.
legend = ax.legend(loc='upper left')
plt.show()
# KS statistic: the largest gap between TPR and FPR.
max(tpr - fpr)
# AUC = 0.84, so the model performs fairly well.
7、输出模型分
_ln2 = np.log(2)
p = 20 / _ln2  # scaling factor
q = 600 - 20 * np.log(20) / _ln2  # offset
# Regression coefficients, intercept first.
# NOTE(review): these are hard-coded literals, not read from the fitted model;
# the original comment marks them with "???" -- confirm where they come from.
x_coe = [-2.7340, 0.6526, 0.5201, 0.5581, 0.5943, 0.4329]
baseScore = round(q + p * x_coe[0], 0)
# Total personal score = base score + the score of each part.
def get_score(coe, woe, factor):
    """Return the score of every bin of one variable.

    Args:
        coe: coefficient of the variable.
        woe: iterable with the WOE value of each bin.
        factor: scaling factor applied to every product.

    Returns:
        A list with ``round(coe * w * factor, 0)`` for each ``w`` in ``woe``,
        in the same order.
    """
    return [round(coe * w * factor, 0) for w in woe]
# Score of every bin of each variable kept in the model.
# The WOE lists returned by mono_bin are named woex1 / woex2; the
# x1_woe / x2_woe names were never defined.
x1_score = get_score(x_coe[1], woex1, p)
x2_score = get_score(x_coe[2], woex2, p)
x3_score = get_score(x_coe[3], woex3, p)
x7_score = get_score(x_coe[4], woex7, p)
x9_score = get_score(x_coe[5], woex9, p)
def compute_score(series, cut, score):
    """Map every value of ``series`` to the score of the bin it falls into.

    Bins are right-closed, ``(cut[k], cut[k + 1]]``, the same convention
    ``pd.cut`` and ``pd.qcut`` use by default when the bins are built. A value
    equal to an edge therefore belongs to the bin that ends at that edge; the
    earlier ``>=`` comparison pushed it into the next bin.

    Args:
        series: values to score (a pandas Series or any 1-D sequence of
            numbers).
        cut: ascending bin edges; expected to start with -inf and end with
            +inf so that every value falls into some bin.
        score: one score per bin, i.e. ``len(cut) - 1`` entries.

    Returns:
        A list holding, for every value in order, the score of its bin.
        NaN values get the score of the last bin, as before.
    """
    edges = np.asarray(cut, dtype=float)
    values = np.asarray(series, dtype=float)
    # side='left' yields i with edges[i - 1] < value <= edges[i], so the bin
    # index is i - 1.
    positions = np.searchsorted(edges, values, side='left') - 1
    # Keep -inf (position -1) and NaN (position past the end) inside range.
    positions = np.clip(positions, 0, len(score) - 1)
    return [score[k] for k in positions]
# NOTE(review): `test_data` is not assigned anywhere in the visible part of
# this file; it must be loaded before this section runs.
test_data['BaseScore'] = np.zeros(len(test_data)) + baseScore
# The edges returned by mono_bin are named cutx1 / cutx2; the
# x1_cut / x2_cut names were never defined.
test_data['x1'] = compute_score(test_data['RevolvingUtilizationOfUnsecuredLines'], cutx1, x1_score)
test_data['x2'] = compute_score(test_data['age'], cutx2, x2_score)
test_data['x3'] = compute_score(test_data['NumberOfTime30-59DaysPastDueNotWorse'], cutx3, x3_score)
test_data['x7'] = compute_score(test_data['NumberOfTimes90DaysLate'], cutx7, x7_score)
test_data['x9'] = compute_score(test_data['NumberOfTime60-89DaysPastDueNotWorse'], cutx9, x9_score)
# Total score = sum of the per-variable scores plus the base score.
test_data['Score'] = test_data['x1'] + test_data['x2'] + test_data['x3'] + test_data['x7'] + test_data['x9'] + baseScore
# Select the first column plus the last eight, i.e. the score columns.
# NOTE(review): position -8 is the column just before BaseScore -- confirm it
# is meant to be included.
scoretable2 = test_data.iloc[:, [0, -8, -7, -6, -5, -4, -3, -2, -1]]
print(scoretable2.head())
# Rename the per-variable score columns to the names of their source variables.
colNameDict = {
    'x1': 'RevolvingUtilizationOfUnsecuredLines',
    'x2': 'age',
    'x3': 'NumberOfTime30-59DaysPastDueNotWorse',
    'x7': 'NumberOfTimes90DaysLate',
    'x9': 'NumberOfTime60-89DaysPastDueNotWorse',
}
scoretable2 = scoretable2.rename(columns=colNameDict, inplace=False)