Building an Internet Finance Customer Churn Model with XGBoost

Click here to download the required data

from pandas import DataFrame
from numpy import nan as NA
from pandas import Series
import os 
import pandas as pd
import numpy as np
import math
import random
from matplotlib import pyplot
from pandas.plotting import scatter_matrix
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from scipy.stats import chisquare
import numbers
import time
import datetime
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
# Configure matplotlib so that saved figures can display Chinese characters
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['FangSong']  # set the default font
mpl.rcParams['axes.unicode_minus'] = False  # keep the minus sign from rendering as a box in saved figures

os.chdir('E://kaggle//risk')
bank_data=pd.read_csv('bankChurn_1.csv')
external_data=pd.read_csv('ExternalData_1.csv')
internal_dic=pd.read_csv('internal_dic.csv')
external_dic=pd.read_csv('external_dic.csv')
path='E://kaggle//risk//'

# Plot the single-factor distribution of a numeric variable
# df -- the dataset
# col -- the variable to examine
# target -- the target variable
# filepath -- directory in which to save the figure
# truncation -- whether to cap extreme values
def NumVarPerf(df,col,target,filepath,truncation=True):
    # Filter the rows where this column is not missing
    validDf = df.loc[df[col] == df[col]][[col,target]]
    # Fraction of valid records; multiplying by 1.0 forces float division (int/int truncates in Python 2)
    validRcd = validDf.shape[0]*1.0/df.shape[0]
    # Format as a percentage string
    validRcdFmt = '%.2f%%' % (validRcd*100)
    # Extract some summary statistics
    descStats = validDf[col].describe()
    mu = '%.2e' % (descStats['mean'])
    std = '%.2e' % (descStats['std'])
    maxVal = '%.2e' % (descStats['max'])
    minVal = '%.2e' % (descStats['min'])
    # Split the variable by the categorical target; this only handles a binary target coded as 0 and 1
    x = validDf.loc[validDf[target] == 1][col]
    y = validDf.loc[validDf[target] == 0][col]
    # Weight each observation so the histogram shows percentages
    xweights = 100*np.ones_like(x)/x.size
    yweights = 100*np.ones_like(y)/y.size
    # If truncation is True, cap values above the 95th percentile at the 95th percentile
    if truncation:
        pcnt95 = np.percentile(validDf[col],95)
        x = x.map(lambda x: min(x,pcnt95))
        y = y.map(lambda y: min(y,pcnt95))
    # Plot with matplotlib
    # Create the figure object
    fig1 = pyplot.figure()
    ax = fig1.add_subplot(111)
    # Draw the histograms
    ax.hist(x,weights=xweights,alpha=0.5,label='Churned customers')
    ax.hist(y,weights=yweights,alpha=0.5,label='Retained customers')
    titleText = 'Histogram of '+merge_dic[col]+'\n'+'valid data percentage: '+str(validRcdFmt)+','+'Mean='+str(mu)+','+'Std='+str(std)
    ax.set(title=titleText,ylabel='Percentage per bin')
    ax.margins(0.05)
    ax.set_ylim(bottom=0)
    pyplot.legend(loc='upper right')
    figSavePath = filepath+str(merge_dic[col])+'--'+str(col)+'.png'
    pyplot.savefig(figSavePath)
    pyplot.close(fig1)

def CharVarPerf(df,col,target,filepath):
    # Filter the rows where this column is not missing
    validDf = df.loc[df[col] == df[col]][[target,col]]
    # Fraction of valid records
    validRcd = validDf.shape[0]*1.0/df.shape[0]
    # Format as a percentage string
    validRcdFmt = '%.2f%%' % (validRcd*100)
    # Number of valid records
    recdNum = validDf.shape[0]
    # Share of each category
    freqDict = {}
    # Churn rate of each category
    churnRateDict = {}
    for v in set(validDf[col]):
        vDf = validDf.loc[validDf[col] == v]
        # Share of this category
        freqDict[v] = vDf.shape[0]*1.0/recdNum
        # Churn rate of this category; the target must be coded as 0 and 1
        churnRateDict[v] = sum(vDf[target])*1.0/vDf.shape[0]
    descStats = pd.DataFrame({'percent':freqDict,'churn rate':churnRateDict})
    fig = pyplot.figure()
    ax = fig.add_subplot(111)
    # Add a second y-axis
    ax2 = ax.twinx()
    pyplot.title('The percentage and churn rate for '+merge_dic[col]+'\n' 'valid pcnt =' + validRcdFmt)
    descStats['churn rate'].plot(kind='line',color='red',ax=ax)
    descStats['percent'].plot(kind='bar',color='green',ax=ax2,width=0.2,position=1)
    ax.set_ylabel('Churn rate (line)')
    ax2.set_ylabel('Category share (bars)')
    figSavePath = filepath + str(merge_dic[col])+'--'+str(col)+'.png'
    pyplot.savefig(figSavePath)
    pyplot.close(fig)

# Function to fill in missing values
def MakeupMissing(df,col,type,method):
    validDf = df.loc[df[col]==df[col]].copy()
    if validDf.shape[0] == df.shape[0]:
        return 'Column {} has no missing values'.format(col)
    # Work on a copy of the column so the original DataFrame is not modified
    missingList = [i for i in df[col]]
    if type == 'Continuous':
        if method not in ['Mean','Random']:
            return 'Please specify a valid method'
        descStats = validDf[col].describe()
        mu = descStats['mean']
        std = descStats['std']
        maxVal = descStats['max']
        # Values more than 3 standard deviations above the mean are rare (at most 1/9 of observations by
        # Chebyshev's inequality, about 0.3% under normality), so treat them as outliers and cap them
        if maxVal > mu+3*std:
            for i in list(validDf.index):
                if validDf.loc[i,col] > mu+3*std:
                    validDf.loc[i,col] = mu+3*std
            mu = validDf[col].describe()['mean']
        for i in range(df.shape[0]):
            if df.loc[i,col] != df.loc[i,col]:
                if method == 'Mean':
                    missingList[i] = mu
                elif method == 'Random':
                    missingList[i] = random.sample(list(validDf[col]),1)[0]
    elif type == 'Categorical':
        if method not in ['Mode','Random']:
            return 'Please specify a valid method'
        freqDict = {}
        recdNum = validDf.shape[0]
        for v in set(validDf[col]):
            vDf = validDf.loc[validDf[col] == v]
            freqDict[v] = vDf.shape[0]*1.0/recdNum
        # The mode; key tells max() which element of each (value, frequency) pair to compare
        modeVal = max(freqDict.items(),key=lambda x: x[1])[0]
        freqTuple = list(freqDict.items())
        freqList = [0]+[i[1] for i in freqTuple]
        freqCumsum = np.cumsum(freqList)
        for i in range(df.shape[0]):
            if df.loc[i,col] != df.loc[i,col]:
                if method == 'Mode':
                    missingList[i] = modeVal
                elif method == 'Random':
                    # Draw a category at random with probability proportional to its observed frequency
                    a = random.random()
                    position = [k+1 for k in range(len(freqCumsum)-1) if freqCumsum[k] < a <= freqCumsum[k+1]][0]
                    missingList[i] = freqTuple[position-1][0]
    print('Missing values in column {0} have been filled using the {1} method'.format(col,method))
    return missingList
                
# Encode a categorical variable by the churn rate of each of its categories (target encoding)
def Encoder(df, col, target):
    encoder = {}
    for v in set(df[col]):
        if v == v:
            subDf = df[df[col] == v]
        else:
            xList = list(df[col])
            nanInd = [i for i in range(len(xList)) if xList[i] != xList[i]]
            subDf = df.loc[nanInd]
        encoder[v] = sum(subDf[target])*1.0/subDf.shape[0]
    newCol = [encoder[i] for i in df[col]]
    return newCol
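
Encoder is defined here but never called later in this script; a minimal sketch of how it could be used, with a hypothetical column name for illustration only:

# Hypothetical usage: replace a categorical column by the churn rate observed within each category
# AllData['SOME_CATEGORICAL_COL'] = Encoder(AllData, 'SOME_CATEGORICAL_COL', 'CHURN_CUST_IND')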
    
# Convert a date variable into a day count relative to a base date
def DateDays(df,dateCol,base):
    base2 = time.strptime(base,'%Y/%m/%d')
    base3 = datetime.datetime(base2[0],base2[1],base2[2])
    date1 = [time.strptime(i,'%Y/%m/%d') for i in df[dateCol]]
    date2 = [datetime.datetime(i[0],i[1],i[2]) for i in date1]
    daysGap = [(date2[i] - base3).days for i in range(len(date2))]
    return daysGap

# colNumerator is the numerator column
# colDenominator is the denominator column
# Feature derivation: ratio of two columns
def ColumnDivide(df, colNumerator, colDenominator):   
    N = df.shape[0]
    rate = [0]*N
    xNum = list(df[colNumerator])
    xDenom = list(df[colDenominator])
    for i in range(N):
        # This check handles the case where the denominator is zero and the ratio cannot be computed
        if xDenom[i]>0:
            rate[i] = xNum[i]*1.0/xDenom[i]
        else:
            rate[i] = 0
    return rate    

external_columns=list(external_data.columns[1:].map(lambda x : x.lower()))
external_columns_cid=list(external_data.columns[0:1])
external_columns_merge=external_columns_cid+external_columns
external_data.columns=external_columns_merge
AllData=pd.merge(bank_data,external_data,on='CUST_ID')
internal_dic2=dict(zip(list(internal_dic['variable']),list(internal_dic['explanation'])))
external_dic2=dict(zip(list(external_dic['variable'].map(lambda x :x.lower())),list(external_dic['explanation'])))
merge_dic={}
merge_dic.update(internal_dic2)
merge_dic.update(external_dic2)
for i in list(AllData.columns):
    if i not in list(merge_dic.keys()):
        merge_dic.update({i:i})
columns=set(list(AllData.columns))
columns.remove('CHURN_CUST_IND')
numericCols=[]
stringCols=[]
for var in columns:
    # First deduplicate the values of the selected column
    x=list(set(AllData[var]))
    # Drop missing values from the selected column
    x=[i for i in x if i == i]
    # Decide whether the variable is numeric or categorical and add it to the corresponding list
    if isinstance(x[0],numbers.Real):
        numericCols.append(var)
    elif isinstance(x[0],str):
        stringCols.append(var)
    else:
        print('Unknown data type')
# Produce the plots to inspect how each predictor relates to the target
filepath=path+'连续型变量单因子分析图示//'
for var in numericCols:
    NumVarPerf(AllData,var,'CHURN_CUST_IND',filepath,truncation=True)
filepath=path+'类别型变量单因子分析图示//'
for var in stringCols:
    CharVarPerf(AllData,var,'CHURN_CUST_IND',filepath)    
    
All_copy=AllData.copy()
# Convert the date variable into a continuous day-count variable
All_copy['open_date']=DateDays(All_copy,'open_date','1990/1/1')
    
## Fill in missing values
num_dict={}
cat_dict={}
cn_num_dict={}
cn_cat_dict={}
count=0
# Find the columns that contain missing values
for i in list(All_copy.columns):
    # A column has missing values when its fraction of valid records is below 1
    validDf=All_copy.loc[All_copy[i] == All_copy[i]][i]
    validRcd=validDf.shape[0]*1.0/All_copy.shape[0]
    if validRcd < 1 :
        if i in  numericCols: 
            num_dict.update({i:round(validRcd,4)})
            cn_num_dict.update({merge_dic[i]:round(validRcd,4)})
        else:
            cat_dict.update({i:round(validRcd,4)})
            cn_cat_dict.update({merge_dic[i]:round(validRcd,4)})
    count +=1
    print(count/len(All_copy.columns))
# The results above show that the numeric variables have very low missing rates, so mean imputation is suitable
for i in list(num_dict.keys()):
    All_copy[i]=MakeupMissing(All_copy,i,'Continuous','Mean')
# Some categorical variables have a very low fraction of valid records (i.e. are mostly missing); those are dropped, while the rest are filled with random draws
for i in list(cat_dict.keys()):
    if cat_dict[i] >=0.7:
        All_copy[i]=MakeupMissing(All_copy,i,'Categorical','Random')
    else:
        All_copy.drop(i,inplace=True,axis=1)

All_copy2=All_copy.copy()
# Encode the categorical variables as integers with LabelEncoder (label encoding, not one-hot dummies)
for i in stringCols:
    if i in list(cat_dict.keys()):
        if cat_dict[i]>0.7:
            lbl=LabelEncoder()
            lbl.fit(list(All_copy2[i].values))
            All_copy2[i]=lbl.transform(list(All_copy2[i].values)) 
    else:
        lbl=LabelEncoder()
        lbl.fit(list(All_copy2[i].values))
        All_copy2[i]=lbl.transform(list(All_copy2[i].values)) 

All_copy3=All_copy2.copy()
# Derive new feature variables
# Use the dictionaries to map each column to its Chinese explanation, which makes the relationships between variables easier to read
obs_dic=[]
for i in All_copy3.columns:
    if i in list(merge_dic.keys()):
        obs_dic.append({i:merge_dic[i]})
# Drop perfectly collinear variables; e.g. the local-currency demand-deposit share of the monthly average balance equals 1 minus the fixed-deposit share ('LOCAL_FIX_MON_AVG_BAL_PROP'), so one of the pair should be removed (see the sketch after these drop statements for one way to scan for such pairs)
All_copy3.drop('LOCAL_FIX_MON_AVG_BAL_PROP',axis=1,inplace=True)
All_copy3.drop('ATM_NOT_ACCT_TX_NUM',axis=1,inplace=True)
All_copy3.drop('COUNTER_NOT_ACCT_TX_NUM',axis=1,inplace=True)
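
The collinear columns above can be read off the data dictionary; a minimal sketch of how one could also scan the numeric columns for near-perfectly correlated pairs before deciding what to drop (the 0.99 threshold is an arbitrary choice for illustration):

# Absolute correlation matrix of the numeric columns that are still present
num_cols_left = [c for c in All_copy3.columns if c in numericCols]
corrMat = All_copy3[num_cols_left].corr().abs()
# List every pair of distinct columns whose absolute correlation exceeds 0.99
highCorrPairs = [(corrMat.index[i], corrMat.columns[j], corrMat.iloc[i, j])
                 for i in range(corrMat.shape[0])
                 for j in range(i+1, corrMat.shape[1])
                 if corrMat.iloc[i, j] > 0.99]
print(highCorrPairs)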
        
All_copy3['ATM_ALL_TX_NUM_RATE']=ColumnDivide(All_copy3, 'ATM_ACCT_TX_NUM', 'ATM_ALL_TX_NUM')
All_copy3['COUNTER_ALL_TX_NUM_RATE']=ColumnDivide(All_copy3, 'COUNTER_ACCT_TX_NUM', 'COUNTER_ALL_TX_NUM')
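# Drop the column at position 50 (presumably CHURN_CUST_IND in this dataset) and re-append the target below so it ends up as the last column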
ad_columns=list(All_copy3.columns)[:50]+list(All_copy3.columns)[51:]
ad_columns.append('CHURN_CUST_IND')
All_copy4=All_copy3[ad_columns]
All_copy4.to_csv('All_copy4.csv',index=False)


# Build the model
All_data_model=pd.read_csv('All_copy4.csv')
x_train=All_data_model.iloc[:,1:-1]
labels=All_data_model['CHURN_CUST_IND']
xgtrain = xgb.DMatrix(x_train,label=labels)
    
## The final tuned model; the tuning steps that produced it follow below
xgb1=XGBClassifier(
        learning_rate=0.05,
        n_estimators=194,
        max_depth=4,
        min_child_weight=8,
        gamma=0,
        subsample=0.5,
        colsample_bytree=0.9,
        colsample_bylevel=0.7,
        objective='reg:logistic',
        reg_alpha=3,
        reg_lambda=0.5,
        seed=3)    
# Evaluate the model with 10-fold cross-validation
results=cross_val_score(xgb1,x_train,labels,cv=10).mean()
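
The script stops at the cross-validated score; a minimal sketch (assuming x_train and labels as built above) of fitting the tuned model on the full training set and inspecting which features drive the predictions:

xgb1.fit(x_train, labels)
# Feature importances of the fitted booster, largest first
importances = pd.Series(xgb1.feature_importances_, index=x_train.columns).sort_values(ascending=False)
print(importances.head(10))
# Or use xgboost's built-in plotting helper
xgb.plot_importance(xgb1, max_num_features=10)
pyplot.show()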

# Find the best number of boosting rounds (n_estimators)
def modelfit(alg,x_train,y_train,useTrainCV=True,cv_folds=None,early_stopping_rounds=50):
    if useTrainCV == True:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(x_train,label=y_train)   
        cvresult=xgb.cv(xgb_param,xgtrain,num_boost_round=alg.get_params()['n_estimators'],folds=cv_folds,
                        metrics='logloss',early_stopping_rounds=early_stopping_rounds)
        n_estimators=cvresult.shape[0]
        alg.set_params(n_estimators=n_estimators)
        print(cvresult)
 
# First define a model, then determine how many trees to grow
xgb1=XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,
        max_depth=8,
        min_child_weight=3,
        gamma=0,
        subsample=0.3,
        colsample_bytree=0.8,
        colsample_bylevel=0.7,
        objective='reg:logistic',
        seed=3)
# Because this is a binary classification problem with imbalanced classes, use StratifiedKFold
kfold=StratifiedKFold(n_splits=10,shuffle=True,random_state=7)
modelfit(xgb1,x_train,labels,cv_folds=kfold)
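
As a quick check of the class imbalance that motivates the stratified split (a one-line sketch using the labels Series defined above):

print(labels.value_counts(normalize=True))  # share of churned (1) vs. retained (0) customers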


# Search for the best max_depth and min_child_weight
xgb1=XGBClassifier(
        learning_rate=0.1,
        n_estimators=50,
        max_depth=8,
        min_child_weight=6,
        gamma=0,
        subsample=0.3,
        colsample_bytree=0.8,
        colsample_bylevel=0.7,
        objective='reg:logistic',
        seed=3)
    

param_test={'max_depth':list(range(2,10,1)),'min_child_weight':list(range(1,10,1))}
clf=GridSearchCV(estimator=xgb1,param_grid=param_test,cv=5,scoring='roc_auc')
clf.fit(x_train.values,labels.values)
clf.cv_results_
clf.best_params_
clf.best_score_
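
With GridSearchCV from sklearn.model_selection, cv_results_ replaces the old grid_scores_ attribute; a short sketch for ranking the candidate parameter settings by mean test AUC:

cv_df = pd.DataFrame(clf.cv_results_)
print(cv_df[['params', 'mean_test_score', 'std_test_score']].sort_values('mean_test_score', ascending=False).head())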


# Search for the best subsample and colsample_bytree
xgb1=XGBClassifier(
        learning_rate=0.1,
        n_estimators=76,
        max_depth=4,
        min_child_weight=8,
        gamma=0,
        subsample=0.3,
        colsample_bytree=0.8,
        colsample_bylevel=0.7,
        objective='reg:logistic',
        seed=3)
    

param_test={'subsample':[i/10 for i in range(3,9)],'colsample_bytree':[i/10 for i in range(6,10)]}
clf=GridSearchCV(estimator=xgb1,param_grid=param_test,cv=5,scoring='roc_auc')
clf.fit(x_train.values,labels.values)
clf.cv_results_
clf.best_params_
clf.best_score_

# Search for better regularization parameters
reg_alpha=[0.1,1,1.5,2,2.5,3]    # previously tried [0.1,1,1.5,2]
reg_lambda=[0,0.05,0.1,0.5,1,2]  # previously tried [0.1,0.5,1,2]
xgb1=XGBClassifier(
        learning_rate=0.1,
        n_estimators=76,
        max_depth=4,
        min_child_weight=8,
        gamma=0,
        subsample=0.5,
        colsample_bytree=0.9,
        colsample_bylevel=0.7,
        objective='reg:logistic',
        seed=3)    
    

param_test={'reg_alpha':reg_alpha,'reg_lambda':reg_lambda}
clf=GridSearchCV(estimator=xgb1,param_grid=param_test,cv=5,scoring='roc_auc')
clf.fit(x_train.values,labels.values)
clf.cv_results_
clf.best_params_
clf.best_score_
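
The final model at the top of the script (learning_rate=0.05, n_estimators=194) is presumably produced by one last step that the article leaves implicit: fix the tuned parameters, lower the learning rate, and rerun the cross-validated search for the number of boosting rounds. A sketch of that step, reusing modelfit and kfold from above:

xgb_final = XGBClassifier(
        learning_rate=0.05,
        n_estimators=1000,          # upper bound; early stopping picks the actual count
        max_depth=4,
        min_child_weight=8,
        gamma=0,
        subsample=0.5,
        colsample_bytree=0.9,
        colsample_bylevel=0.7,
        objective='reg:logistic',
        reg_alpha=3,
        reg_lambda=0.5,
        seed=3)
modelfit(xgb_final, x_train, labels, cv_folds=kfold)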

Reposted from blog.csdn.net/pandacode/article/details/81814760