model feature
import pandas as pd
import datetime
import collections
import numpy as np
import numbers
import random
from itertools import combinations
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import sys
import pickle
# reload(sys)
#sys.setdefaultencoding( "utf-8")
#sys.path.append(path+"/Notes/07 申请评分卡中的数据预处理和特征衍生/")
from scorecard_fucntions import *
from sklearn.linear_model import LogisticRegressionCV
# -*- coding: utf-8 -*-
#########################################################################################################
#Step 0: Initiate the data processing work, including reading csv files, checking the consistency of Idx#
#########################################################################################################
# data1 = pd.read_csv('C:/Users/MacBook/Desktop/PPD_LogInfo_3_1_Training_Set.csv', header = 0)
# data2 = pd.read_csv('C:/Users/MacBook/Desktop/PPD_Training_Master_GBK_3_1_Training_Set.csv', header = 0,encoding = 'gbk')
# data3 = pd.read_csv('C:/Users/MacBook/Desktop/PPD_Userupdate_Info_3_1_Training_Set.csv', header = 0)
# data1_Idx, data2_Idx, data3_Idx = set(data1['Idx']), set(data2['Idx']), set(data3['Idx'])
# check_Idx_integrity = (data1_Idx - data2_Idx)|(data2_Idx - data1_Idx)|(data1_Idx - data3_Idx)|(data3_Idx - data1_Idx)
#the Idx values {85832, 82505, 10922, 78259, 14662} appear only in data1, so we remove them from the modeling base
######################################################################################################################################################
# Step 1: Derive the features using PPD_Training_Master_GBK_3_1_Training_Set, PPD_LogInfo_3_1_Training_Set & PPD_Userupdate_Info_3_1_Training_Set#
######################################################################################################################################################
# compare whether the four city variables match
# data2['city_match'] = data2.apply(lambda x: int(x.UserInfo_2 == x.UserInfo_4 == x.UserInfo_8 == x.UserInfo_20),axis = 1)
# del data2['UserInfo_2']
# del data2['UserInfo_4']
# del data2['UserInfo_8']
# del data2['UserInfo_20']
### Extract the applying date of each applicant
# data1['logInfo'] = data1['LogInfo3'].map(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d'))
# data1['Listinginfo'] = data1['Listinginfo1'].map(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d'))
# data1['ListingGap'] = data1[['logInfo','Listinginfo']].apply(lambda x: (x[1]-x[0]).days,axis = 1)
#maxListingGap = max(data1['ListingGap'])
# timeWindows = TimeWindowSelection(data1, 'ListingGap', range(30,361,30))
'''
We use 180 days as the maximum time window for the features derived from data1.
The time windows are 7, 30, 60, 90, 120, 150 and 180 days.
Within each selected time window we calculate the total count and the distinct count of each raw field.
'''
# time_window = [7, 30, 60, 90, 120, 150, 180]
# var_list = ['LogInfo1','LogInfo2']
# data1GroupbyIdx = pd.DataFrame({'Idx':data1['Idx'].drop_duplicates()})
# for tw in time_window:
# data1['TruncatedLogInfo'] = data1['Listinginfo'].map(lambda x: x + datetime.timedelta(-tw))
# temp = data1.loc[data1['logInfo'] >= data1['TruncatedLogInfo']]
# for var in var_list:
# #count the frequencies of LogInfo1 and LogInfo2
# count_stats = temp.groupby(['Idx'])[var].count().to_dict()
# data1GroupbyIdx[str(var)+'_'+str(tw)+'_count'] = data1GroupbyIdx['Idx'].map(lambda x: count_stats.get(x,0))
#
# # count the distinct value of LogInfo1 and LogInfo2
# Idx_UserupdateInfo1 = temp[['Idx', var]].drop_duplicates()
# uniq_stats = Idx_UserupdateInfo1.groupby(['Idx'])[var].count().to_dict()
# data1GroupbyIdx[str(var) + '_' + str(tw) + '_unique'] = data1GroupbyIdx['Idx'].map(lambda x: uniq_stats.get(x,0))
#
# # calculate the average count of each value in LogInfo1 and LogInfo2
# data1GroupbyIdx[str(var) + '_' + str(tw) + '_avg_count'] = data1GroupbyIdx[[str(var)+'_'+str(tw)+'_count',str(var) + '_' + str(tw) + '_unique']].\
# apply(lambda x: x[0]*1.0/x[1], axis=1)
#
#
# data3['ListingInfo'] = data3['ListingInfo1'].map(lambda x: datetime.datetime.strptime(x,'%Y/%m/%d'))
# data3['UserupdateInfo'] = data3['UserupdateInfo2'].map(lambda x: datetime.datetime.strptime(x,'%Y/%m/%d'))
# data3['ListingGap'] = data3[['UserupdateInfo','ListingInfo']].apply(lambda x: (x[1]-x[0]).days,axis = 1)
# collections.Counter(data3['ListingGap'])
# hist_ListingGap = np.histogram(data3['ListingGap'])
# hist_ListingGap = pd.DataFrame({'Freq':hist_ListingGap[0],'gap':hist_ListingGap[1][1:]})
# hist_ListingGap['CumFreq'] = hist_ListingGap['Freq'].cumsum()
# hist_ListingGap['CumPercent'] = hist_ListingGap['CumFreq'].map(lambda x: x*1.0/hist_ListingGap.iloc[-1]['CumFreq'])
'''
We use 180 days as the maximum time window for the features derived from data3. The time windows are
7, 30, 60, 90, 120, 150 and 180 days.
Because we observe some mismatches of upper/lower case, like QQ & qQ and Idnumber & idNumber, we first make the items consistent.
Besides, we merge MOBILEPHONE and PHONE into PHONE.
Within each selected time window, we calculate
(1) the frequency of updates
(2) the number of distinct updated items
(3) whether important items such as IDNUMBER, HASBUYCAR, MARRIAGESTATUSID and PHONE were changed
'''
# data3['UserupdateInfo1'] = data3['UserupdateInfo1'].map(ChangeContent)
# data3GroupbyIdx = pd.DataFrame({'Idx':data3['Idx'].drop_duplicates()})
#
# time_window = [7, 30, 60, 90, 120, 150, 180]
# for tw in time_window:
# data3['TruncatedLogInfo'] = data3['ListingInfo'].map(lambda x: x + datetime.timedelta(-tw))
# temp = data3.loc[data3['UserupdateInfo'] >= data3['TruncatedLogInfo']]
#
# #frequency of updating
# freq_stats = temp.groupby(['Idx'])['UserupdateInfo1'].count().to_dict()
# data3GroupbyIdx['UserupdateInfo_'+str(tw)+'_freq'] = data3GroupbyIdx['Idx'].map(lambda x: freq_stats.get(x,0))
#
# # number of updated types
# Idx_UserupdateInfo1 = temp[['Idx','UserupdateInfo1']].drop_duplicates()
# uniq_stats = Idx_UserupdateInfo1.groupby(['Idx'])['UserupdateInfo1'].count().to_dict()
# data3GroupbyIdx['UserupdateInfo_' + str(tw) + '_unique'] = data3GroupbyIdx['Idx'].map(lambda x: uniq_stats.get(x, 0))
#
# #average count of each type
# data3GroupbyIdx['UserupdateInfo_' + str(tw) + '_avg_count'] = data3GroupbyIdx[['UserupdateInfo_'+str(tw)+'_freq', 'UserupdateInfo_' + str(tw) + '_unique']]. \
# apply(lambda x: x[0] * 1.0 / x[1], axis=1)
#
# #whether the applicant changed items like IDNUMBER,HASBUYCAR, MARRIAGESTATUSID, PHONE
# Idx_UserupdateInfo1['UserupdateInfo1'] = Idx_UserupdateInfo1['UserupdateInfo1'].map(lambda x: [x])
# Idx_UserupdateInfo1_V2 = Idx_UserupdateInfo1.groupby(['Idx'])['UserupdateInfo1'].sum()
# for item in ['_IDNUMBER','_HASBUYCAR','_MARRIAGESTATUSID','_PHONE']:
# item_dict = Idx_UserupdateInfo1_V2.map(lambda x: int(item in x)).to_dict()
# data3GroupbyIdx['UserupdateInfo_' + str(tw) + str(item)] = data3GroupbyIdx['Idx'].map(lambda x: item_dict.get(x, 0))
#
# # Combine the above features with raw features in PPD_Training_Master_GBK_3_1_Training_Set
# allData = pd.concat([data2.set_index('Idx'), data3GroupbyIdx.set_index('Idx'), data1GroupbyIdx.set_index('Idx')],axis= 1)
# allData.to_csv('C:/Users/MacBook/Desktop/allData_0.csv',encoding = 'gbk')
##################################################################################
# Step 2: Make up missing values for categorical variables and continuous variables#
##################################################################################
# allData = pd.read_csv('C:/Users/MacBook/Desktop/allData_0.csv',header = 0,encoding = 'gbk')
# allFeatures = list(allData.columns)
# allFeatures.remove('ListingInfo')
# allFeatures.remove('target')
# allFeatures.remove('Idx')
#check each column and remove it if it is constant
# for col in allFeatures:
# if len(set(allData[col])) == 1:
# del allData[col]
# allFeatures.remove(col)
#divide the independent variables into categorical and numerical types
# numerical_var = []
# for var in allFeatures:
# uniq_vals = list(set(allData[var]))
# if np.nan in uniq_vals:
# uniq_vals.remove( np.nan)
# if len(uniq_vals) >= 10 and isinstance(uniq_vals[0],numbers.Real):
# numerical_var.append(var)
#
# categorical_var = [i for i in allFeatures if i not in numerical_var]
'''
For each categorical variable, if more than 50% of its values are missing, we remove it.
Otherwise we treat missing as a special status.
'''
# missing_pcnt_threshould_1 = 0.5
# for col in list(categorical_var):    #iterate over a copy since we may remove elements inside the loop
# missingRate = MissingCategorial(allData,col)
# print('{0} has missing rate as {1}'.format(col,missingRate))
# if missingRate > missing_pcnt_threshould_1:
# categorical_var.remove(col)
# del allData[col]
# if 0 < missingRate < missing_pcnt_threshould_1:
# allData[col] = allData[col].map(lambda x: str(x).upper())
'''
For each continuous variable, if more than 30% of its values are missing, we remove it.
Otherwise we make up the missing values by random sampling from the non-missing values.
'''
# missing_pcnt_threshould_2 = 0.3
# for col in list(numerical_var):    #iterate over a copy since we may remove elements inside the loop
# missingRate = MissingContinuous(allData, col)
# print('{0} has missing rate as {1}'.format(col, missingRate))
# if missingRate > missing_pcnt_threshould_2:
# numerical_var.remove(col)
# del allData[col]
# print('we delete variable {} because of its high missing rate'.format(col))
# else:
# if missingRate > 0:
# not_missing = allData.loc[allData[col] == allData[col]][col]
# makeuped = allData[col].map(lambda x: MakeupRandom(x, list(not_missing)))
# del allData[col]
# allData[col] = makeuped
# missingRate2 = MissingContinuous(allData, col)
# print('missing rate after making up is:{}'.format(str(missingRate2)))
#
#
# allData.to_csv('C:/Users/MacBook/Desktop/allData_1b.csv', header=True,encoding='gbk', columns = allData.columns, index=False)
####################################
# Step 3: Group variables into bins#
####################################
#for each categorical variable with more than 5 distinct values, we encode it with the bad rate and later bin the encoded values by ChiMerge
trainData = pd.read_csv('C:/Users/MacBook/Desktop/a_1.csv',header = 0, encoding='gbk')
allFeatures = list(trainData.columns)
allFeatures.remove('ListingInfo')
allFeatures.remove('target')
allFeatures.remove('Idx')
#divide the independent variables into categorical and numerical types
numerical_var = []
for var in allFeatures:
uniq_vals = list(set(trainData[var]))
if np.nan in uniq_vals:
uniq_vals.remove( np.nan)
if len(uniq_vals) >= 10 and isinstance(uniq_vals[0],numbers.Real):
numerical_var.append(var)
categorical_var = [i for i in allFeatures if i not in numerical_var]
for col in categorical_var:
trainData[col] = trainData[col].map(lambda x: str(x).upper())
'''
For categorical variables, follow the steps below:
1, if the variable has more than 5 distinct values, we calculate the bad rate and encode the variable with it
2, otherwise:
(2.1) check the maximum bin, and delete the variable if the maximum bin occupies more than 90%
(2.2) check the bad percentage of each bin; if any bin has 0 bad samples, combine it with the bin having the smallest non-zero bad rate,
and then check the maximum bin again
'''
deleted_features = [] #delete the categorical features in which a single bin occupies more than 90%
encoded_features = {}
merged_features = {}
var_IV = {} #save the IV values for binned features
var_WOE = {}
for col in list(categorical_var):    #iterate over a copy since categorical_var may shrink inside the loop
print('we are processing {}'.format(col))
if len(set(trainData[col]))>5:
print('{} is encoded with bad rate'.format(col))
col0 = str(col)+'_encoding'
#(1), calculate the bad rate and encode the original value using bad rate
encoding_result = BadRateEncoding(trainData, col, 'target')
trainData[col0], br_encoding = encoding_result['encoding'],encoding_result['br_rate']
#(2), push the bad rate encoded value into the numerical variable list
numerical_var.append(col0)
#(3), save the encoding result, including new column name and bad rate
encoded_features[col] = [col0, br_encoding]
#(4), delete the original value
del trainData[col]
deleted_features.append(col)
else:
maxPcnt = MaximumBinPcnt(trainData, col)
if maxPcnt > 0.9:
print('{} is deleted because of large percentage of single bin'.format(col))
deleted_features.append(col)
categorical_var.remove(col)
del trainData[col]
continue
bad_bin = trainData.groupby([col])['target'].sum()
if min(bad_bin) == 0:
print('{} has 0 bad sample!'.format(col))
col1 = str(col) + '_mergeByBadRate'
#(1), determine how to merge the categories
mergeBin = MergeBad0(trainData, col, 'target')
#(2), convert the original data into merged data
trainData[col1] = trainData[col].map(mergeBin)
maxPcnt = MaximumBinPcnt(trainData, col1)
if maxPcnt > 0.9:
print('{} is deleted because of large percentage of single bin'.format(col))
deleted_features.append(col)
categorical_var.remove(col)
del trainData[col]
continue
#(3) if the merged data satisfies the requirement, we keep it
merged_features[col] = [col1, mergeBin]
WOE_IV = CalcWOE(trainData, col1, 'target')
var_WOE[col1] = WOE_IV['WOE']
var_IV[col1] = WOE_IV['IV']
del trainData[col]
deleted_features.append(col)
else:
WOE_IV = CalcWOE(trainData, col, 'target')
var_WOE[col] = WOE_IV['WOE']
var_IV[col] = WOE_IV['IV']
'''
For continuous variables, we do the following:
1, split the variable by ChiMerge (into 5 bins by default)
2, check the bad rate; if it is not monotone, we decrease the number of bins until it is
3, delete the variable if the maximum bin occupies more than 90%
'''
var_cutoff = {}
for col in list(numerical_var):    #iterate over a copy since numerical_var may shrink inside the loop
print("{} is in processing".format(col))
col1 = str(col) + '_Bin'
#(1), split the continuous variable and save the cutoff points. In particular, -1 is a special case and we separate it into its own group
if -1 in set(trainData[col]):
special_attribute = [-1]
else:
special_attribute = []
cutOffPoints = ChiMerge_MaxInterval(trainData, col, 'target',special_attribute=special_attribute)
var_cutoff[col] = cutOffPoints
trainData[col1] = trainData[col].map(lambda x: AssignBin(x, cutOffPoints,special_attribute=special_attribute))
#(2), check whether the bad rate is monotone
BRM = BadRateMonotone(trainData, col1, 'target',special_attribute=special_attribute)
if not BRM:
for bins in range(4,1,-1):
cutOffPoints = ChiMerge_MaxInterval(trainData, col, 'target',max_interval = bins,special_attribute=special_attribute)
trainData[col1] = trainData[col].map(lambda x: AssignBin(x, cutOffPoints,special_attribute=special_attribute))
BRM = BadRateMonotone(trainData, col1, 'target',special_attribute=special_attribute)
if BRM:
break
var_cutoff[col] = cutOffPoints
#(3), check whether any single bin occupies more than 90% of the total
maxPcnt = MaximumBinPcnt(trainData, col1)
if maxPcnt > 0.9:
del trainData[col1]
deleted_features.append(col)
numerical_var.remove(col)
print('we delete {} because the maximum bin occupies more than 90%'.format(col))
continue
WOE_IV = CalcWOE(trainData, col1, 'target')
var_IV[col] = WOE_IV['IV']
var_WOE[col] = WOE_IV['WOE']
del trainData[col]
trainData.to_csv('C:/Users/MacBook/Desktop/allData_2a.csv', header=True,encoding='gbk', columns = trainData.columns, index=False)
filewrite = open('C:/Users/MacBook/Desktop/var_WOE.pkl','wb')    #pickle requires binary mode
pickle.dump(var_WOE, filewrite)
filewrite.close()
filewrite = open('C:/Users/MacBook/Desktop/var_IV.pkl','wb')    #pickle requires binary mode
pickle.dump(var_IV, filewrite)
filewrite.close()
#########################################################
# Step 4: Select variables with IV > 0.02 and assign WOE#
#########################################################
trainData = pd.read_csv('C:/Users/MacBook/Desktop/allData_2a.csv', header=0, encoding='gbk')
num2str = ['SocialNetwork_13','SocialNetwork_12','UserInfo_6','UserInfo_5','UserInfo_10','UserInfo_17','city_match']
for col in num2str:
trainData[col] = trainData[col].map(lambda x: str(x))
for col in var_WOE.keys():
print(col)
col2 = str(col)+"_WOE"
if col in var_cutoff.keys():
cutOffPoints = var_cutoff[col]
special_attribute = []
if -1 in cutOffPoints:
special_attribute = [-1]
binValue = trainData[col].map(lambda x: AssignBin(x, cutOffPoints,special_attribute=special_attribute))
trainData[col2] = binValue.map(lambda x: var_WOE[col][x])
else:
trainData[col2] = trainData[col].map(lambda x: var_WOE[col][x])
trainData.to_csv('C:/Users/MacBook/Desktop/allData_3.csv', header=True,encoding='gbk', columns = trainData.columns, index=False)
### (i) select the features with IV above the threshold
iv_threshould = 0.02
varByIV = [k for k, v in var_IV.items() if v > iv_threshould]
### (ii) check the collinearity of any pair of the features with WOE after (i)
var_IV_selected = {k:var_IV[k] for k in varByIV}
var_IV_sorted = sorted(var_IV_selected.items(), key=lambda d: d[1], reverse=True)
var_IV_sorted = [i[0] for i in var_IV_sorted]
removed_var = []
roh_thresould = 0.6
for i in range(len(var_IV_sorted)-1):
if var_IV_sorted[i] not in removed_var:
x1 = var_IV_sorted[i]+"_WOE"
for j in range(i+1,len(var_IV_sorted)):
if var_IV_sorted[j] not in removed_var:
x2 = var_IV_sorted[j] + "_WOE"
roh = np.corrcoef([trainData[x1], trainData[x2]])[0, 1]
if abs(roh) >= roh_thresould:
print('the correlation coefficient between {0} and {1} is {2}'.format(x1, x2, str(roh)))
if var_IV[var_IV_sorted[i]] > var_IV[var_IV_sorted[j]]:
removed_var.append(var_IV_sorted[j])
else:
removed_var.append(var_IV_sorted[i])
var_IV_sortet_2 = [i for i in var_IV_sorted if i not in removed_var]
### (iii) check multicollinearity via the variance inflation factor, VIF_k = 1/(1 - R_k^2), and flag VIF > 10
for i in range(len(var_IV_sortet_2)):
x0 = trainData[var_IV_sortet_2[i]+'_WOE']
x0 = np.array(x0)
X_Col = [k+'_WOE' for k in var_IV_sortet_2 if k != var_IV_sortet_2[i]]
X = trainData[X_Col]
X = np.array(X)    #np.matrix is deprecated
regr = LinearRegression()
clr= regr.fit(X, x0)
x_pred = clr.predict(X)
R2 = 1 - ((x_pred - x0) ** 2).sum() / ((x0 - x0.mean()) ** 2).sum()
vif = 1/( 1-R2)
if vif > 10:
print("Warning: the vif for {0} is {1}".format(var_IV_sortet_2[i], vif))
#############################################################################################################
# Step 5: build the logistic regression using selected variables after single analysis and multiple analysis#
#############################################################################################################
### (1) put all the features after single & multiple analysis into logistic regression
var_WOE_list = [i+'_WOE' for i in var_IV_sortet_2]
y = trainData['target']
X = trainData[var_WOE_list]
X['intercept'] = [1]*X.shape[0]
LR = sm.Logit(y, X).fit()
summary = LR.summary()
pvals = LR.pvalues
pvals = pvals.to_dict()
### Some features are not significant, so we delete the insignificant features one by one.
varLargeP = {k: v for k,v in pvals.items() if v >= 0.1}
varLargeP = sorted(varLargeP.items(), key=lambda d: d[1], reverse=True)
while(len(varLargeP) > 0 and len(var_WOE_list) > 0):
# In each iteration, we remove the most insignificant feature and build the regression again, until
# (1) all the features are significant or
# (2) no feature to be selected
varMaxP = varLargeP[0][0]
if varMaxP == 'intercept':
print('the intercept is not significant!')
break
var_WOE_list.remove(varMaxP)
y = trainData['target']
X = trainData[var_WOE_list]    #note: use the updated var_WOE_list, not var_WOE
X['intercept'] = [1] * X.shape[0]
LR = sm.Logit(y, X).fit()
summary = LR.summary()
pvals = LR.pvalues
pvals = pvals.to_dict()
varLargeP = {k: v for k, v in pvals.items() if v >= 0.1}
varLargeP = sorted(varLargeP.items(), key=lambda d: d[1], reverse=True)
'''
Now all the features are significant and the signs of the coefficients are negative
var_WOE_list = ['UserInfo_15_encoding_WOE', u'ThirdParty_Info_Period6_10_WOE', u'ThirdParty_Info_Period5_2_WOE', 'UserInfo_16_encoding_WOE', 'WeblogInfo_20_encoding_WOE',
'UserInfo_7_encoding_WOE', u'UserInfo_17_WOE', u'ThirdParty_Info_Period3_10_WOE', u'ThirdParty_Info_Period1_10_WOE', 'WeblogInfo_2_encoding_WOE',
'UserInfo_1_encoding_WOE']
'''
saveModel = open('C:/Users/MacBook/Desktop/LR_Model_Normal.pkl','wb')    #pickle requires binary mode
pickle.dump(LR,saveModel)
saveModel.close()
######################################################################################################
# Step 6(a): build the logistic regression using lasso and weights based on variables given in Step 5#
######################################################################################################
### use cross validation to select the best regularization parameter
X = trainData[var_WOE_list] #by default LogisticRegressionCV() will fit the intercept
X = np.array(X)    #np.matrix is deprecated
y = trainData['target']
y = np.array(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
X_train.shape, y_train.shape
model_parameter = {}
for C_penalty in np.arange(0.005, 0.2,0.005):
for bad_weight in range(2, 101, 2):
LR_model_2 = LogisticRegressionCV(Cs=[C_penalty], penalty='l1', solver='liblinear', class_weight={1:bad_weight, 0:1})
LR_model_2_fit = LR_model_2.fit(X_train,y_train)
y_pred = LR_model_2_fit.predict_proba(X_test)[:,1]
scorecard_result = pd.DataFrame({'prob':y_pred, 'target':y_test})
performance = KS_AR(scorecard_result,'prob','target')
KS = performance['KS']
model_parameter[(C_penalty, bad_weight)] = KS
####################################################################################
# Step 6(b): build the logistic regression according to RF feature importance#
####################################################################################
### build random forest model to estimate the importance of each feature
### In this case we use the original features with WOE encoding before single analysis
X = trainData[var_WOE_list]
X = np.array(X)    #np.matrix is deprecated
y = trainData['target']
y = np.array(y)
RFC = RandomForestClassifier()
RFC_Model = RFC.fit(X,y)
features_rfc = trainData[var_WOE_list].columns
featureImportance = {features_rfc[i]:RFC_Model.feature_importances_[i] for i in range(len(features_rfc))}
featureImportanceSorted = sorted(featureImportance.items(), key=lambda x: x[1], reverse=True)
# we select the top 10 features
features_selection = [k[0] for k in featureImportanceSorted[:10]]
y = trainData['target']
X = trainData[features_selection]
X['intercept'] = [1]*X.shape[0]
LR = sm.Logit(y, X).fit()
summary = LR.summary()
"""
Logit Regression Results
==============================================================================
Dep. Variable: target No. Observations: 30000
Model: Logit Df Residuals: 29989
Method: MLE Df Model: 10
Date: Wed, 26 Apr 2017 Pseudo R-squ.: 0.05762
Time: 19:26:13 Log-Likelihood: -7407.3
converged: True LL-Null: -7860.2
LLR p-value: 3.620e-188
==================================================================================================
coef std err z P>|z| [0.025 0.975]
--------------------------------------------------------------------------------------------------
UserInfo_1_encoding_WOE -1.0433 0.135 -7.756 0.000 -1.307 -0.780
WeblogInfo_20_encoding_WOE -0.9011 0.089 -10.100 0.000 -1.076 -0.726
UserInfo_15_encoding_WOE -0.9184 0.069 -13.215 0.000 -1.055 -0.782
UserInfo_7_encoding_WOE -0.9891 0.096 -10.299 0.000 -1.177 -0.801
UserInfo_16_encoding_WOE -0.9492 0.099 -9.603 0.000 -1.143 -0.756
ThirdParty_Info_Period1_10_WOE -0.5942 0.143 -4.169 0.000 -0.874 -0.315
ThirdParty_Info_Period2_10_WOE -0.0650 0.165 -0.395 0.693 -0.388 0.257
ThirdParty_Info_Period3_10_WOE -0.2052 0.136 -1.511 0.131 -0.471 0.061
ThirdParty_Info_Period6_10_WOE -0.6902 0.090 -7.682 0.000 -0.866 -0.514
ThirdParty_Info_Period5_10_WOE -0.4018 0.100 -4.017 0.000 -0.598 -0.206
intercept -2.5382 0.024 -107.939 0.000 -2.584 -2.492
==================================================================================================
"""
model function
import pandas as pd
import random
import numpy as np
### Calculate the cumulative frequencies of events for each selected time window
def TimeWindowSelection(df, daysCol, time_windows):
'''
:param df: the dataset containing the variable of days
:param daysCol: the column of days
:param time_windows: the list of time window
:return: a dictionary mapping each time window to the number of records within it
'''
freq_tw = {}
for tw in time_windows:
freq = sum(df[daysCol].apply(lambda x: int(x<=tw)))
freq_tw[tw] = freq
return freq_tw
def ChangeContent(x):
y = x.upper()
if y == '_MOBILEPHONE':
y = '_PHONE'
return y
def MissingCategorial(df,x):
missing_vals = df[x].map(lambda v: int(v != v))    #NaN is the only value that does not equal itself
return sum(missing_vals)*1.0/df.shape[0]
def MissingContinuous(df,x):
missing_vals = df[x].map(lambda v: int(np.isnan(v)))
return sum(missing_vals) * 1.0 / df.shape[0]
def MakeupRandom(x, sampledList):
if x==x:
return x
else:
randIndex = random.randint(0, len(sampledList)-1)
return sampledList[randIndex]
def AssignBin(x, cutOffPoints,special_attribute=[]):
'''
:param x: the value of variable
:param cutOffPoints: the ChiMerge result for the continuous variable
:param special_attribute: the special attributes which should be assigned separately
:return: bin number, indexing from 0
for example, if cutOffPoints = [10,20,30], if x = 7, return Bin 0. If x = 35, return Bin 3
'''
numBin = len(cutOffPoints) + 1 + len(special_attribute)
if x in special_attribute:
i = special_attribute.index(x)+1
return 'Bin {}'.format(0-i)
if x<=cutOffPoints[0]:
return 'Bin 0'
elif x > cutOffPoints[-1]:
return 'Bin {}'.format(numBin-1)
else:
for i in range(0,numBin-1):
if cutOffPoints[i] < x <= cutOffPoints[i+1]:
return 'Bin {}'.format(i+1)
def MaximumBinPcnt(df,col):
N = df.shape[0]
total = df.groupby([col])[col].count()
pcnt = total*1.0/N
return max(pcnt)
def CalcWOE(df, col, target):
'''
:param df: dataframe containing feature and target
:param col: the feature for which WOE and IV are calculated, usually categorical type
:param target: good/bad indicator
:return: WOE and IV in a dictionary
'''
total = df.groupby([col])[target].count()
total = pd.DataFrame({'total': total})
bad = df.groupby([col])[target].sum()
bad = pd.DataFrame({'bad': bad})
regroup = total.merge(bad, left_index=True, right_index=True, how='left')
regroup.reset_index(level=0, inplace=True)
N = sum(regroup['total'])
B = sum(regroup['bad'])
regroup['good'] = regroup['total'] - regroup['bad']
G = N - B
regroup['bad_pcnt'] = regroup['bad'].map(lambda x: x*1.0/B)
regroup['good_pcnt'] = regroup['good'].map(lambda x: x * 1.0 / G)
try:
regroup['WOE'] = regroup.apply(lambda x: np.log(x.good_pcnt*1.0/x.bad_pcnt),axis = 1)
except ZeroDivisionError as e:
print(e)
WOE_dict = regroup[[col,'WOE']].set_index(col).to_dict(orient='index')
for k, v in WOE_dict.items():
WOE_dict[k] = v['WOE']
IV = regroup.apply(lambda x: (x.good_pcnt-x.bad_pcnt)*np.log(x.good_pcnt*1.0/x.bad_pcnt),axis = 1)
IV = sum(IV)
return {"WOE": WOE_dict, 'IV':IV}
def BadRateEncoding(df, col, target):
'''
:param df: dataframe containing feature and target
:param col: the feature that needs to be encoded with bad rate, usually categorical type
:param target: good/bad indicator
:return: the assigned bad rate to encode the categorical feature
'''
total = df.groupby([col])[target].count()
total = pd.DataFrame({'total': total})
bad = df.groupby([col])[target].sum()
bad = pd.DataFrame({'bad': bad})
regroup = total.merge(bad, left_index=True, right_index=True, how='left')
regroup.reset_index(level=0, inplace=True)
regroup['bad_rate'] = regroup.apply(lambda x: x.bad*1.0/x.total,axis = 1)
br_dict = regroup[[col,'bad_rate']].set_index([col]).to_dict(orient='index')
for k, v in br_dict.items():
br_dict[k] = v['bad_rate']
badRateEnconding = df[col].map(lambda x: br_dict[x])
return {'encoding':badRateEnconding, 'br_rate':br_dict}
def Chi2(df, total_col, bad_col, overallRate):
'''
:param df: the dataset containing the total count and bad count
:param total_col: total count of each value in the variable
:param bad_col: bad count of each value in the variable
:param overallRate: the overall bad rate of the training set
:return: the chi-square value
'''
df2 = df.copy()
df2['expected'] = df[total_col].apply(lambda x: x*overallRate)
combined = zip(df2['expected'], df2[bad_col])
chi = [(i[0]-i[1])**2/i[0] for i in combined]
chi2 = sum(chi)
return chi2
def AssignGroup(x, bin):
N = len(bin)
if x<=min(bin):
return min(bin)
elif x>max(bin):
return 10e10
else:
for i in range(N-1):
if bin[i] < x <= bin[i+1]:
return bin[i+1]
### ChiMerge_MaxInterval: split the continuous variable using Chi-square value by specifying the max number of intervals
def ChiMerge_MaxInterval(df, col, target, max_interval=5,special_attribute=[]):
'''
:param df: the dataframe containing the column to be split and the 1-0 target column
:param col: the column to be split
:param target: target column with 1-0
:param max_interval: the maximum number of intervals. If the raw column has no more distinct attributes than this, no merging is performed
:return: the combined bins
'''
colLevels = sorted(list(set(df[col])))
N_distinct = len(colLevels)
if N_distinct <= max_interval:  #no merging is needed when the number of levels does not exceed max_interval
print("The number of original levels for {} is less than or equal to max intervals".format(col))
return colLevels[:-1]
else:
if len(special_attribute)>=1:
df1 = df.loc[df[col].isin(special_attribute)]
df2 = df.loc[~df[col].isin(special_attribute)]
else:
df2 = df.copy()
N_distinct = len(list(set(df2[col])))
# Step 1: group the dataset by col and work out the total count & bad count in each level of the raw column
if N_distinct > 100:
ind_x = [int(i / 100.0 * N_distinct) for i in range(1, 100)]
split_x = [colLevels[i] for i in ind_x]
df2['temp'] = df2[col].map(lambda x: AssignGroup(x, split_x))
else:
df2['temp'] = df2[col]
total = df2.groupby(['temp'])[target].count()
total = pd.DataFrame({'total': total})
bad = df2.groupby(['temp'])[target].sum()
bad = pd.DataFrame({'bad': bad})
regroup = total.merge(bad, left_index=True, right_index=True, how='left')
regroup.reset_index(level=0, inplace=True)
N = sum(regroup['total'])
B = sum(regroup['bad'])
# the overall bad rate will be used in calculating expected bad count
try:
overallRate = B * 1.0 / N
except ZeroDivisionError as e:
print(e)
# initially, each single attribute forms a single interval
# since we always combined the neighbours of intervals, we need to sort the attributes
colLevels = sorted(list(set(df2['temp'])))
groupIntervals = [[i] for i in colLevels]
groupNum = len(groupIntervals)
#the final number of split intervals should be the specified max intervals minus the number of special attributes
split_intervals = max_interval - len(special_attribute)
while (len(groupIntervals) > split_intervals): # the termination condition: the number of intervals is equal to the pre-specified threshold
# in each step of the iteration, we calculate the chi-square value of each attribute
chisqList = []
for interval in groupIntervals:
df2b = regroup.loc[regroup['temp'].isin(interval)]
chisq = Chi2(df2b, 'total', 'bad', overallRate)
chisqList.append(chisq)
# find the interval corresponding to the minimum chi-square, and combine it with the neighbour having the smaller chi-square
min_position = chisqList.index(min(chisqList))
if min_position == 0:
combinedPosition = 1
elif min_position == groupNum - 1:
combinedPosition = min_position - 1
else:
if chisqList[min_position - 1] <= chisqList[min_position + 1]:
combinedPosition = min_position - 1
else:
combinedPosition = min_position + 1
groupIntervals[min_position] = groupIntervals[min_position] + groupIntervals[combinedPosition]
# after combining two intervals, we need to remove one of them
groupIntervals.remove(groupIntervals[combinedPosition])
groupNum = len(groupIntervals)
groupIntervals = [sorted(i) for i in groupIntervals]
cutOffPoints = [max(i) for i in groupIntervals[:-1]]
cutOffPoints = special_attribute + cutOffPoints
return cutOffPoints
## determine whether the bad rate is monotone along the sortByVar
def BadRateMonotone(df, sortByVar, target,special_attribute = []):
'''
:param df: the dataset contains the column which should be monotone with the bad rate and bad column
:param sortByVar: the column which should be monotone with the bad rate
:param target: the bad column
:param special_attribute: some attributes should be excluded when checking monotone
:return: True if the bad rate is monotone along sortByVar, otherwise False
'''
df2 = df.loc[~df[sortByVar].isin(special_attribute)]
df2 = df2.sort_values([sortByVar])
total = df2.groupby([sortByVar])[target].count()
total = pd.DataFrame({'total': total})
bad = df2.groupby([sortByVar])[target].sum()
bad = pd.DataFrame({'bad': bad})
regroup = total.merge(bad, left_index=True, right_index=True, how='left')
regroup.reset_index(level=0, inplace=True)
combined = zip(regroup['total'],regroup['bad'])
badRate = [x[1]*1.0/x[0] for x in combined]
badRateMonotone = [badRate[i]<badRate[i+1] for i in range(len(badRate)-1)]
Monotone = len(set(badRateMonotone))
if Monotone == 1:
return True
else:
return False
### If we find any categories with 0 bad samples, we combine them with the category having the smallest non-zero bad rate
def MergeBad0(df,col,target):
'''
:param df: dataframe containing feature and target
:param col: the categorical feature whose zero-bad categories need merging
:param target: good/bad indicator
:return: a dictionary mapping each original category to its merged bin
'''
total = df.groupby([col])[target].count()
total = pd.DataFrame({'total': total})
bad = df.groupby([col])[target].sum()
bad = pd.DataFrame({'bad': bad})
regroup = total.merge(bad, left_index=True, right_index=True, how='left')
regroup.reset_index(level=0, inplace=True)
regroup['bad_rate'] = regroup.apply(lambda x: x.bad*1.0/x.total,axis = 1)
regroup = regroup.sort_values(by = 'bad_rate').reset_index(drop=True)    #reset the index so the positional lookups below are correct
col_regroup = [[i] for i in regroup[col]]
for i in range(regroup.shape[0]):
col_regroup[1] = col_regroup[0] + col_regroup[1]
col_regroup.pop(0)
if regroup['bad_rate'][i+1] > 0:
break
newGroup = {}
for i in range(len(col_regroup)):
for g2 in col_regroup[i]:
newGroup[g2] = 'Bin '+str(i)
return newGroup
### Calculate the KS and AR for the scorecard model
def KS_AR(df, score, target):
'''
:param df: the dataset containing the predicted probability and the bad indicator
:param score: the column of scores or predicted probabilities
:param target: the bad indicator
:return: the AR and KS statistics
'''
total = df.groupby([score])[target].count()
bad = df.groupby([score])[target].sum()
all = pd.DataFrame({'total':total, 'bad':bad})
all['good'] = all['total'] - all['bad']
all[score] = all.index
all = all.sort_values(by=score,ascending=False)
all.index = range(len(all))
all['badCumRate'] = all['bad'].cumsum() / all['bad'].sum()
all['goodCumRate'] = all['good'].cumsum() / all['good'].sum()
all['totalPcnt'] = all['total'] / all['total'].sum()
arList = [0.5 * all.loc[0, 'badCumRate'] * all.loc[0, 'totalPcnt']]
for j in range(1, len(all)):
ar0 = 0.5 * sum(all.loc[j - 1:j, 'badCumRate']) * all.loc[j, 'totalPcnt']
arList.append(ar0)
arIndex = (2 * sum(arList) - 1) / (all['good'].sum() * 1.0 / all['total'].sum())
KS = all.apply(lambda x: x.badCumRate - x.goodCumRate, axis=1)
return {'AR':arIndex, 'KS': max(KS)}
chimerge
import pandas as pd
# Calculate the chi-square value
def Chi2(df, total_col, bad_col, overallRate):
'''
:param df: the dataset containing the total count and bad count
:param total_col: total count of each value in the variable
:param bad_col: bad count of each value in the variable
:param overallRate: the overall bad rate of the training set
:return: the chi-square value
'''
df2 = df.copy()
df2['expected'] = df[total_col].apply(lambda x: x*overallRate)
combined = zip(df2['expected'], df2[bad_col])
chi = [(i[0]-i[1])**2/i[0] for i in combined]
chi2 = sum(chi)
return chi2
# Split by a maximum number of intervals
### ChiMerge_MaxInterval: split the continuous variable using Chi-square value by specifying the max number of intervals
def ChiMerge_MaxInterval(df, col, target, max_interval = 5):
'''
:param df: the dataframe containing the column to be split and the 1-0 target column
:param col: the column to be split
:param target: target column with 1-0
:param max_interval: the maximum number of intervals. If the raw column has no more distinct attributes than this, no merging is performed
:return: the combined bins
'''
colLevels = set(df[col])
if len(colLevels) <= max_interval: #no merging is needed when the number of levels does not exceed max_interval
print("The number of original levels for {} is less than or equal to max intervals".format(col))
return []
else:
#Step 1: group the dataset by col and work out the total count & bad count in each level of the raw column
total = df.groupby([col])[target].count()
total = pd.DataFrame({'total':total})
bad = df.groupby([col])[target].sum()
bad = pd.DataFrame({'bad':bad})
regroup = total.merge(bad,left_index=True,right_index=True, how='left')
regroup.reset_index(level=0, inplace=True)
N = sum(regroup['total'])
B = sum(regroup['bad'])
#the overall bad rate will be used in calculating expected bad count
overallRate = B*1.0/N
# since we always combined the neighbours of intervals, we need to sort the attributes
colLevels =sorted(list(colLevels))
# initially, each single attribute forms a single interval
groupIntervals = [[i] for i in colLevels]
groupNum = len(groupIntervals)
while(len(groupIntervals)>max_interval): #the termination condition: the number of intervals is equal to the pre-specified threshold
# in each step of iteration, we calcualte the chi-square value of each atttribute
chisqList = []
for interval in groupIntervals:
df2 = regroup.loc[regroup[col].isin(interval)]
chisq = Chi2(df2, 'total','bad',overallRate)
chisqList.append(chisq)
#find the interval corresponding to minimum chi-square, and combine with the neighbore with smaller chi-square
min_position = chisqList.index(min(chisqList))
if min_position == 0:
combinedPosition = 1
elif min_position == groupNum - 1:
combinedPosition = min_position -1
else:
if chisqList[min_position - 1]<=chisqList[min_position + 1]:
combinedPosition = min_position - 1
else:
combinedPosition = min_position + 1
groupIntervals[min_position] = groupIntervals[min_position]+groupIntervals[combinedPosition]
# after combining two intervals, we need to remove one of them
groupIntervals.remove(groupIntervals[combinedPosition])
groupNum = len(groupIntervals)
return groupIntervals
# Split by a chi-square threshold
### ChiMerge_MinChisq: split the continuous variable using the Chi-square value by specifying the minimum chi-square threshold
def ChiMerge_MinChisq(df, col, target, confidenceVal = 3.841):
'''
:param df: the dataframe containing the column to be split and the 1-0 target column
:param col: the column to be split
:param target: target column with 1-0
:param confidenceVal: the chi-square threshold; the default corresponds to 1 degree of freedom at a 0.95 confidence level
:return: the splitted bins
'''
colLevels = set(df[col])
total = df.groupby([col])[target].count()
total = pd.DataFrame({'total':total})
bad = df.groupby([col])[target].sum()
bad = pd.DataFrame({'bad':bad})
regroup = total.merge(bad,left_index=True,right_index=True, how='left')
regroup.reset_index(level=0, inplace=True)
N = sum(regroup['total'])
B = sum(regroup['bad'])
overallRate = B*1.0/N
colLevels =sorted(list(colLevels))
groupIntervals = [[i] for i in colLevels]
groupNum = len(groupIntervals)
while(1): #termination: all the attributes form a single interval, or every chi-square value is above the threshold
if len(groupIntervals) == 1:
break
chisqList = []
for interval in groupIntervals:
df2 = regroup.loc[regroup[col].isin(interval)]
chisq = Chi2(df2, 'total','bad',overallRate)
chisqList.append(chisq)
min_position = chisqList.index(min(chisqList))
if min(chisqList) >=confidenceVal:
break
if min_position == 0:
combinedPosition = 1
elif min_position == groupNum - 1:
combinedPosition = min_position -1
else:
if chisqList[min_position - 1]<=chisqList[min_position + 1]:
combinedPosition = min_position - 1
else:
combinedPosition = min_position + 1
groupIntervals[min_position] = groupIntervals[min_position]+groupIntervals[combinedPosition]
groupIntervals.remove(groupIntervals[combinedPosition])
groupNum = len(groupIntervals)
return groupIntervals
model test
import pandas as pd
from pandas import DataFrame
import datetime
import collections
import numpy as np
import numbers
import random
from pandas.plotting import scatter_matrix
from itertools import combinations
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import sys
import pickle
# reload(sys)                        # Python 2 only
# sys.setdefaultencoding("utf-8")    # Python 2 only
sys.path.append(path+"/Notes/07 申请评分卡中的数据预处理和特征衍生/")
from scorecard_fucntions import *
# -*- coding: utf-8 -*-
###############################################################################################
# Step 0: Reading the raw testing data, which have the same structure as the training datasets#
###############################################################################################
data1b = pd.read_csv(path+'/数据/bank default/PD-Second-Round-Data/first round test data/LogInfo_9w_2.csv', header = 0)
data2b = pd.read_csv(path+'/数据/bank default/PD-Second-Round-Data/first round test data/Kesci_Master_9w_gbk_2.csv', header = 0,encoding = 'gbk')
data3b = pd.read_csv(path+'/数据/bank default/PD-Second-Round-Data/first round test data/Userupdate_Info_9w_2.csv', header = 0)
#################################################################################
# Step 1: Derive the features in the same way as for the training dataset#
#################################################################################
### Extract the applying date of each applicant
data1b['logInfo'] = data1b['LogInfo3'].map(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d'))
data1b['Listinginfo'] = data1b['Listinginfo1'].map(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d'))
data1b['ListingGap'] = data1b[['logInfo','Listinginfo']].apply(lambda x: (x[1]-x[0]).days,axis = 1)
'''
We use 180 days as the maximum time window for the features derived from data1b.
The time windows are 7, 30, 60, 90, 120, 150 and 180 days.
Within each selected time window we calculate the total count and the distinct count of each raw field.
'''
time_window = [7, 30, 60, 90, 120, 150, 180]
var_list = ['LogInfo1','LogInfo2']
data1bGroupbyIdx = pd.DataFrame({'Idx':data1b['Idx'].drop_duplicates()})
for tw in time_window:
data1b['TruncatedLogInfo'] = data1b['Listinginfo'].map(lambda x: x + datetime.timedelta(-tw))
temp = data1b.loc[data1b['logInfo'] >= data1b['TruncatedLogInfo']]
for var in var_list:
#count the frequencies of LogInfo1 and LogInfo2
count_stats = temp.groupby(['Idx'])[var].count().to_dict()
data1bGroupbyIdx[str(var)+'_'+str(tw)+'_count'] = data1bGroupbyIdx['Idx'].map(lambda x: count_stats.get(x,0))
# count the distinct value of LogInfo1 and LogInfo2
Idx_UserupdateInfo1 = temp[['Idx', var]].drop_duplicates()
uniq_stats = Idx_UserupdateInfo1.groupby(['Idx'])[var].count().to_dict()
data1bGroupbyIdx[str(var) + '_' + str(tw) + '_unique'] = data1bGroupbyIdx['Idx'].map(lambda x: uniq_stats.get(x,0))
# calculate the average count of each value in LogInfo1 and LogInfo2
data1bGroupbyIdx[str(var) + '_' + str(tw) + '_avg_count'] = data1bGroupbyIdx[[str(var)+'_'+str(tw)+'_count',str(var) + '_' + str(tw) + '_unique']].\
apply(lambda x: x[0]*1.0/x[1], axis=1)
data3b['ListingInfo'] = data3b['ListingInfo1'].map(lambda x: datetime.datetime.strptime(x,'%Y/%m/%d'))
data3b['UserupdateInfo'] = data3b['UserupdateInfo2'].map(lambda x: datetime.datetime.strptime(x,'%Y/%m/%d'))
data3b['ListingGap'] = data3b[['UserupdateInfo','ListingInfo']].apply(lambda x: (x[1]-x[0]).days,axis = 1)
data3b['UserupdateInfo1'] = data3b['UserupdateInfo1'].map(ChangeContent)
data3bGroupbyIdx = pd.DataFrame({'Idx':data3b['Idx'].drop_duplicates()})
time_window = [7, 30, 60, 90, 120, 150, 180]
for tw in time_window:
data3b['TruncatedLogInfo'] = data3b['ListingInfo'].map(lambda x: x + datetime.timedelta(-tw))
temp = data3b.loc[data3b['UserupdateInfo'] >= data3b['TruncatedLogInfo']]
#frequency of updating
freq_stats = temp.groupby(['Idx'])['UserupdateInfo1'].count().to_dict()
data3bGroupbyIdx['UserupdateInfo_'+str(tw)+'_freq'] = data3bGroupbyIdx['Idx'].map(lambda x: freq_stats.get(x,0))
# number of updated types
Idx_UserupdateInfo1 = temp[['Idx','UserupdateInfo1']].drop_duplicates()
uniq_stats = Idx_UserupdateInfo1.groupby(['Idx'])['UserupdateInfo1'].count().to_dict()
data3bGroupbyIdx['UserupdateInfo_' + str(tw) + '_unique'] = data3bGroupbyIdx['Idx'].map(lambda x: uniq_stats.get(x, 0))
#average count of each type
data3bGroupbyIdx['UserupdateInfo_' + str(tw) + '_avg_count'] = data3bGroupbyIdx[['UserupdateInfo_'+str(tw)+'_freq', 'UserupdateInfo_' + str(tw) + '_unique']]. \
apply(lambda x: x[0] * 1.0 / x[1], axis=1)
#whether the applicant changed items like IDNUMBER,HASBUYCAR, MARRIAGESTATUSID, PHONE
Idx_UserupdateInfo1['UserupdateInfo1'] = Idx_UserupdateInfo1['UserupdateInfo1'].map(lambda x: [x])
Idx_UserupdateInfo1_V2 = Idx_UserupdateInfo1.groupby(['Idx'])['UserupdateInfo1'].sum()
for item in ['_IDNUMBER','_HASBUYCAR','_MARRIAGESTATUSID','_PHONE']:
item_dict = Idx_UserupdateInfo1_V2.map(lambda x: int(item in x)).to_dict()
data3bGroupbyIdx['UserupdateInfo_' + str(tw) + str(item)] = data3bGroupbyIdx['Idx'].map(lambda x: item_dict.get(x, 0))
# Combine the above features with raw features in PPD_Training_Master_GBK_3_1_Training_Set
allData = pd.concat([data2b.set_index('Idx'), data3bGroupbyIdx.set_index('Idx'), data1bGroupbyIdx.set_index('Idx')],axis= 1)
allData.to_csv(path+'/数据/bank default/allData_0_Test.csv',encoding = 'gbk')
####################################################
# Step 2: Make up missing values for continuous variables#
####################################################
#adjust the string-type variables, especially converting nan to 'NAN' so the values can be looked up in the mapping dictionaries
testData = pd.read_csv(path+'/数据/bank default/allData_0_Test.csv',header = 0,encoding = 'gbk')
changedCols = ['WeblogInfo_20', 'UserInfo_17']
for col in changedCols:
testData[col] = testData[col].map(lambda x: str(x).upper())
allFeatures = list(testData.columns)
allFeatures.remove('ListingInfo')
allFeatures.remove('target')
allFeatures.remove('Idx')
### read the saved WOE encoding dictionary ###
fread = open(path+'/数据/bank default/var_WOE.pkl','rb')    #pickle requires binary mode
WOE_dict = pickle.load(fread)
fread.close()
### the below features are selected into the scorecard model in Step 5
var_WOE_model = ['UserInfo_15_encoding_WOE', u'ThirdParty_Info_Period6_10_WOE', u'ThirdParty_Info_Period5_2_WOE', 'UserInfo_16_encoding_WOE', 'WeblogInfo_20_encoding_WOE',
'UserInfo_7_encoding_WOE', u'UserInfo_17_WOE', u'ThirdParty_Info_Period3_10_WOE', u'ThirdParty_Info_Period1_10_WOE', 'WeblogInfo_2_encoding_WOE',
'UserInfo_1_encoding_WOE']
#some features are categorical and we need to encode them
var_encoding = [i.replace('_WOE','').replace('_encoding','') for i in var_WOE_model if i.find('_encoding')>=0]
for col in var_encoding:
print(col)
# encoded_features and var_cutoff are produced in the training script (Step 3) and are
# assumed to still be available in this session (or persisted alongside var_WOE)
[col1, encode_dict] = encoded_features[col]
testData[col1] = testData[col].map(lambda x: encode_dict.get(str(x),-99999))
col2 = str(col1) + "_WOE"
cutOffPoints = var_cutoff[col1]
special_attribute = []
if -1 in cutOffPoints:
special_attribute = [-1]
binValue = testData[col1].map(lambda x: AssignBin(x, cutOffPoints, special_attribute=special_attribute))
testData[col2] = binValue.map(lambda x: WOE_dict[col1][x])
#other features can be mapped to WOE directly
var_others = [i.replace('_WOE','').replace('_encoding','') for i in var_WOE_model if i.find('_encoding') < 0]
for col in var_others:
print(col)
col2 = str(col) + "_WOE"
if col in var_cutoff.keys():
cutOffPoints = var_cutoff[col]
special_attribute = []
if -1 in cutOffPoints:
special_attribute = [-1]
binValue = testData[col].map(lambda x: AssignBin(x, cutOffPoints, special_attribute=special_attribute))
testData[col2] = binValue.map(lambda x: WOE_dict[col][x])
else:
testData[col2] = testData[col].map(lambda x: WOE_dict[col][x])
### make the design matrix
X = testData[var_WOE_model]
X['intercept'] = [1]*X.shape[0]
y = testData['target']
#### load the training model
saveModel = open(path+'/数据/bank default/LR_Model_Normal.pkl','rb')    #pickle requires binary mode
LR = pickle.load(saveModel)
saveModel.close()
y_pred = LR.predict(X)
scorecard_result = pd.DataFrame({'prob':y_pred, 'target':y})
# we check the performance of the model using KS and AR
# both indices should be above 30%
performance = KS_AR(scorecard_result,'prob','target')
print "KS and AR for the scorecard in the test dataset are %.0f%% and %.0f%%"%(performance['AR']*100,performance['KS']*100)