[Study Notes] Financial Risk Control in Practice

Reference:
Zero-Basic Introduction to Financial Risk Control - Loan Default Prediction

Import packages

import pandas as pd
import matplotlib.pyplot as plt
# Read the data
train = pd.read_csv('train.csv')
testA = pd.read_csv('testA.csv')
print('Train data shape:', train.shape)
print('testA data shape:', testA.shape)
train.head()
testA.head()

Examples of classification evaluation metric calculations

# Confusion matrix
import numpy as np
from sklearn.metrics import confusion_matrix
"""
                    Predicted: negative   Predicted: positive
Actual: negative    true negatives        false positives
Actual: positive    false negatives       true positives
"""
y_pred = [0, 1, 0, 1]
y_true = [0, 1, 1, 0]
print('Confusion matrix:\n', confusion_matrix(y_true, y_pred))
# Accuracy
from sklearn.metrics import accuracy_score
y_pred = [0, 1, 0, 1]
y_true = [0, 1, 1, 0]
print('ACC:', accuracy_score(y_true, y_pred))
# Precision, Recall, F1-score
from sklearn import metrics
y_pred = [0, 1, 0, 1]
y_true = [0, 1, 1, 0]
print('Precision:', metrics.precision_score(y_true, y_pred))
print('Recall:', metrics.recall_score(y_true, y_pred))
print('F1-score:', metrics.f1_score(y_true, y_pred))
# P-R curve
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
y_pred = [0, 1, 1, 0, 1, 1, 0, 1, 1, 1]
y_true = [0, 1, 1, 0, 1, 0, 1, 1, 0, 1]
precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
plt.plot(recall, precision)
# ROC curve
from sklearn.metrics import roc_curve
y_pred = [0, 1, 1, 0, 1, 1, 0, 1, 1, 1]
y_true = [0, 1, 1, 0, 1, 0, 1, 1, 0, 1]
FPR, TPR, thresholds = roc_curve(y_true, y_pred)
plt.title('ROC')
plt.plot(FPR, TPR, 'b')          # solid blue line
plt.plot([0, 1], [0, 1], 'r--')  # dashed red diagonal
plt.ylabel('TPR')
plt.xlabel('FPR')
# AUC
from sklearn.metrics import roc_auc_score
y_true = np.array([0, 0, 1, 1])
y_scores = np.array([0.1, 0.4, 0.35, 0.8])
print('AUC score:', roc_auc_score(y_true, y_scores))
# Maximum KS value: in practice the KS statistic is usually derived from the ROC curve
from sklearn.metrics import roc_curve
y_pred = [0, 1, 1, 0, 1, 1, 0, 1, 1, 1]
y_true = [0, 1, 1, 0, 1, 0, 1, 1, 1, 1]
FPR, TPR, thresholds = roc_curve(y_true, y_pred)
KS = abs(FPR - TPR).max()
print('Maximum KS value:', KS)

data analysis

"""
1.EDA价值主要在于熟悉了解整个数据集的基本情况(缺失值,异常值),对数据集进行验证是否可以进行接
下来的机器学习或者深度学习建模.
2.了解变量间的相互关系、变量与预测值之间的存在关系。
3.为特征工程做准备
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import warnings
warnings.filterwarnings('ignore')
data_train = pd.read_csv('./train.csv')
data_test_a = pd.read_csv('./testA.csv')
# Check the current working directory with os.getcwd()
import os
os.getcwd()
# Read only the first few rows with the nrows parameter
data_train_sample = pd.read_csv('./train.csv', nrows=5)
data_train_sample
# Control the size of each iteration chunk with the chunksize parameter
chunker = pd.read_csv('./train.csv', chunksize=5)
for item in chunker:
    print(type(item)) # <class 'pandas.core.frame.DataFrame'>
    print(len(item))  # 5
    display(item)
    break

general understanding

data_test_a.shape
data_train.shape
data_train.columns
# Check the data types
data_train.info()
# Basic statistics for each numerical feature
data_train.describe()
# data_train.head(3).append(data_train.tail(3))  # append will be deprecated in future pandas versions
pd.concat([data_train.head(3), data_train.tail(3)])  # recommended instead

View feature missing values, unique values, etc. in the dataset

# Check the missing values
print(f'There are {data_train.isnull().any().sum()} columns in the train dataset with missing values.')
# Further check for features whose missing rate exceeds 50%
have_null_fea_dict = (data_train.isnull().sum() / len(data_train)).to_dict()
fea_null_moreThanHalf = {}
for key, value in have_null_fea_dict.items():
    if value > 0.5:
        fea_null_moreThanHalf[key] = value
fea_null_moreThanHalf  # none
# Look at the specific missing features and their missing rates
# Visualize the NaN ratios
missing = data_train.isnull().sum() / len(data_train)
missing = missing[missing > 0]
missing.sort_values(inplace=True)
missing.plot.bar()
"""
Find out which columns contain NaN and print their counts. The main point is to check whether the number of NaNs is really large:
if it is small, filling is usually fine; tree models such as LightGBM can handle missing values directly and let the trees optimize
around them; but if there are too many NaNs, consider dropping the column.
"""
# Check features that have only one value in the training and test sets
one_value_fea = [col for col in data_train.columns if data_train[col].nunique() <= 1]  # number of unique values
one_value_fea_test = [col for col in data_test_a.columns if data_test_a[col].nunique() <= 1]
print(one_value_fea)
print(one_value_fea_test)

Summary: 22 of the 47 columns contain missing values, which is normal in the real world. 'policyCode' has only one distinct value (or is entirely missing). There are many continuous
variables and some categorical variables.

Check which features are numeric and which are of object type

  1. Features are generally either categorical or numerical.
  2. Categorical features sometimes have no numeric ordering and sometimes do. For example, for grades A, B, C, etc. in 'grade', whether they are just
    simple categories or whether A is better than the others should be decided together with business judgment.
  3. Numerical features can be fed into the model directly, but risk control analysts often bin them, convert them into WOE codes, and then build standard scorecards and similar artifacts. From the perspective of model performance, feature binning mainly reduces the complexity of variables, reduces the impact of variable noise on the model, and improves the correlation between independent and dependent variables, making the model more stable (a small WOE sketch follows below).
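The WOE coding mentioned in item 3 can be illustrated with a minimal sketch. It assumes data_train with the 'grade' and 'isDefault' columns loaded as above; the helper name calc_woe and the epsilon smoothing are illustrative, not part of the original notes.

# Minimal WOE/IV sketch (assumes data_train with 'grade' and 'isDefault' is already loaded)
import numpy as np

def calc_woe(df, bin_col, target='isDefault', eps=1e-6):
    grouped = df.groupby(bin_col)[target].agg(['sum', 'count'])
    bad = grouped['sum']                         # defaults per bin
    good = grouped['count'] - grouped['sum']     # non-defaults per bin
    bad_rate = (bad + eps) / (bad.sum() + eps)   # share of all bad samples falling in this bin
    good_rate = (good + eps) / (good.sum() + eps)
    woe = np.log(bad_rate / good_rate)           # WOE per bin
    iv = ((bad_rate - good_rate) * woe).sum()    # information value of the whole feature
    return woe, iv

woe_grade, iv_grade = calc_woe(data_train, 'grade')
print(woe_grade)
print('IV of grade:', iv_grade)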
# Numerical features
numerical_fea = list(data_train.select_dtypes(exclude=['object']).columns)
print(numerical_fea)
# Categorical features
category_fea = list(filter(lambda x: x not in numerical_fea, list(data_train.columns)))
category_fea
data_train.grade
# Analysis of numerical variables:
# 1. Split numerical variables into continuous and discrete ones
# Filter discrete numerical features: if a feature has at most 10 distinct values it is treated as discrete, otherwise as continuous
def get_numerical_serial_fea(data,feas):
    numerical_serial_fea = []
    numerical_noserial_fea = []
    for fea in feas:
        temp = data[fea].nunique()
        if temp <= 10:
            numerical_noserial_fea.append(fea)
            continue
        numerical_serial_fea.append(fea)
    return numerical_serial_fea, numerical_noserial_fea

numerical_serial_fea, numerical_noserial_fea = get_numerical_serial_fea(data_train,numerical_fea)
print(numerical_serial_fea)
print(numerical_noserial_fea)
# Analysis of discrete numerical variables
data_train['term'].value_counts()
data_train['homeOwnership'].value_counts()
data_train['isDefault'].value_counts()
data_train['initialListStatus'].value_counts()
data_train['applicationType'].value_counts()
data_train['policyCode'].value_counts()  # useless variable: a single value
data_train['n11'].value_counts()  # extremely imbalanced, no further analysis needed
data_train['n12'].value_counts()  # extremely imbalanced, no further analysis needed
# Analysis of continuous numerical variables
# Visualize the distribution of every continuous feature
f = pd.melt(data_train, value_vars=numerical_serial_fea)
g = sns.FacetGrid(f, col='variable', col_wrap=2, sharex=False, sharey=False)  # structured multi-plot grid: initialize the FacetGrid object
g = g.map(sns.distplot, 'value')  # apply sns.distplot to each data subset
# References: https://blog.csdn.net/weixin_43618989/article/details/105613021
# https://zhuanlan.zhihu.com/p/484363632
"""
1. 查看某一个数值型变量的分布,查看变量是否符合正态分布,如果不符合正态分布的变量可以log化后再观察下是否符合正态分布。
2. 如果想统一处理一批数据变标准化 必须把这些之前已经正态化的数据提出
"""
# 绘制交易数据(loanAmnt)分布
plt.figure(figsize=(16, 12)) # 画布大小,宽为16,高为12
plt.suptitle('Transaction Values Distribution', fontsize=22) # 标题
plt.subplot(221) # 整个画布分为两行两列,当前子图位于第一个位置
sub_plot_1 = sns.distplot(data_train['loanAmnt']) # 该子图上绘制直方图,默认包含核密度曲线
sub_plot_1.set_title('loanAmnt Distribution', fontsize=18)
sub_plot_1.set_xlabel('')
sub_plot_1.set_ylabel('Probability', fontsize=15)

plt.subplot(222)
sub_plot_2 = sns.distplot(np.log(data_train['loanAmnt']))
sub_plot_2.set_title("loanAmnt (Log) Distribuition", fontsize=18)
sub_plot_2.set_xlabel("")
sub_plot_2.set_ylabel("Probability", fontsize=15)
# Analysis of categorical features
category_fea
data_train['grade'].value_counts()  # sorted by count in descending order by default
data_train['subGrade'].value_counts()
data_train['employmentLength'].value_counts()
data_train['issueDate'].value_counts()
data_train['earliesCreditLine'].value_counts()

Summarize:

  1. Above we used functions such as value_counts() to look at the distribution of feature values, but charts are the most convenient way to summarize the raw information.
  2. Bare numbers give little intuition when they cannot be taken in at a glance.
  3. For the same dataset, different scales reveal different patterns. Python turns the data into graphs, but it is up to you to make sure the conclusions drawn from them are correct.

Variable Distribution Visualization

Univariate Distribution Visualization

plt.figure(figsize=(8, 8))
sns.barplot(data_train['employmentLength'].value_counts(dropna=False)[:20],  # .values is optional here
            data_train['employmentLength'].value_counts(dropna=False).keys()[:20])
plt.show()

Visualize the distribution of a certain feature of x according to the different y values

# 1. First look at the distribution of categorical variables for different values of y
train_loan_fr = data_train[data_train['isDefault'] == 1]  # .loc is optional here
train_loan_nofr = data_train.loc[data_train['isDefault'] == 0]
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 8))
# Optimization: sort the index
# Count of each grade among defaulting (fraud) users
train_loan_fr.groupby('grade').size().sort_index(ascending=False).plot(kind='barh', ax=ax1, title='Count of grade fraud')
# Count of each grade among non-defaulting (non-fraud) users
train_loan_nofr.groupby('grade').size().sort_index(ascending=False).plot(kind='barh', ax=ax2, title='Count of grade non-fraud')
# Count of employment length among defaulting users
employmentLength_index = train_loan_fr.groupby('employmentLength').size().index
b = np.arange(11)
b[0], b[1], b[10] = b[10], b[0], b[1]
train_loan_fr.groupby('employmentLength').size().reindex(employmentLength_index[b][::-1]).plot(kind='barh', ax=ax3, title='Count of employmentLength fraud')
# Count of employment length among non-defaulting users
train_loan_nofr.groupby('employmentLength').size().reindex(employmentLength_index[b][::-1]).plot(kind='barh', ax=ax4, title='Count of employmentLength non-fraud')
plt.show()
# 2. Look at the distribution of continuous variables for different values of y
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
data_train[data_train['isDefault'] == 1]['loanAmnt'].apply(np.log).plot(kind='hist',
          bins = 100,
          title = 'Log Loan Amt - Fraud',
          color = 'r',
          xlim = (-3, 10),
          ax = axes[0])
data_train[data_train['isDefault'] == 0]['loanAmnt'].apply(np.log).plot(kind='hist',
          bins = 100,
          title = 'Log Loan Amt - Not Fraud',
          color = 'b',
          xlim = (-3, 10),
          ax = axes[1])
"""
图一:绘制欺诈用户和非欺诈用户对应的数量,并在图上标注百分比
"""
total = len(data_train)
print(total_amt)
plt.figure(figsize=(12, 5))
plt.subplot(121) # 下面的图画在第一个坐标轴上
plot_tr = sns.countplot(x='isDefault', data=data_train) #
plot_tr.set_title("Fraud Loan Distribution \n 0: good user | 1: bad user", fontsize=14)
plot_tr.set_xlabel("Is fraud by count", fontsize=16)
plot_tr.set_ylabel('Count', fontsize=16)
for p in plot_tr.patches:
    height = p.get_height()
    plot_tr.text(p.get_x()+p.get_width()/2.,
            height,
            '{:1.2f}%'.format(height/total*100),
            ha="center", fontsize=15) # ha是horizontal alignment(水平对齐)的含义


"""
图二:绘制欺诈用户和非欺诈用户对应的贷款金额总数,并在图上标注百分比
"""
total_amt = data_train['loanAmnt'].sum() # 总的loanAmnt
percent_amt = (data_train.groupby(['isDefault'])['loanAmnt'].sum())
percent_amt = percent_amt.reset_index()
plt.subplot(122) # 下面的图画在第二个坐标轴上
plot_tr_2 = sns.barplot(x='isDefault', y='loanAmnt', data=percent_amt)
plot_tr_2.set_title("Total Amount in loanAmnt \n 0: good user | 1: bad user", fontsize=14)
plot_tr_2.set_xlabel("Is fraud by percent", fontsize=16)
plot_tr_2.set_ylabel('Total Loan Amount Scalar', fontsize=16)
for p in plot_tr_2.patches:
    height = p.get_height()
    plot_tr_2.text(p.get_x()+p.get_width()/2.,
            height,
            '{:1.2f}%'.format(height/total_amt * 100),
            ha="center", fontsize=15)

Time format data processing and viewing

# Training set: using the datetime type, compute the number of days between the loan issue date and a fixed start date, and add it as a new column
# issueDate: loan issue date
data_train['issueDate'] = pd.to_datetime(data_train['issueDate'], format='%Y-%m-%d')  # convert to datetime
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')  # fixed start date, converted from string to datetime
data_train['issueDateDT'] = data_train['issueDate'].apply(lambda x: x-startdate).dt.days  # days since the start date, added as a new column
# Test set: same processing
data_test_a['issueDate'] = pd.to_datetime(data_test_a['issueDate'], format='%Y-%m-%d')  # erratum: convert the test set's own column, not data_train
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
data_test_a['issueDateDT'] = data_test_a['issueDate'].apply(lambda x: x-startdate).dt.days
plt.hist(data_train['issueDateDT'], label='train')  # histogram of the training set
plt.hist(data_test_a['issueDateDT'], label='test')
plt.legend()  # add a legend
plt.title('Distribution of issueDateDT dates');
# The issueDateDT ranges of train and test overlap, so a time-based split is not a sensible validation strategy

Mastering pivot tables helps us understand the data better

# Pivot table: the index can contain several columns, "columns" is optional, and the aggregation function aggfunc is applied to the items listed in "values".
# Here the row index is 'grade' (loan grade), the column index is 'issueDateDT' (days since the start date), and the values are 'loanAmnt' (loan amount) aggregated with np.sum
pivot = pd.pivot_table(data_train, index=['grade'], columns=['issueDateDT'], values=['loanAmnt'], aggfunc=np.sum)
pivot

Generate data reports with pandas_profiling

import pandas_profiling
pfr = pandas_profiling.ProfileReport(data_train)
pfr.to_file('./example.html')

Summarize

Exploratory data analysis is the stage where we gain a preliminary understanding of the data and become familiar with it in preparation for feature engineering. In many cases the features extracted in the EDA stage can even be used directly as rules, which shows how important EDA is. The main work at this stage is to use simple statistics to understand the data as a whole, analyze the relationships between the various types of variables, and visualize them with appropriate charts for intuitive inspection. I hope this section helps beginners, and I look forward to suggestions on its shortcomings.

Task3 feature engineering

learning target

  1. Learn feature processing methods such as feature preprocessing, missing value and outlier handling, and data bucketing
  2. Learn the corresponding methods for feature interaction, encoding, and selection
  3. Complete the corresponding learning tasks; the two optional assignments are not mandatory and are meant for students with spare time to explore on their own

Introduction

  1. Data preprocessing:
    a. Filling of missing values
    b. Time format processing
    c. Conversion of object-type features to numeric values
  2. Outlier processing:
    a. Based on the 3-sigma rule
    b. Based on box plots
  3. Data binning
    a. Fixed-width binning
    b. Quantile binning
    • Binning discrete numeric data
    • Binning continuous numeric data
    c. Chi-square binning (optional homework)
  4. Feature interaction
    a. Combinations between features
    b. Features derived from other features
    c. Attempts at deriving other features (optional homework)
  5. Feature encoding
    a. one-hot encoding
    b. label-encode encoding
  6. Feature selection
    a. Filter
    b. Wrapper (RFE)
    c. Embedded

code example

import package and read data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import chi2
# from sklearn.preprocessing import MinMaxScaler
# import xgboost as xgb
# import lightgbm as lgb
# from catboost import CatBoostRegressor
# from sklearn.model_selection import StratifiedKFold, KFold
# from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
import warnings
warnings.filterwarnings('ignore')
data_train =pd.read_csv('./train.csv')
data_test_a = pd.read_csv('./testA.csv')

feature preprocessing

  1. In the EDA part we already gained an understanding of the data and of some feature distributions. In the preprocessing part we generally deal with the problems identified during EDA. Here we cover filling missing values, converting time-format features, and processing certain object-type features.

Data preprocessing is an essential part of a competition. How missing values are filled often affects the result, so it is worth trying several filling strategies and comparing them to pick the best one. Compared with real-world data, competition data is relatively "clean", but there will still be some "dirty" records, and cleaning a few outliers can often bring unexpected gains.

# Find the object-type and non-object-type features in the data
numerical_fea = list(data_train.select_dtypes(exclude=['object']).columns)  # non-object (numeric) features
category_fea = list(filter(lambda x: x not in numerical_fea, list(data_train.columns)))  # object-type features
# Alternatively: category_fea = list(data_train.select_dtypes(include=['object']).columns)
label = 'isDefault'
numerical_fea.remove(label)  # remove the label column

Missing value filling

  1. Replace all missing values with a specified value, e.g. 0
    data_train = data_train.fillna(0)

  2. Fill missing values vertically with the value above them (forward fill)
    data_train = data_train.fillna(axis=0, method='ffill')

  3. Fill missing values vertically with the value below them (backward fill), filling at most two consecutive missing values
    data_train = data_train.fillna(axis=0, method='bfill', limit=2)

References: https://www.jb51.net/article/255677.htm

# Check the missing values
data_train.isnull().sum()
# Fill numerical features with the median
data_train.fillna(data_train[numerical_fea].median(), inplace=True)  # optimization
data_test_a.fillna(data_train[numerical_fea].median(), inplace=True)
# Fill categorical features with the mode
data_train.fillna(data_train[category_fea].mode().iloc[0], inplace=True)  # erratum fix
data_test_a.fillna(data_train[category_fea].mode().iloc[0], inplace=True)
data_train.isnull().sum()

Time Format Handling

# Using the datetime type, compute the number of days between the loan issue date and a fixed start date, and add it as a new column
for data in [data_train, data_test_a]:
    data['issueDate'] = pd.to_datetime(data['issueDate'], format='%Y-%m-%d')
    startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
    # construct the time feature
    data['issueDateDT'] = data['issueDate'].apply(lambda x: x-startdate).dt.days

Converting object-type features to numeric values

data_train['employmentLength'].value_counts(dropna=False).sort_index()
def employmentLength_to_int(s):
    if pd.isnull(s):
        return s
    else:
        return int(s.split()[0])  # np.int is deprecated; the built-in int works the same here

for data in [data_train, data_test_a]:
    data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
    data['employmentLength'].replace('< 1 year', '0 years', inplace=True)
    data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)
data['employmentLength'].value_counts(dropna=False).sort_index()
# Preprocess earliesCreditLine (the month in which the borrower's earliest reported credit line was opened)
data_train['earliesCreditLine'].sample(5)  # sample five rows
for data in [data_train, data_test_a]:
    data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))  # keep only the year
data_train['earliesCreditLine'].sample(5)  # sample five rows
# cate_features
category_fea

Categorical feature processing

# Some categorical features
cate_features = ['grade', 'subGrade', \
                 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode', 'regionCode', \
                 'applicationType', 'initialListStatus', 'title', 'policyCode']
for f in cate_features:
    print(f, 'number of distinct values:', data_train[f].nunique())
# Ordered categorical features such as grade can be mapped to numbers with map
for data in [data_train, data_test_a]:
    data['grade'] = data['grade'].map({'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7})
# Features with more than 2 categories that are not high-dimensional/sparse and are purely nominal (no ordering) can be converted to one-hot encoding with get_dummies
# References: https://blog.csdn.net/u010712012/article/details/83002388
# https://zhuanlan.zhihu.com/p/139144355
# Erratum:
data_train = pd.get_dummies(data_train, columns=['homeOwnership', 'verificationStatus', 'purpose', 'regionCode'])
data_test_a = pd.get_dummies(data_test_a, columns=['homeOwnership', 'verificationStatus', 'purpose', 'regionCode'])

Outlier handling

  1. When you find an outlier, first determine what caused it, and then decide how to handle it. First, if the outlier does not reflect any regularity but is an extremely occasional phenomenon, or you do not want to study such occasional cases, you can simply delete it. Second, if outliers represent a real phenomenon, they cannot be deleted casually. In fraud scenarios, fraudulent records are themselves often abnormal compared with normal data; we need to keep these abnormal points, refit the model, and study their patterns. If labels are available, use a supervised model; if not, anomaly detection algorithms can also be considered.
  2. Note: do not delete any rows from the test data.

Method 1 for detecting outliers: standard deviation (the 3-sigma rule)

In statistics, if a data distribution is approximately normal, about 68% of the values lie within one standard deviation of the mean, about 95% within two standard deviations, and about 99.7% within three standard deviations.

def find_outliers_by_3sigma(data, fea):
    data_std = np.std(data[fea])
    data_mean = np.mean(data[fea])
    outliers_cut_off = data_std * 3
    lower_rule = data_mean - outliers_cut_off
    upper_rule = data_mean + outliers_cut_off
    data[fea+'_outliers'] = data[fea].apply(lambda x: '异常值' if x > upper_rule or x < lower_rule else '正常值')  # '异常值' = outlier, '正常值' = normal value
    return data
# data_train = data_train.copy()
for item in ['homeOwnership', 'verificationStatus', 'purpose', 'regionCode']:
    numerical_fea.remove(item)
for fea in numerical_fea:
    data_train = find_outliers_by_3sigma(data_train, fea)
    print(data_train[fea+'_outliers'].value_counts())
    print(data_train.groupby(fea+'_outliers')['isDefault'].sum())  # number of defaulting users when the feature is a normal value / an outlier
    print('*'*10)
# Delete the outliers
for fea in numerical_fea:
    data_train = data_train[data_train[fea+'_outliers'] == '正常值']  # keep only the rows marked as normal values
data_train = data_train.reset_index(drop=True)  # erratum fix

Method 2 for detecting outliers: box plot

To sum it up in one sentence: the quartiles split the data into four intervals using three cut points; IQR = Q3 - Q1, lower whisker = Q1 - 1.5 x IQR, upper whisker = Q3 + 1.5 x IQR.
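
A minimal sketch of the box-plot rule, written to mirror the find_outliers_by_3sigma function above; the helper name and the use of 'loanAmnt' as the example column are illustrative, not part of the original notes.

# Box-plot (IQR) outlier detection: a minimal sketch mirroring find_outliers_by_3sigma above
def find_outliers_by_iqr(data, fea):
    q1 = data[fea].quantile(0.25)
    q3 = data[fea].quantile(0.75)
    iqr = q3 - q1
    lower_rule = q1 - 1.5 * iqr   # lower whisker
    upper_rule = q3 + 1.5 * iqr   # upper whisker
    data[fea + '_outliers_iqr'] = data[fea].apply(
        lambda x: '异常值' if x > upper_rule or x < lower_rule else '正常值')  # outlier / normal value
    return data

# Example usage on a single numerical feature
data_train = find_outliers_by_iqr(data_train, 'loanAmnt')
print(data_train['loanAmnt_outliers_iqr'].value_counts())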

Data bucketing

  1. The purpose of feature binning:

    a. From the perspective of model effect, feature binning is mainly to reduce the complexity of variables, reduce the impact of variable noise on the model, and improve the correlation between independent variables and dependent variables. This makes the model more stable.

  2. The object of data bucketing:

    a. Discretize continuous variables

    b. Merge multi-state discrete variables into few states

  3. Reasons for binning:

    a. The values of a feature may span a wide range. For methods that use Euclidean distance as a similarity measure, such as k-means clustering
    (and supervised methods with the same property), very large values can overwhelm very small ones. One solution is to quantize the values into
    intervals, i.e. data bucketing, also known as data binning, and then work with the quantized results.

  4. Advantages of binning:

    a. Handling missing values: when the data source may contain missing values, null can be treated as a separate bin.

    b. Handling outliers: outliers can be discretized away by binning, which improves the robustness (anti-interference ability) of the variable. For example, an abnormal age value of 200 can be put into the "age > 60" bin, removing its influence.

    c. Business interpretation: we are used to reasoning about variables linearly (as x grows, y grows), but in reality the relationship between x
    and y is often nonlinear, and a WOE transformation can be applied in that case.

  5. Pay special attention to the basic principles of binning (a small sketch follows this list):

    a. (1) The proportion of samples in the smallest bin should be no less than 5%

    b. (2) No bin should contain only good customers

    c. (3) The bad rate should be monotonic across consecutive bins
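
A small sketch for checking principles (1) and (3): compute the bin sizes and default rate per quantile bin and eyeball whether the rate is monotonic. It assumes data_train with 'loanAmnt' and 'isDefault' as above; the choice of 10 bins is arbitrary.

# Check bin size and default-rate monotonicity for a quantile binning (illustrative bin count)
bins = pd.qcut(data_train['loanAmnt'], 10, duplicates='drop')             # 10 quantile bins
bin_stats = data_train.groupby(bins)['isDefault'].agg(['count', 'mean'])
bin_stats['count_ratio'] = bin_stats['count'] / len(data_train)            # principle (1): each ratio should be >= 5%
bin_stats.rename(columns={'mean': 'default_rate'}, inplace=True)           # principle (3): default_rate should be monotonic
print(bin_stats)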

"""
1. 固定宽度分箱
当数值横跨多个数量级时,最好按照 10 的幂(或任何常数的幂)来进行分组:0-9、10-99、100-999、1000-9999,等
等。固定宽度分箱非常容易计算,但如果计数值中有比较大的缺口,就会产生很多没有任何数据的空箱子。
"""
# 通过除法映射到间隔均匀的分箱中,每个分箱的取值范围都是loanAmnt/1000
data['loanAmnt_bin1'] = np.floor_divide(data['loanAmnt'], 1000) # 向下取整
# 通过对数函数映射到指数宽度分箱
data['loanAmnt_bin2'] = np.floor(np.log10(data['loanAmnt']))
"""
2.分位数分箱
"""
data['loanAmnt_bin3'] = pd.qcut(data['loanAmnt'], 10, labels=False) # 根据分位数分10箱,且只显示第几箱(不显示箱的范围如(xxx,xxx]))
"""
3.卡方分箱及其他分箱方法的尝试(略)
"""

feature interaction

  1. Interaction features are simple to construct but expensive to use. If a linear model includes interaction feature pairs, its training and scoring
    time increases from O(n) to O(n^2), where n is the number of single features (a small sketch follows below).
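Before the target-mean encoding below, the simplest form of feature combination is to concatenate two categorical features into one key. A minimal sketch; the new column name is illustrative, and note that grade was already mapped to integers above, so the combined value looks like '1_A1'.

# Pairwise combination of two categorical features (illustrative column name)
for df in [data_train, data_test_a]:
    df['grade_subGrade'] = df['grade'].astype(str) + '_' + df['subGrade'].astype(str)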
# Map grade and subGrade to the mean of isDefault within each category, and create new columns from those means
for col in ['grade', 'subGrade']:
    # temp_dict: mapping from each value of col to the mean of isDefault
    # temp_dict = data_train.groupby([col])['isDefault'].agg(['mean']).reset_index().rename(columns={'mean': col + '_target_mean'})
    # temp_dict.index = temp_dict[col].values
    # temp_dict = temp_dict[col + '_target_mean'].to_dict()
    # The code above can be simplified to:
    temp_dict = data_train.groupby([col])['isDefault'].agg('mean').to_dict()

    data_train[col + '_target_mean'] = data_train[col].map(temp_dict)
    data_test_a[col + '_target_mean'] = data_test_a[col].map(temp_dict)

# Other derived variables: mean and std
for df in [data_train, data_test_a]:
    for item in ['n0','n1','n2','n3','n4','n5','n6','n7','n8','n9','n10','n11','n12','n13','n14']:
        df['grade_to_mean_' + item] = df['grade'] / df.groupby([item])['grade'].transform('mean')
        df['grade_to_std_' + item] = df['grade'] / df.groupby([item])['grade'].transform('std')

feature encoding

labelEncode directly into the tree model

# label-encode: subGrade, postCode, title
# High-cardinality categorical features need to be converted
for col in tqdm(['employmentTitle', 'postCode', 'title', 'subGrade']):  # tqdm displays a progress bar
    le = LabelEncoder()
    le.fit(list(data_train[col].astype(str).values) + list(data_test_a[col].astype(str).values))
    data_train[col] = le.transform(list(data_train[col].astype(str).values))
    data_test_a[col] = le.transform(list(data_test_a[col].astype(str).values))
print('Label Encoding finished')
# Reference:
# LabelEncoder and OneHotEncoder in Python data preprocessing: https://blog.csdn.net/quintind/article/details/79850455

Feature engineering to be added separately for models such as logistic regression

  1. Normalize the features and remove highly correlated features
  2. The purpose of normalization is to make training converge better and faster and to avoid problems caused by features with very different scales
  3. Removing correlated features improves the interpretability of the model and speeds up prediction
# Example of the normalization process
# Pseudocode (a runnable sketch follows below)
# for fea in [list of features to normalize]:
#     data[fea] = (data[fea] - np.min(data[fea])) / (np.max(data[fea]) - np.min(data[fea]))
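A runnable version of the pseudocode above, as a minimal sketch; feas_to_scale is a hypothetical list, and the two column names in it are only examples.

# Min-max normalization to [0, 1] for selected continuous features (feas_to_scale is hypothetical)
feas_to_scale = ['loanAmnt', 'interestRate']   # example columns; choose your own list
for fea in feas_to_scale:
    fea_min, fea_max = data_train[fea].min(), data_train[fea].max()   # statistics taken from the training set only
    for data in [data_train, data_test_a]:
        data[fea] = (data[fea] - fea_min) / (fea_max - fea_min)

Using the training-set minimum and maximum for both sets keeps the test set on the same scale without leaking its statistics into the transformation.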

feature selection

  1. Feature selection techniques prune useless features in order to reduce the complexity of the final model. The ultimate goal is a simplified model whose prediction accuracy is not reduced (or barely affected), together with faster computation. Feature selection is not mainly about reducing training time (in fact, some techniques increase the overall training time), but about reducing model scoring time.

The method of feature selection:

  1. Filter

    a. Variance selection method

    b. Correlation coefficient method (pearson correlation coefficient)

    c. Chi-square test

    d. Mutual information method

  2. Wrapper (RFE)

    a. Recursive feature elimination method

  3. Embedded

    a. Feature selection method based on penalty term

    b. Feature selection based on tree model

Filter

  1. Filtering based on the relationship between features
"""
方差选择法
1. 方差选择法中,先要计算各个特征的方差,然后根据设定的阈值,选择方差大于阈值的特征
"""
from sklearn.feature_selection import VarianceThreshold
#其中参数threshold为方差的阈值
VarianceThreshold(threshold=3).fit_transform(train,target_train)
"""
相关系数法
1. Pearson 相关系数
皮尔森相关系数是一种最简单的,可以帮助理解特征和响应变量之间关系的方法,该方法衡量的是变量之间的线性相关性。
结果的取值区间为 [-1,1] , -1 表示完全的负相关, +1表示完全的正相关,0 表示没有线性相关。
"""
from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr
#选择K个最好的特征,返回选择特征后的数据
#第一个参数为计算评估特征是否好的函数,该函数输入特征矩阵和目标向量,
#输出二元组(评分,P值)的数组,数组第i项为第i个特征的评分和P值。在此定义为计算相关系数
#参数k为选择的特征个数

SelectKBest(k=5).fit_transform(train,target_train)
"""
卡方检验
1. 经典的卡方检验是用于检验自变量对因变量的相关性。 假设自变量有N种取值,因变量有M种取值,考虑自变
量等于i且因变量等于j的样本频数的观察值与期望的差距。 其统计量如下: χ2=Σ(A−T)2T,其中A为实际值,
T为理论值
2. (注:卡方只能运用在正定矩阵上,否则会报错Input X must be non-negative)
"""
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
#参数k为选择的特征个数
SelectKBest(chi2, k=5).fit_transform(train,target_train)
"""
互信息法
1. 经典的互信息也是评价自变量对因变量的相关性的。 在feature_selection库的SelectKBest类结合最大信息系数
法可以用于选择特征,相关代码如下:
"""
from sklearn.feature_selection import SelectKBest
from minepy import MINE
#由于MINE的设计不是函数式的,定义mic方法将其为函数式的,
#返回一个二元组,二元组的第2项设置成固定的P值0.5
def mic(x, y):
    m = MINE()
    m.compute_score(x, y)
    return (m.mic(), 0.5)
#参数k为选择的特征个数
SelectKBest(lambda X, Y: array(map(lambda x:mic(x, Y), X.T)).T, k=2).fit_transform(train,target_train)

Wrapper(Recursive feature elimination,RFE)

"""
1. 递归特征消除法 递归消除特征法使用一个基模型来进行多轮训练,每轮训练后,消除若干权值系数的特征,
再基于新的特征集进行下一轮训练。 在feature_selection库的RFE类可以用于选择特征,相关代码如下(以逻辑
回归为例):
"""
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
#递归特征消除法,返回特征选择后的数据
#参数estimator为基模型
#参数n_features_to_select为选择的特征个数
RFE(estimator=LogisticRegression(),
n_features_to_select=2).fit_transform(train,target_train)

Embedded

"""
1. 基于惩罚项的特征选择法 使用带惩罚项的基模型,除了筛选出特征外,同时也进行了降维。 在
feature_selection库的SelectFromModel类结合逻辑回归模型可以用于选择特征,相关代码如下:
"""
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
#带L1惩罚项的逻辑回归作为基模型的特征选择
SelectFromModel(LogisticRegression(penalty="l1", C=0.1)).fit_transform(train,target_train)
"""
基于树模型的特征选择 树模型中GBDT也可用来作为基模型进行特征选择。 在feature_selection库的
SelectFromModel类结合GBDT模型可以用于选择特征,相关代码如下:
"""
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
#GBDT作为基模型的特征选择
SelectFromModel(GradientBoostingClassifier()).fit_transform(train,target_train)

data processing

In this dataset we drop the features that are not used by the model, fill in the missing values, then use the correlation (covariance) between features to look at their relationships, and finally train the model (a small sketch of the correlation check follows below).

# Drop the columns that are not needed
for data in [data_train, data_test_a]:
    data.drop(['issueDate', 'id'], axis=1, inplace=True)

# omitted
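
The correlation check mentioned above is omitted in the original notes; here is a minimal sketch of one common approach. The 0.95 threshold is an arbitrary example, and only numeric columns are considered.

# Look at pairwise correlations between numeric features and list highly correlated pairs (illustrative threshold)
corr = data_train.select_dtypes(exclude=['object']).corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))   # keep the upper triangle only
high_corr_cols = [col for col in upper.columns if (upper[col] > 0.95).any()]
print('Candidate columns to drop due to high correlation:', high_corr_cols)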

Summarize

Feature engineering is the most important part of machine learning, and even of deep learning, and in practice it is often the most time-consuming step. Algorithm textbooks rarely explain feature engineering in depth, because it is tied so closely to the specific data that it is hard to cover all scenarios systematically. This chapter mainly introduces some commonly used methods: the handling of missing values and outliers, for example, applies to almost any dataset, while for operations such as binning the chapter only gives a few concrete ideas that readers need to explore further on their own. Feature engineering also differs between competitions and real applications: when building financial risk control scorecards in practice, feature binning is particularly important because of the emphasis on feature interpretability. Students with enough energy can experiment more on their own. I hope you gain something from this section.

Task4 modeling and tuning

learning target

  1. Learn the machine learning models commonly used in the field of financial risk control

  2. Learning the modeling process and parameter tuning process of machine learning models

  3. Complete the corresponding learning tasks

Introduction

Introduction to Model Related Principles

logistic regression model

https://blog.csdn.net/han_xiaoyang/article/details/49123419

decision tree model

https://blog.csdn.net/c406495762/article/details/76262487

GBDT model

https://zhuanlan.zhihu.com/p/45145899

XGBoost model

https://blog.csdn.net/wuzhongqiang/article/details/104854890

LightGBM model

https://blog.csdn.net/wuzhongqiang/article/details/105350579

Catboost model

https://mp.weixin.qq.com/s/xloTLr5NJBgBspMQtxPoFA

Time Series Modeling (Optional)

RNN: https://zhuanlan.zhihu.com/p/45289691

LSTM: https://zhuanlan.zhihu.com/p/83496936

Recommended Textbooks:

"Machine Learning" https://book.douban.com/subject/26708119/

"Statistical Learning Methods" https://book.douban.com/subject/10590856/

"Feature Engineering for Machine Learning" https://book.douban.com/subject/26826639/

"Credit Scoring Model Technology and Application" https://book.douban.com/subject/1488075/

"Digital Risk Control" https://book.douban.com/subject/30282558/

Model comparison and performance evaluation

logistic regression

Pros and cons see pdf

decision tree model

Pros and cons see pdf

Ensemble models (ensemble methods)

Model Evaluation Method

Dataset partition summary

  1. When the amount of data is sufficient, the hold-out method or k-fold cross-validation is usually used to split the training/test sets;
  2. Use the bootstrap method when the dataset is small and it is hard to split training/test sets effectively;
  3. When the dataset is small but can still be split effectively, the leave-one-out method is best because it is the most accurate (a small example follows below)
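
A tiny illustration of these strategies with scikit-learn on a made-up toy array (minimal sketch, not part of the original notes):

# Hold-out, k-fold, leave-one-out, and bootstrap splits on a toy dataset
import numpy as np
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut

X = np.arange(10).reshape(-1, 1)
y = np.array([0, 1] * 5)

# 1. Hold-out: one fixed train/test split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)

# 2. k-fold cross-validation: every sample is used for validation exactly once
for tr_idx, va_idx in KFold(n_splits=5, shuffle=True, random_state=0).split(X):
    pass  # train on X[tr_idx], validate on X[va_idx]

# 3. Leave-one-out: n splits, each with a single validation sample
print('LOO splits:', LeaveOneOut().get_n_splits(X))

# 4. Bootstrap: sample n rows with replacement; the rows never drawn (about 36.8%) form the out-of-bag validation set
boot_idx = np.random.choice(len(X), size=len(X), replace=True)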

Model Evaluation Criteria

AUC

code example

Import related packages and related settings

import pandas as pd
import numpy as np
import warnings
import os
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
"""
sns 相关设置
"""
# 声明使用 Seaborn 样式
sns.set()
# 有五种seaborn的绘图风格,它们分别是:darkgrid, whitegrid, dark, white, ticks。默认的主题是darkgrid。
sns.set_style("whitegrid")
# 有四个预置的环境,按大小从小到大排列分别为:paper, notebook, talk, poster。其中,notebook是默认的。
sns.set_context('talk')
# 中文字体设置-黑体
plt.rcParams['font.sans-serif'] = ['SimHei']
# 解决保存图像是负号'-'显示为方块的问题
plt.rcParams['axes.unicode_minus'] = False
# 解决Seaborn中文显示问题并调整字体大小
sns.set(font='SimHei')

read data

# The reduce_mem_usage function reduces the memory footprint of a DataFrame by downcasting its data types
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum()/1024/1024
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        # non-object (numeric) columns
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
            
    end_mem = df.memory_usage().sum()/1024/1024
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

# Read the data
train = pd.read_csv('./train.csv')
train = reduce_mem_usage(train)
test = pd.read_csv('./testA.csv')
test = reduce_mem_usage(test)

simple modeling

Tips 1: Most real financial risk control projects involve credit scoring, so the model features need good interpretability. As a result, most real projects
use logistic regression as the baseline model. In a competition, however, the score is the only criterion and strict interpretability is not required, so most
models are based on ensemble algorithms.

Tips 2: Because of the characteristics of the logistic regression algorithm, outliers and missing values need to be handled in advance [see the Task 3 part].

Tips 3: Because of the characteristics of tree models, the handling of outliers and missing values can be skipped, but students familiar with the business can also handle them themselves, and the result may be better
than letting the model handle them.

Note: The source data for the modeling below has already gone through the feature engineering of the baseline; no additional handling of outliers or missing values has been performed.

# Preliminary steps before modeling
from sklearn.model_selection import KFold
# Separate the dataset to make cross-validation easier
X_train = train.drop(['id', 'issueDate', 'isDefault'], axis=1)
X_test = test.drop(['id', 'issueDate'], axis=1)
y_train = train.loc[:, 'isDefault']
# Modeling with LightGBM
from sklearn.model_selection import train_test_split
import lightgbm as lgb
# Split the training set into a training part and a validation part
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2)
train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
valid_matrix = lgb.Dataset(X_val, label=y_val)

params = {
    'boosting_type': 'gbdt',  # gradient boosted decision trees
    'objective': 'binary',    # binary classification
    'learning_rate': 0.1,     # learning rate
    'metric': 'auc',          # evaluation metric
    'min_child_weight': 1e-3,
    'num_leaves': 31,         # number of leaves
    'max_depth': -1,
    'reg_lambda': 0,          # L2 regularization term
    'reg_alpha': 0,           # L1 regularization term
    'feature_fraction': 1,    # fraction of features sampled when building each tree
    'bagging_fraction': 1,    # fraction of samples used when building each tree
    'bagging_freq': 0,        # k means bagging is performed every k iterations
    'seed': 2020,
    'nthread': 8,
#     'silent': True,
    'verbose': -1,            # <0: fatal only, =0: errors (warnings), >0: info
}

"""使用训练集数据进行模型训练"""
model = lgb.train(params, train_set=train_matrix, valid_sets=valid_matrix, \
                  num_boost_round=20000, verbose_eval=1000, early_stopping_rounds=200) # verbose_eval是打印log的间隔
# 对验证集进行预测
from sklearn import metrics
from sklearn.metrics import roc_auc_score

"""预测并计算roc的相关指标"""
val_pre_lgb = model.predict(X_val, num_iteration=model.best_iteration) # val的预测值
fpr, tpr, threshold = metrics.roc_curve(y_val, val_pre_lgb) # 绘制roc曲线
roc_auc = metrics.auc(fpr, tpr)
print('未调参前lightgbm单模型在验证集上的AUC:{}'.format(roc_auc))
"""画出roc曲线图"""
plt.figure(figsize=(8, 8))
plt.title('Validation ROC')
plt.plot(fpr, tpr, 'b', label = 'Val AUC = %0.4f' % roc_auc)
plt.ylim(0,1)
plt.xlim(0,1)
plt.legend(loc='best')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
# 画出对角线
plt.plot([0,1],[0,1],'r--')
plt.show()
# Going further: evaluate model performance with 5-fold cross-validation
"""Modeling and prediction with LightGBM and 5-fold cross-validation"""
cv_scores = []

# 5-fold cross-validation
folds = 5
seed = 2020
kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

for i, (train_index, valid_index) in enumerate(kf.split(X_train, y_train)):
    print('************************************ {} ************************************'.format(str(i+1)))
    X_train_split, y_train_split, X_val, y_val = X_train.iloc[train_index], \
          y_train[train_index], X_train.iloc[valid_index], y_train[valid_index]
    train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
    valid_matrix = lgb.Dataset(X_val, label=y_val)

    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'learning_rate': 0.1,
        'metric': 'auc',
        'min_child_weight': 1e-3,
        'num_leaves': 31,
        'max_depth': -1,
        'reg_lambda': 0,
        'reg_alpha': 0,
        'feature_fraction': 1,
        'bagging_fraction': 1,
        'bagging_freq': 0,
        'seed': 2020,
        'nthread': 8,
        'verbose': -1,
    }

    model = lgb.train(params, train_set=train_matrix, num_boost_round=20000, \
                      valid_sets=valid_matrix, verbose_eval=1000, early_stopping_rounds=200)

    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    cv_scores.append(roc_auc_score(y_val, val_pred))

print("lgb_score_list: {}".format(cv_scores))
print("lgb_score_mean: {}".format(np.mean(cv_scores)))
print("lgb_score_std: {}".format(np.std(cv_scores)))

Model Tuning

  1. Greedy Tuning

First use the parameters that currently have the greatest impact on the model for tuning to achieve model optimization under the current parameters, and then use the parameters that have the second most impact on the model to tune, and so on until all parameters are adjusted.

The disadvantage of this method is that it may be adjusted to the local optimum instead of the global optimum, but it only needs to be optimized and debugged step by step, which is easy to understand.

Note the usual tuning order for tree-model parameters, which reflects how strongly each parameter affects the model. The commonly used parameters and the order in which they are tuned in day-to-day work are:

  1. ①:max_depth、num_leaves
  2. ②:min_data_in_leaf、min_child_weight
  3. ③:bagging_fraction、 feature_fraction、bagging_freq
  4. ④:reg_lambda、reg_alpha
  5. ⑤:min_split_gain
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMRegressor
# Note: with recent scikit-learn versions the 'roc_auc' scorer expects a classifier,
# so LGBMClassifier may be needed in place of LGBMRegressor.

# Candidate values to search over (example ranges; adjust them to your own data)
objective = ['regression', 'regression_l1', 'huber', 'fair']
num_leaves = [10, 20, 31, 55, 70]
max_depth = [3, 5, 7, 9, -1]

# Tune objective
best_obj = dict()
for obj in objective:
    model = LGBMRegressor(objective=obj)
    """Predict and compute the ROC-related metrics"""
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc').mean()
    best_obj[obj] = score

# num_leaves
best_leaves = dict()
for leaves in num_leaves:
    model = LGBMRegressor(objective=max(best_obj.items(), key=lambda x: x[1])[0],  # erratum: keep the objective with the highest AUC
                          num_leaves=leaves)
    """Predict and compute the ROC-related metrics"""
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc').mean()
    best_leaves[leaves] = score

# max_depth
best_depth = dict()
for depth in max_depth:
    model = LGBMRegressor(objective=max(best_obj.items(), key=lambda x: x[1])[0],
                          num_leaves=max(best_leaves.items(), key=lambda x: x[1])[0],
                          max_depth=depth)
    """Predict and compute the ROC-related metrics"""
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc').mean()
    best_depth[depth] = score

"""
The remaining parameters can be tuned one by one in the same way, and the score under each candidate value can be inspected visually (a plotting sketch follows below).
"""

Source: https://blog.csdn.net/zhangyifeng_1995/article/details/130078763