金融风控_特征衍生案例代码实现_放款中数据分析

 oil_data_for_tree.xlsx文件是滴滴给司机放的加油的贷款.
uid:用户id bad_ind:目标值 coupon_amount:优惠券
oil_actv_dt:放款的时间. oil_amount:消耗的油的数量 channel_code:从哪个渠道来的
create_dt:创建账户的时间 sale_amount:销售的金额 oil_code:加的是哪种油
total_oil_cnt:总共加油的数量 discount_amount:打折之后的金额 source_app:来源的APP
pay_amount_total:支付总金额 amount:还剩下多少钱 scene:场景
class_new:给用户的评级 pay_amount:花了多少钱 call_source:通话来源
import pandas as pd
import numpy as np
# Load the raw loan records: one row per fueling/repayment event per user.
data = pd.read_excel('oil_data_for_tree.xlsx')
data.head()
# This table holds B-card (in-loan) users; A-card is pre-loan, C-card is collections.
  uid oil_actv_dt create_dt total_oil_cnt pay_amount_total class_new bad_ind oil_amount discount_amount sale_amount amount pay_amount coupon_amount payment_coupon_amount channel_code oil_code scene source_app call_source
0 A8217710 2018-08-19 2018-08-17 275.0 48295495.4 B 0 3308.56 1760081.0 1796001.0 1731081.0 8655401.0 1.0 1.0 1 3 2 0 3
1 A8217710 2018-08-19 2018-08-16 275.0 48295495.4 B 0 4674.68 2487045.0 2537801.0 2437845.0 12189221.0 1.0 1.0 1 3 2 0 3
2 A8217710 2018-08-19 2018-08-15 275.0 48295495.4 B 0 1873.06 977845.0 997801.0 961845.0 4809221.0 1.0 1.0 1 2 2 0 3
3 A8217710 2018-08-19 2018-08-14 275.0 48295495.4 B 0 4837.78 2526441.0 2578001.0 2484441.0 12422201.0 1.0 1.0 1 2 2 0 3
4 A8217710 2018-08-19 2018-08-13 275.0 48295495.4 B 0 2586.38 1350441.0 1378001.0 1328441.0 6642201.0 1.0 1.0 1 2 2 0 3
# Distinct user grades present in the data (expected: {'A','B','C','D','E','F'}).
set(data.class_new)

# Regroup the columns by how they will be treated downstream:
#   org_lst  - kept as-is (ids, dates, label); not fed to the model directly
#   agg_lst  - numeric columns, aggregated per user (sum/mean/var/range ...)
#   dstc_lst - categorical columns, counted as distinct values per user
org_lst = ['uid', 'create_dt', 'oil_actv_dt', 'class_new', 'bad_ind']
agg_lst = ['oil_amount', 'discount_amount', 'sale_amount', 'amount',
           'pay_amount', 'coupon_amount', 'payment_coupon_amount']
dstc_lst = ['channel_code', 'oil_code', 'scene', 'source_app', 'call_source']

# One shallow copy with the columns in org/agg/dstc order.
df = data[org_lst + agg_lst + dstc_lst].copy()

# Missing values per column (isna is an alias of isnull);
# create_dt is the only org column with gaps: 4944 NaT values.
df.isna().sum().head()
df.info()
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50609 entries, 0 to 50608
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   uid                    50609 non-null  object        
 1   create_dt              45665 non-null  datetime64[ns]
 2   oil_actv_dt            50609 non-null  datetime64[ns]
 3   class_new              50609 non-null  object        
 4   bad_ind                50609 non-null  int64         
 5   oil_amount             45665 non-null  float64       
 6   discount_amount        45665 non-null  float64       
 7   sale_amount            45665 non-null  float64       
 8   amount                 45665 non-null  float64       
 9   pay_amount             45665 non-null  float64       
 10  coupon_amount          45665 non-null  float64       
 11  payment_coupon_amount  45663 non-null  float64       
 12  channel_code           50609 non-null  int64         
 13  oil_code               50609 non-null  int64         
 14  scene                  50609 non-null  int64         
 15  source_app             50609 non-null  int64         
 16  call_source            50609 non-null  int64         
dtypes: datetime64[ns](2), float64(7), int64(6), object(2)
memory usage: 6.6+ MB

'''
# Summary statistics (count/mean/std/quantiles) for the numeric columns.
df.describe()
  bad_ind oil_amount discount_amount sale_amount amount pay_amount coupon_amount payment_coupon_amount channel_code oil_code scene source_app call_source
count 50609.000000 45665.000000 4.566500e+04 4.566500e+04 4.566500e+04 4.566500e+04 45665.000000 45663.000000 50609.000000 50609.000000 50609.000000 50609.000000 50609.000000
mean 0.017764 425.376107 1.832017e+05 1.881283e+05 1.808673e+05 9.043344e+05 0.576853 149.395397 1.476378 1.617894 1.906519 0.306072 2.900729
std 0.132093 400.596244 2.007574e+05 2.048742e+05 1.977035e+05 9.885168e+05 0.494064 605.138823 1.511470 3.074166 0.367280 0.893682 0.726231
min 0.000000 1.000000 0.000000e+00 0.000000e+00 1.000000e+00 5.000000e+00 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 175.440000 6.039100e+04 6.200100e+04 5.976100e+04 2.988010e+05 0.000000 1.000000 1.000000 0.000000 2.000000 0.000000 3.000000
50% 0.000000 336.160000 1.229310e+05 1.279240e+05 1.209610e+05 6.048010e+05 1.000000 1.000000 1.000000 0.000000 2.000000 0.000000 3.000000
75% 0.000000 557.600000 2.399050e+05 2.454010e+05 2.360790e+05 1.180391e+06 1.000000 100.000000 1.000000 0.000000 2.000000 0.000000 3.000000
max 1.000000 7952.820000 3.916081e+06 3.996001e+06 3.851081e+06 1.925540e+07 1.000000 50000.000000 6.000000 9.000000 2.000000 3.000000 4.000000
# Non-null counts per column: ~50k rows overall, but only ~45k carry
# fueling records (create_dt and the consumption columns are missing
# for the rest).
df.count()
'''
uid                      50609
create_dt                45665
oil_actv_dt              50609
class_new                50609
bad_ind                  50609
oil_amount               45665
discount_amount          45665
sale_amount              45665
amount                   45665
pay_amount               45665
coupon_amount            45665
payment_coupon_amount    45663
channel_code             50609
oil_code                 50609
scene                    50609
source_app               50609
call_source              50609
dtype: int64

'''

对create_dt做补全,用oil_actv_dt来填补。只截取放款前6个月(180天)以内的数据:构造变量时不能直接对全部历史数据做累加,否则随着时间推移,变量分布会发生很大的变化。

# oil_actv_dt放款的日期,create_dt创建账户的日期
# 用放款日期填充创建日期
def time_isna(x, y):
    """Return *x* unless it is a missing value, in which case return *y*.

    Used to backfill a missing account-creation date (create_dt) with the
    loan date (oil_actv_dt).

    Parameters
    ----------
    x : the preferred value (possibly NaT/NaN/None)
    y : the fallback value used when *x* is missing
    """
    # pd.isna recognises NaT as well as NaN and None, unlike the original
    # str(x) == 'NaT' check, which silently missed float NaN values.
    if pd.isna(x):
        return y
    return x
# Newest records first within each uid, then backfill missing create_dt
# with the loan date and compute the day gap between the two dates.
df2 = df.sort_values(by=['uid', 'create_dt'], ascending=False)
df2['create_dt'] = df2.apply(
    lambda row: time_isna(row.create_dt, row.oil_actv_dt), axis=1)
gap = df2.oil_actv_dt - df2.create_dt
df2['dtn'] = gap.apply(lambda delta: delta.days)
# Keep only records created within 180 days (~6 months) of the loan date,
# so the derived variables stay stable over time.
# NOTE: third-party user/score data bought for B-card (post-loan) management
# is typically valid for about half a year, hence the 180-day window.
df = df2[df2['dtn'] < 180]
df.head()
  uid create_dt oil_actv_dt class_new bad_ind oil_amount discount_amount sale_amount amount pay_amount coupon_amount payment_coupon_amount channel_code oil_code scene source_app call_source dtn
50608 B96436391985035703 2018-10-08 2018-10-08 B 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4 0
50607 B96436391984693397 2018-10-11 2018-10-11 E 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4 0
50606 B96436391977217468 2018-10-17 2018-10-17 B 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4 0
50605 B96436391976480892 2018-09-28 2018-09-28 B 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4 0
50604 B96436391972106043 2018-10-19 2018-10-19 A 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4 0

特征选择 

对org_list变量求历史贷款天数的最大间隔,并且去重

# Per-user base frame: the org_lst columns plus the day gap, reduced to the
# single latest record per uid.
base = df[org_lst].copy()  # .copy() prevents SettingWithCopyWarning on the next assignment
base['dtn'] = df['dtn']    # day gap (already filtered to < 180 days)
# Newest create_dt first, then drop_duplicates(keep='first') keeps the
# latest record of every user.
base = base.sort_values(['uid', 'create_dt'], ascending=False)
base = base.drop_duplicates(['uid'], keep='first')
base.shape  # (11099, 6): ~11k unique users out of ~45k loan records

特征衍生

# Numeric feature derivation: users have multiple records, so for each uid
# compute eight aggregate statistics per numeric column in agg_lst.
# Suffix meanings:
#   _cnt  row count (NaN included)      _num  count of values > 0
#   _tot  nan-ignoring sum              _avg  nan-ignoring mean
#   _max  nan-ignoring max              _min  nan-ignoring min
#   _var  nan-ignoring variance         _ran  range (max - min)
# FIX: the range feature was originally also named i + '_var', colliding
# with the variance column; the merge then produced *_var_x / *_var_y
# suffixes. It is renamed to '_ran' here.
_stat_funcs = [
    ('_cnt', lambda s: len(s)),                       # number of records
    ('_num', lambda s: np.where(s > 0, 1, 0).sum()),  # records with value > 0
    ('_tot', lambda s: np.nansum(s)),
    ('_avg', lambda s: np.nanmean(s)),
    ('_max', lambda s: np.nanmax(s)),
    ('_min', lambda s: np.nanmin(s)),
    ('_var', lambda s: np.nanvar(s)),
    ('_ran', lambda s: np.nanmax(s) - np.nanmin(s)),  # fixed: was '_var'
]
gn = pd.DataFrame()
for i in agg_lst:
    for _suffix, _func in _stat_funcs:
        tp = df.groupby('uid')[i].apply(_func).reset_index()
        tp.columns = ['uid', i + _suffix]
        if gn.empty:
            gn = tp
        else:
            gn = pd.merge(gn, tp, on='uid', how='left')
# Columns are now 'uid' plus e.g. 'oil_amount_cnt', 'oil_amount_num', ...,
# 'oil_amount_var', 'oil_amount_ran' for every column in agg_lst.
gn.columns
gn.shape[1]  # 1 (uid) + 8 stats * 7 numeric columns = 57
# Categorical feature derivation: for every uid, count how many distinct
# values of each dstc_lst column appear among its records.
# set() acts as a DISTINCT here; len() yields the count.
gc = pd.DataFrame()
for col in dstc_lst:
    distinct = df.groupby('uid').apply(lambda g: len(set(g[col]))).reset_index()
    distinct.columns = ['uid', col + '_dstc']
    if gc.empty:
        gc = distinct
    else:
        gc = pd.merge(gc, distinct, on='uid', how='left')
gc.columns
# Index(['uid', 'channel_code_dstc', 'oil_code_dstc', 'scene_dstc',
#        'source_app_dstc', 'call_source_dstc'], dtype='object')
# Assemble the final per-user table: base info + numeric aggregates
# + categorical distinct counts, all joined on uid.
fn = base.merge(gn, on='uid').merge(gc, on='uid')
fn.shape  # (11099, 67)
fn = fn.fillna(0)  # the joins introduce NaN for users without records; fill with 0
fn.head(100)
  uid create_dt oil_actv_dt class_new bad_ind dtn oil_amount_cnt oil_amount_num oil_amount_tot oil_amount_avg ... payment_coupon_amount_avg payment_coupon_amount_max payment_coupon_amount_min payment_coupon_amount_var_x payment_coupon_amount_var_y channel_code_dstc oil_code_dstc scene_dstc source_app_dstc call_source_dstc
0 B96436391985035703 2018-10-08 2018-10-08 B 0 0 1 0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1 1 1 1 1
1 B96436391984693397 2018-10-11 2018-10-11 E 0 0 1 0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1 1 1 1 1
2 B96436391977217468 2018-10-17 2018-10-17 B 0 0 1 0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1 1 1 1 1
3 B96436391976480892 2018-09-28 2018-09-28 B 0 0 1 0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1 1 1 1 1
4 B96436391972106043 2018-10-19 2018-10-19 A 0 0 1 0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1 1 1 1 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
95 B96117370332355190 2018-10-19 2018-10-19 B 0 0 1 0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1 1 1 1 1
96 B96117370330101658 2018-10-12 2018-10-12 B 0 0 1 0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1 1 1 1 1
97 B96117370330066347 2018-10-01 2018-10-01 D 0 0 1 0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1 1 1 1 1
98 B96117370328724350 2018-09-20 2018-09-20 C 0 0 1 0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1 1 1 1 1
99 B96117370321159033 2018-10-08 2018-10-08 D 0 0 1 0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1 1 1 1 1

100 rows × 67 columns


# Feature matrix: drop identifier, date and label columns; the org_lst
# columns were kept earlier only to deduplicate down to one row per user
# (~11k users out of ~45k records) — they are not model inputs themselves.
# Result: 11099 rows x 62 feature columns.
x = fn.drop(columns=['uid', 'oil_actv_dt', 'create_dt', 'bad_ind', 'class_new'])

# Target: bad_ind (1 = bad user).
y = fn.bad_ind.copy()
from sklearn import tree
#利用回归树
'''
所谓分类树就是面向分类的,每个决策树最末端的叶子结点出来的是一个分类标签,不是0就是1或者2等类别。回归树就是面向回归的,回归就是拟合函数一样,输出连续值
'''
dtree = tree.DecisionTreeRegressor(max_depth = 2,min_samples_leaf = 500,min_samples_split = 5000)
#限制树的最大深度
#叶子最少包含样本的个数
#节点必须包含训练样本的个数
dtree = dtree.fit(x,y) #利用x的值,预测y是否是坏人
import pydotplus 
#用于在图表语言中的计算机处理和过程图表。
from IPython.display import Image
from six import StringIO
import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files/Graphviz/bin/'
#https://graphviz.org/download/,需要下载,安装到指定目录
with open("dt.dot", "w") as f:
    tree.export_graphviz(dtree, out_file=f)
dot_data = StringIO()
tree.export_graphviz(dtree, out_file=dot_data,
                         feature_names=x.columns,
                         class_names=['bad_ind'],
                         filled=True, rounded=True,
                         special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) 
Image(graph.create_png())
'''  
feature_names=x.columns 所有的特征
rounded=True  圆弧矩形
filled=True 填充颜色
decision_tree 策树的的对象名
out_file 不导出文件,选择 None
special_characters=True 格式化显示形式(作用不大)
''' 

sum(fn.bad_ind)/len(fn.bad_ind)
#0.04658077304261645

猜你喜欢

转载自blog.csdn.net/weixin_48135624/article/details/114298795