4.代码
利用pandas.DataFrame来做数据预处理
import pandas as pd
import os
# Work from the folder that contains the bank sample data files.
os.chdir('/Users/mac/spark/bank_sample')
# Gather the names of every entry in that folder ('.' is the directory just entered).
loanfile = os.listdir('.')
loanfile
#output:
"""
['RFM_TRAD_FLOW.csv',
'district.csv',
'clients.csv',
'Studycase_CreditCard_Python.py',
'accounts.csv',
'loans.csv',
'商业取数逻辑.xlsx',
'促销营销.py',
'trans.csv',
'Studycase_CreditCard_Pyspark.ipynb',
'card.csv',
'order.csv',
'disp.csv']
"""
找到csv流数据文件
# Load every CSV listed in `loanfile` into a module-level DataFrame named after
# the file (e.g. 'loans.csv' -> loans); the later steps refer to the tables by
# those bare names.
# At module scope globals() is the same namespace dict that locals() returned.
csv_file = globals()
for i in loanfile:
    # Check the real '.csv' extension, not just a name that happens to end in "csv".
    if i.endswith('.csv'):
        table_name = i.split('.')[0]
        # The files are read as GBK-encoded text.
        # The original stored into an undefined name (`create`); store into the
        # namespace dict bound above instead.
        csv_file[table_name] = pd.read_csv(i, encoding='gbk')
        print(table_name)
#output:
"""
RFM_TRAD_FLOW
district
clients
accounts
loans
trans
card
order
disp
"""
# Define the target Y: encode each loan status letter as a numeric label.
# A -> 0, B and D -> 1, C -> 2. Rows with status 'C' are split off later as
# the set to predict rather than being used for training.
# NOTE(review): presumably 0 = good, 1 = bad, 2 = outcome not yet known —
# confirm against the data dictionary.
bad_good = {'A': 0, 'B': 1, 'C': 2, 'D': 1}
# The original passed an undefined name (map_good) to map(); use the dict above.
loans['bad_good'] = loans['status'].map(bad_good)
loans.head()
#output
"""
loan_id account_id date amount duration payments status bad_good
0 5314 1787 1993-07-05 96396 12 8033 B 1
1 5316 1801 1993-07-11 165960 36 4610 A 0
2 6863 9188 1993-07-28 127080 60 2118 A 0
3 5325 1843 1993-08-03 105804 36 2939 A 0
4 7240 11013 1993-09-06 274740 60 4579 A 0
"""
提取数据X
属性信息
# Attach the disposition rows to each loan via account_id, then the client
# record via client_id. Left joins keep every loan row even without a match.
# NOTE(review): if disp holds several rows per account_id, each loan is repeated
# once per matching row — confirm that is intended.
data2 = (
    loans
    .merge(disp, on='account_id', how='left')
    .merge(clients, on='client_id', how='left')
)
data2.head()
"""
loan_id account_id date amount duration payments status bad_good disp_id client_id type sex birth_date district_id
0 5314 1787 1993-07-05 96396 12 8033 B 1 2166 2166 所有者 0 1947-07-22 30
1 5316 1801 1993-07-11 165960 36 4610 A 0 2181 2181 所有者 1 1968-07-22 46
2 6863 9188 1993-07-28 127080 60 2118 A 0 11006 11314 所有者 1 1936-06-02 45
3 5325 1843 1993-08-03 105804 36 2939 A 0 2235 2235 所有者 0 1940-04-20 14
4 7240 11013 1993-09-06 274740 60 4579 A 0 13231 13539 所有者 1 1978-09-07 63
"""
经济信息
# Add the district-level columns: the client's district_id is matched against
# the district table's A1 column, keeping every row of data2.
data3 = data2.merge(district, how='left', left_on='district_id', right_on='A1')
data3.head()
"""
loan_id account_id date amount duration payments status bad_good disp_id client_id ... A1 GDP A4 A10 A11 A12 A13 A14 A15 a16
0 5314 1787 1993-07-05 96396 12 8033 B 1 2166 2166 ... 30 16979 94812 81.8 9650 3.38 3.67 100 15.7 14.8
1 5316 1801 1993-07-11 165960 36 4610 A 0 2181 2181 ... 46 14111 112709 73.5 8369 1.79 2.31 117 12.7 11.6
2 6863 9188 1993-07-28 127080 60 2118 A 0 11006 11314 ... 45 12888 77917 53.5 8390 2.28 2.89 132 13.3 13.6
3 5325 1843 1993-08-03 105804 36 2939 A 0 2235 2235 ... 14 31891 177686 74.8 10045 1.42 1.71 135 18.6 17.7
4 7240 11013 1993-09-06 274740 60 4579 A 0 13231 13539 ... 63 11322 86513 50.5 8288 3.79 4.52 110 9.0 8.4
"""
# Pair each loan (account_id + loan date) with every transaction of the same
# account. Both inputs carry a 'date' column, so the result holds two date
# columns: the loan's first, the transaction's last.
data4temp1 = loans[['account_id', 'date']].merge(
    trans[['account_id', 'type', 'amount', 'balance', 'date']],
    on='account_id',
)
data4temp1
"""
account_id date type amount balance t_date
0 1787 1993-07-05 debit 1100 1100 1993-3-22
1 1787 1993-07-05 debit 9900 11000 1993-4-21
2 1787 1993-07-05 debit 5800 16800 1993-5-21
3 1787 1993-07-05 debit 3300 20100 1993-6-20
4 1787 1993-07-05 debit 42248 62348 1993-7-8
"""
# Rename the columns positionally: the second column is the loan date and the
# last one is the transaction date, which is given the name t_date.
data4temp1.columns = ['account_id', 'date', 'type', 'amount', 'balance', 't_date']
# Parse both date columns BEFORE sorting. The raw t_date values are strings that
# are not zero-padded (e.g. '1993-3-22'), so sorting them as text would order
# rows lexically instead of chronologically.
data4temp1['date'] = pd.to_datetime(data4temp1['date'])
data4temp1['t_date'] = pd.to_datetime(data4temp1['t_date'])
# Order each account's transactions by time.
data4temp1 = data4temp1.sort_values(by=['account_id', 't_date'])
data4temp1.head()
"""
account_id date type amount balance t_date
10013 2 1994-01-05 debit 20236 30744 1993-10-12
10014 2 1994-01-05 credit 3000 27744 1993-10-24
10011 2 1994-01-05 credit 4000 17774 1993-10-03
10015 2 1994-01-05 credit 15 27830 1993-10-31
10016 2 1994-01-05 debit 101 27845 1993-10-31
"""
取观察窗口的数据
import datetime
# Keep only the transactions inside the observation window: strictly before the
# loan date and less than 365 days before it.
# Both conditions are combined into ONE mask; the original chained two boolean
# selections, which applies a full-length mask to an already filtered frame and
# makes pandas reindex the mask (with a warning).
data4temp2 = data4temp1[
    (data4temp1.date > data4temp1.t_date)
    & (data4temp1.date < data4temp1.t_date + datetime.timedelta(days=365))
]
data4temp2.tail()
"""
account_id date type amount balance t_date
126295 11362 1996-12-27 debit 16141 31015 1996-09-05
126296 11362 1996-12-27 credit 129 30886 1996-09-06
126297 11362 1996-12-27 credit 330 30556 1996-09-07
126298 11362 1996-12-27 credit 56 25200 1996-09-08
126299 11362 1996-12-27 credit 5300 25256 1996-09-08
"""
账户余额的 平均余额 标准差 变异系数
# Per account: mean and standard deviation of the balance inside the window.
data4temp3 = data4temp2.groupby('account_id')['balance'].agg(
    [('avg_balance', 'mean'), ('stdev_balance', 'std')]
)
# Coefficient of variation = standard deviation / mean.
# Computed column-wise by label; the original used a row-wise apply with
# positional x[1]/x[0], which relies on deprecated positional access on a
# label-indexed Series.
data4temp3['cv_balance'] = data4temp3['stdev_balance'] / data4temp3['avg_balance']
data4temp3.head()
"""
avg_balance stdev_balance cv_balance
account_id
2 32590.759259 12061.802206 0.370099
19 25871.223684 15057.521648 0.582018
25 60792.953488 21315.720151 0.350628
37 38528.140187 22177.557191 0.575620
38 31383.581818 10950.723180 0.348932
"""
按账户汇总支出总额和收入总额(credit:income,debit:out)
# Total transaction amount per account and transaction type within the window.
data4temp4 = (
    data4temp2[['account_id', 'type', 'amount']]
    .groupby(['account_id', 'type'])
    .sum()
)
data4temp4.head()
"""
amount
account_id type
2 credit 153020
debit 276514
19 credit 198020
debit 254255
25 credit 629108
"""
长表转宽表(按type透视)
# Reshape to one row per account with one column per transaction type
# (credit / debit); a missing type for an account becomes 0.
data4temp5 = pd.pivot_table(
    data4temp4, values='amount', index='account_id', columns='type'
).fillna(0)
# Ratio of credit total to debit total, computed column-wise by label instead
# of the original row-wise apply with deprecated positional x[1]/x[0].
# NOTE(review): despite the name r_out_in, the value is credit / debit (the
# original selected ['debit', 'credit'] and divided x[1] by x[0]); kept as is.
data4temp5['r_out_in'] = data4temp5['credit'] / data4temp5['debit']
data4temp5.head()
"""
type credit debit r_out_in
account_id
2 153020.0 276514.0 0.553390
19 198020.0 254255.0 0.778824
25 629108.0 726479.0 0.865969
37 328541.0 386357.0 0.850356
38 105091.0 154300.0 0.681082
"""
整合所有有用信息
# Bring everything together: add the per-account balance statistics and the
# per-account credit/debit totals to the loan-level table. Both lookup tables
# are indexed by account_id, hence right_index=True; left joins keep all rows.
data4 = (
    data3
    .merge(data4temp3, how='left', left_on='account_id', right_index=True)
    .merge(data4temp5, how='left', left_on='account_id', right_index=True)
)
data4.head()
"""
loan_id account_id date amount duration payments status bad_good disp_id client_id ... A13 A14 A15 a16 avg_balance stdev_balance cv_balance credit debit r_out_in
0 5314 1787 1993-07-05 96396 12 8033 B 1 2166 2166 ... 3.67 100 15.7 14.8 12250.000000 8330.866301 0.680071 0.0 20100.0 0.000000
1 5316 1801 1993-07-11 165960 36 4610 A 0 2181 2181 ... 2.31 117 12.7 11.6 52083.918919 29122.031884 0.559137 164004.0 243576.0 0.673318
2 6863 9188 1993-07-28 127080 60 2118 A 0 11006 11314 ... 2.89 132 13.3 13.6 30061.041667 11520.127013 0.383224 54873.0 75146.0 0.730219
3 5325 1843 1993-08-03 105804 36 2939 A 0 2235 2235 ... 1.71 135 18.6 17.7 41297.640000 14151.357776 0.342667 86018.0 120310.0 0.714970
4 7240 11013 1993-09-06 274740 60 4579 A 0 13231 13539 ... 4.52 110 9.0 8.4 57188.185185 25256.658706 0.441641 235214.0 276327.0 0.851216
"""
计算贷存比,贷收比
# Loan amount relative to the average balance.
# Both ratios are computed column-wise by label; the original used row-wise
# apply with positional x[0]/x[1], which relies on deprecated positional access
# on a label-indexed Series.
data4['r_lb'] = data4['amount'] / data4['avg_balance']
# NOTE(review): this is credit / amount (income over loan), i.e. the inverse
# orientation of r_lb; the original computed it this way, so it is kept as is.
data4['r_lincome'] = data4['credit'] / data4['amount']
data4.head()
"""
loan_id account_id date amount duration payments status bad_good disp_id client_id ... A15 a16 avg_balance stdev_balance cv_balance credit debit r_out_in r_lb r_lincome
0 5314 1787 1993-07-05 96396 12 8033 B 1 2166 2166 ... 15.7 14.8 12250.000000 8330.866301 0.680071 0.0 20100.0 0.000000 7.869061 0.000000
1 5316 1801 1993-07-11 165960 36 4610 A 0 2181 2181 ... 12.7 11.6 52083.918919 29122.031884 0.559137 164004.0 243576.0 0.673318 3.186396 0.988214
2 6863 9188 1993-07-28 127080 60 2118 A 0 11006 11314 ... 13.3 13.6 30061.041667 11520.127013 0.383224 54873.0 75146.0 0.730219 4.227398 0.431799
3 5325 1843 1993-08-03 105804 36 2939 A 0 2235 2235 ... 18.6 17.7 41297.640000 14151.357776 0.342667 86018.0 120310.0 0.714970 2.561987 0.812994
4 7240 11013 1993-09-06 274740 60 4579 A 0 13231 13539 ... 9.0 8.4 57188.185185 25256.658706 0.441641 235214.0 276327.0 0.851216 4.804139 0.856133
"""
# Show the column labels of the assembled table data4.
data4.columns
"""
Index(['loan_id', 'account_id', 'date', 'amount', 'duration', 'payments',
'status', 'bad_good', 'disp_id', 'client_id', 'type', 'sex',
'birth_date', 'district_id', 'A1', 'GDP', 'A4', 'A10', 'A11', 'A12',
'A13', 'A14', 'A15', 'a16', 'avg_balance', 'stdev_balance',
'cv_balance', 'credit', 'debit', 'r_out_in', 'r_lb', 'r_lincome'],
dtype='object')
"""
制作模型所用数据集
# Loans whose status is not 'C' form the modelling set; the status 'C' rows are
# held back as the set to predict.
data_model = data4.loc[data4['status'] != 'C']
for_predict = data4.loc[data4['status'] == 'C']
# Draw 75% of the modelling rows at random (fixed seed, so the split repeats);
# rows whose index labels were not drawn become the test set.
train = data_model.sample(frac=0.75, random_state=1235).copy()
test = data_model.loc[~data_model.index.isin(train.index)].copy()
print('训练数据集:%s\n测试数据集:%s'%(len(train),len(test)))
#output:
"""
训练数据集:250
测试数据集:84
"""
如需数据:请添加QQ1240929749,备注:csdn数据