1. object型特徴量データの前処理
# Name of the target column; it must not be treated as an input feature.
label = 'isDefault'

# Split the column names by dtype: object-typed columns vs. everything else.
category_feat = data_df.select_dtypes(include=['object']).columns.tolist()
numerical_feat = data_df.select_dtypes(exclude=['object']).columns.tolist()

# The target is among the non-object columns; drop it from the feature list.
numerical_feat.remove(label)
データ型がobject型の特徴量は次のとおりです:
[ 'grade' , 'subGrade' , 'employmentLength' , 'issueDate' , 'earliesCreditLine' ]
雇用年数(年):employmentLength
# Inspect the raw label distribution, keeping missing values visible.
# The result of this expression is not stored.
data_df['employmentLength'].value_counts(dropna=False).sort_index()

# Rewrite the two labels that do not start with a plain integer so that every
# non-missing label has the form '<n> years'.
# The result is assigned back to the column on purpose: calling
# replace(..., inplace=True) on a selected column is chained assignment and
# does not update data_df when pandas Copy-on-Write is active.
data_df['employmentLength'] = data_df['employmentLength'].replace(
    {'10+ years': '10 years', '< 1 year': '0 years'}
)
def employmentLength_to_int(s):
    """Convert an employment-length label such as ``'3 years'`` to ``np.int8``.

    Values that ``pd.isnull`` reports as missing are returned unchanged.
    For any other value, the first whitespace-separated token is parsed as
    an 8-bit integer.
    """
    # Guard clause: pass missing values straight through.
    if pd.isnull(s):
        return s
    leading_token = s.split()[0]
    return np.int8(leading_token)
# Convert every employment-length label to its integer form, element by
# element; missing values are passed through by the converter.
data_df['employmentLength'] = data_df['employmentLength'].map(
    employmentLength_to_int
)
借り手の最初に報告されたクレジットラインが開設された月:earliesCreditLine
# Peek at five randomly chosen raw values; the result is not stored.
data_df['earliesCreditLine'].sample(5)

# Keep only the last four characters of each value, parsed as an integer
# (presumably the four-digit year - confirm against the raw format).
data_df['earliesCreditLine'] = data_df['earliesCreditLine'].map(
    lambda raw: int(raw[-4:])
)
ローン発行月:issueDate
# Keep only the first four characters of each value, parsed as an integer
# (presumably the four-digit year - confirm against the raw format).
# NOTE(review): the time-format step later in this file parses issueDate with
# format '%Y-%m-%d', which will not match once this column holds integers;
# confirm which of the two steps is meant to run.
data_df['issueDate'] = data_df['issueDate'].map(
    lambda raw: int(raw[:4])
)
2、カテゴリー特徴データの前処理
# Columns treated as categorical features.
cate_features = [
    'grade',
    'subGrade',
    'employmentTitle',
    'homeOwnership',
    'verificationStatus',
    'purpose',
    'postCode',
    'regionCode',
    'applicationType',
    'initialListStatus',
    'title',
    'policyCode',
]

# Report how many distinct values each categorical column has.
for feature in cate_features:
    n_unique = data_df[feature].nunique()
    print(feature, '类型个数:', n_unique)
ワンホットエンコーディング:カテゴリ数が2を超え、かつ高次元でスパースではないカテゴリ特徴量に適用します
# One-hot encode the selected categorical columns. With drop_first=True the
# first level of each column is dropped from the generated indicators.
one_hot_columns = [
    'grade',
    'subGrade',
    'homeOwnership',
    'verificationStatus',
    'purpose',
    'regionCode',
]
data_df = pd.get_dummies(data_df, columns=one_hot_columns, drop_first=True)

# For the remaining columns, derive two numeric features per column and then
# drop the raw column:
#   <col>_cnts - number of rows sharing the same value of <col>
#   <col>_rank - descending rank of 'id' among rows sharing that value
# NOTE(review): rows whose key is missing are presumably left out of the
# groups and would get NaN here, which the astype(int) cast cannot hold;
# confirm these columns contain no missing values at this point.
for f in ['employmentTitle', 'postCode', 'title']:
    ids_by_value = data_df.groupby([f])['id']
    data_df[f + '_cnts'] = ids_by_value.transform('count')
    data_df[f + '_rank'] = ids_by_value.rank(ascending=False).astype(int)
    data_df.drop(columns=[f], inplace=True)
3.欠損値と外れ値の扱い
# Numeric features: fill gaps with each column's median.
data_df[numerical_feat] = data_df[numerical_feat].fillna(
    data_df[numerical_feat].median()
)

# Categorical features: fill gaps with each column's most frequent value.
# DataFrame.mode() returns a DataFrame (one row per tied mode), and fillna
# with a DataFrame aligns on the row index, so passing it directly would only
# fill rows whose labels appear in that mode frame. Taking the first row
# yields a Series keyed by column name, which fillna applies to every row.
category_modes = data_df[category_feat].mode()
if not category_modes.empty:  # an empty frame has no mode row to take
    data_df[category_feat] = data_df[category_feat].fillna(
        category_modes.iloc[0]
    )
4、時間フォーマット処理
# Parse the issue date strings ('YYYY-MM-DD') into datetimes.
# NOTE(review): an earlier step in this file overwrites issueDate with an
# integer taken from its first four characters; this parse needs the
# original strings - confirm which of the two steps is meant to run.
data_df['issueDate'] = pd.to_datetime(data_df['issueDate'], format='%Y-%m-%d')

# Reference date that issue dates are measured from.
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')

# Whole days elapsed between the reference date and each issue date,
# computed on the whole column at once.
data_df['issueDateDT'] = (data_df['issueDate'] - startdate).dt.days
5、特徴量の構築
# Target-mean encoding: for each listed column, add '<col>_target_mean'
# holding the mean of 'isDefault' over all rows that share the row's value.
# NOTE(review): this needs the raw 'grade' / 'subGrade' columns, but the
# one-hot step earlier in this file replaces them with indicator columns;
# confirm the intended ordering.
for col in ['grade', 'subGrade']:
    encoded_name = col + '_target_mean'
    # Mapping from each distinct value of the column to its mean target.
    temp_dict = data_df.groupby([col])['isDefault'].mean().to_dict()
    data_df[encoded_name] = data_df[col].map(temp_dict)
# For each listed numeric column, add its per-grade mean and per-grade
# standard deviation as two new columns, broadcast back to every row.
grade_stat_columns = [
    'n0', 'n1', 'n2', 'n2.1', 'n4', 'n5', 'n6', 'n7',
    'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14',
]
for item in grade_stat_columns:
    values_by_grade = data_df.groupby(['grade'])[item]
    data_df['grade_to_mean_' + item] = values_by_grade.transform('mean')
    data_df['grade_to_std_' + item] = values_by_grade.transform('std')