# 导入各种包 (Import the required packages)
import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
from sklearn. model_selection import train_test_split
from sklearn. metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc
from sklearn. linear_model import LogisticRegression
from sklearn. model_selection import GridSearchCV
# 导入数据 (Load the data)
# Read the dataset from disk: the first CSV column becomes the row index and
# the file is decoded as GBK.
data = pd.read_csv(
    "./data.csv",
    index_col=0,
    encoding="gbk",
)
# 数据理解 (Data understanding)
# Keep the 'status' column as the label vector y and every other column as
# the feature frame X, then print the feature shape and the label counts.
X = data.drop(columns="status")
y = data["status"]
print("X.shape:", X.shape)
print("y的分布:", y.value_counts())
# Output of the two print calls above:
# X.shape: (4754, 88)
# y的分布: 0    3561
# 1    1193
# Name: status, dtype: int64
# 数据准备 (Data preparation)
# Drop four columns in place before building features (their names suggest
# record identifiers -- TODO confirm against the data dictionary).
dropped_columns = ["id_name", "custid", "trade_no", "bank_card_no"]
X.drop(columns=dropped_columns, inplace=True)
print(X.shape)

# Numeric features: take an independent copy and replace each column's
# missing values with that column's mean.
X_num = X.select_dtypes(include="number").copy()
print(X_num.shape)
type(X_num.mean())  # bare expression: has no effect when run as a script
X_num = X_num.fillna(X_num.mean())

# Non-numeric features: only 'reg_preference_for_trad' is carried forward.
# Its gaps are filled with the most frequent value, then it is one-hot encoded.
X_str = X.select_dtypes(exclude="number").copy()
X_str.describe()  # bare expression: has no effect when run as a script
pref_col = "reg_preference_for_trad"
most_frequent = X_str[pref_col].mode()[0]
X_str[pref_col] = X_str[pref_col].fillna(most_frequent)
X_str_dummy = pd.get_dummies(X_str[pref_col])
X_str_dummy.head()  # bare expression: has no effect when run as a script

# Final design matrix: numeric columns followed by the dummy columns.
X_cl = pd.concat([X_num, X_str_dummy], axis=1, sort=False)
print(X_cl.head())

# The string literal below is never executed. It holds a disabled
# standardization/normalization snippet: a MinMaxScaler result stored in
# min_max_data, followed by preprocessing.scale overwriting X_cl.
"""
#数据标准化和归一化
from sklearn import preprocessing
min_max_scale = preprocessing.MinMaxScaler()
min_max_data = min_max_scale.fit_transform(X_cl)
from sklearn import preprocessing
X_cl = preprocessing.scale(X_cl)
"""
# 数据建模和评估(lr网格搜索) (Modeling and evaluation: logistic-regression grid search)
# One seed shared by the train/test split and the estimator so that repeated
# runs produce the same partition and the same fitted model.
random_state = 1118
X_train, X_test, y_train, y_test = train_test_split(
    X_cl, y, test_size=0.3, random_state=random_state
)
print(X_train.shape)
print(X_test.shape)

# The solver is set explicitly because the grid below contains 'l1'. The
# default solver in scikit-learn >= 0.22 (lbfgs) rejects an l1 penalty, so
# every l1 candidate would fail to fit and be scored as NaN, silently
# shrinking the search to the l2 half of the grid. liblinear supports both.
# No fit is done here: GridSearchCV clones the estimator before fitting, so
# a fit before the search would be discarded work.
lr = LogisticRegression(solver="liblinear", random_state=random_state)
lr_param = {
    "C": [0.05, 0.1, 0.5, 1],
    "penalty": ["l1", "l2"],
}

# NOTE (original author's remark): this step may overfit.
# 5-fold cross-validated search over lr_param, ranked by ROC AUC, using all
# available cores.
lr_grid = GridSearchCV(lr, lr_param, cv=5, scoring="roc_auc", n_jobs=-1)
lr_grid.fit(X_train, y_train)
print(lr_grid.best_score_)
print(lr_grid.best_params_)
print(lr_grid.cv_results_)

# With the default refit=True, GridSearchCV has already retrained the best
# parameter combination on the whole training set; reuse that model as `lr`
# instead of fitting the same configuration a second time.
lr = lr_grid.best_estimator_
# Output of print(X_train.shape) and print(X_test.shape):
# (3327, 85)
# (1427, 85)