Grid search + cross-validation with a logistic regression model (on a financial dataset)

Import the packages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

Load the data

data=pd.read_csv('./data.csv',index_col=0,encoding='gbk')
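
Since missing values are filled later on, it helps to first see how much of the table is actually missing. A quick exploratory sketch on the same data frame (not part of the original post):

# Count missing values per column and show the worst offenders
missing = data.isnull().sum().sort_values(ascending=False)
print(missing.head(10))
# Overall fraction of missing cells in the table
print('missing ratio: {:.2%}'.format(data.isnull().mean().mean()))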

Understanding the data

# Extract the 'status' column as the label y; the remaining 88 columns form X
y = data['status']
X = data.drop('status', axis=1)
# Shape of X and the class distribution of y
print('X.shape:', X.shape)
print('y distribution:', y.value_counts())
X.shape: (4754, 88)
y distribution: 0    3561
1    1193
Name: status, dtype: int64
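
Roughly one sample in four is a default (status == 1), so the classes are imbalanced. A small sketch for checking the positive rate; the stratify argument shown in the comment is an option the original code does not use:

# Positive (status == 1) rate: roughly 25%
print('positive rate: {:.2%}'.format(y.mean()))
# A stratified split would preserve this ratio in both the training and test sets, e.g.:
# train_test_split(X, y, test_size=0.3, random_state=1118, stratify=y)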

Data preparation

# First drop features that are clearly useless, such as id_name, custid, trade_no, bank_card_no
X.drop(['id_name', 'custid', 'trade_no', 'bank_card_no'], axis=1, inplace=True)
print(X.shape)
# Select the numeric features
X_num = X.select_dtypes('number').copy()
print(X_num.shape)
type(X_num.mean())  # quick check that the column means come back as a Series
# Fill missing values with the column means
X_num.fillna(X_num.mean(), inplace=True)
# Look at the non-numeric (string) variables
X_str = X.select_dtypes(exclude='number').copy()
X_str.describe()
# Encode reg_preference_for_trad with dummy variables; the other three string columns are dropped
X_str['reg_preference_for_trad'] = X_str['reg_preference_for_trad'].fillna(X_str['reg_preference_for_trad'].mode()[0])
X_str_dummy = pd.get_dummies(X_str['reg_preference_for_trad'])
X_str_dummy.head()
# Combine the numeric variables with the dummy-encoded (categorical) variables
X_cl = pd.concat([X_num, X_str_dummy], axis=1, sort=False)
# X_cl.shape
print(X_cl.head())
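
The manual steps above (dropping columns, mean-filling the numeric features, mode-filling and one-hot encoding reg_preference_for_trad) can also be written with scikit-learn's own preprocessing objects, which is convenient if everything is later wrapped into one pipeline. A rough equivalent, as a sketch only; preprocess and X_cl_alt are made-up names:

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

num_cols = X.select_dtypes('number').columns
cat_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),  # like the mode fill above
    ('onehot', OneHotEncoder(handle_unknown='ignore')),   # like pd.get_dummies
])
preprocess = ColumnTransformer([
    ('num', SimpleImputer(strategy='mean'), num_cols),    # like X_num.fillna(X_num.mean())
    ('cat', cat_pipe, ['reg_preference_for_trad']),
])  # remaining string columns are dropped, matching the manual version
X_cl_alt = preprocess.fit_transform(X)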
"""
#数据标准化和归一化
from sklearn import preprocessing
min_max_scale = preprocessing.MinMaxScaler()
min_max_data = min_max_scale.fit_transform(X_cl)


from sklearn import preprocessing
X_cl = preprocessing.scale(X_cl)
"""

Modeling and evaluation (logistic regression with grid search)

# Split into training and test sets at a 7:3 ratio
random_state = 1118
X_train, X_test, y_train, y_test = train_test_split(X_cl, y, test_size=0.3, random_state=random_state)
print(X_train.shape)
print(X_test.shape)
(3327, 85)
(1427, 85)

# Build the logistic regression model
# Note: with recent scikit-learn versions the default solver does not support the
# 'l1' penalty, so 'liblinear' is set explicitly to keep both penalties in the grid valid
lr = LogisticRegression(solver='liblinear')
lr.fit(X_train, y_train)
lr_param = {
    'C': [0.05, 0.1, 0.5, 1],
    'penalty': ['l1', 'l2'],
}
"""
这一步可能会过拟合吧
""" 
lr_grid = GridSearchCV(lr, lr_param, cv=5, scoring='roc_auc', n_jobs=-1)
lr_grid.fit(X_train, y_train)
print(lr_grid.best_score_)   # best cross-validated score
print(lr_grid.best_params_)  # best parameter combination
print(lr_grid.cv_results_)   # full cross-validation results
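# (Optional sketch, not in the original post) cv_results_ is a plain dict and is much
# easier to read as a DataFrame sorted by the mean validation score:
cv_df = pd.DataFrame(lr_grid.cv_results_)
print(cv_df[['params', 'mean_test_score', 'std_test_score']]
      .sort_values('mean_test_score', ascending=False)
      .head())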

# Refit on the full training set with the best parameters
# (with the default refit=True, lr_grid.best_estimator_ already holds an equivalent model)
lr.set_params(**lr_grid.best_params_)
lr.fit(X_train, y_train)
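
The metric functions imported at the top are never used above. A sketch of how the refitted model could be evaluated on the held-out test set; y_pred, y_score, fpr and tpr are names introduced here for illustration:

# Evaluate the refitted model on the 30% test set
y_pred = lr.predict(X_test)
y_score = lr.predict_proba(X_test)[:, 1]   # probability of class 1, used for ROC/AUC

print('accuracy :', accuracy_score(y_test, y_pred))
print('precision:', precision_score(y_test, y_pred))
print('recall   :', recall_score(y_test, y_pred))
print('f1       :', f1_score(y_test, y_pred))
print('roc_auc  :', roc_auc_score(y_test, y_score))

# ROC curve on the test set
fpr, tpr, _ = roc_curve(y_test, y_score)
plt.plot(fpr, tpr, label='LR (AUC = %.3f)' % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.show()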





Reposted from blog.csdn.net/qq_41205464/article/details/84453152