Logistic Regression: Parameter Optimization with LogisticRegressionCV

1. Preparation

# First, import the necessary modules
import pandas as pd 
import numpy as np
 
from sklearn.model_selection import GridSearchCV
 
# The competition's evaluation metric is logloss
from sklearn.metrics import log_loss  
 
from matplotlib import pyplot
import seaborn as sns
%matplotlib inline
 
data = pd.read_csv('Otto_train.csv')
data.head()
data.info()
data.describe()
data.shape
# Limited by machine capacity, keep only the first 20,000 rows
data = data[:20000]
 
# Target distribution: check whether the classes are balanced
sns.countplot(x='target', data=data)
pyplot.xlabel('target');
pyplot.ylabel('Number of occurrences');
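
For a numeric view of the same distribution, the per-class counts can be printed directly (a small addition, using standard pandas):

# Count the rows in each class present in the truncated data
print(data['target'].value_counts().sort_index())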

2. Data Standardization

# Convert the class label strings ('Class_1', 'Class_2', ...) to 0-based integers
y_train = data.target
y_train = y_train.map(lambda s: s[6:])         # strip the 'Class_' prefix
y_train = y_train.map(lambda s: int(s) - 1)    # shift to 0-based labels
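
An equivalent, slightly more general encoding uses sklearn's LabelEncoder (a sketch, not in the original post; it yields the same labels here because the class strings sort in numeric order):

from sklearn.preprocessing import LabelEncoder
y_check = LabelEncoder().fit_transform(data['target'])
assert (y_check == y_train.values).all()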
 
data = data.drop(['target','id'],axis=1)
X_train = np.array(data)
 
# Feature standardization
from sklearn.preprocessing import StandardScaler
 
# Initialize the feature scaler
ss_X = StandardScaler()
 
# Standardize the features of the training (and, later, any test) data
X_train = ss_X.fit_transform(X_train)
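
Only the training transform appears in the post; held-out or test features would be transformed with the scaler already fitted above rather than re-fit (X_test below is hypothetical, since only Otto_train.csv is loaded):

# X_test = ss_X.transform(X_test)  # reuse the training-set mean and std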
 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
lr = LogisticRegression()
# Cross-validation is used to evaluate model performance and to tune parameters (model selection)
# For classification tasks, cross-validation defaults to StratifiedKFold
loss = cross_val_score(lr, X_train, y_train, cv=5, scoring='neg_log_loss')
print('logloss of each fold is:', -loss)
print('cv logloss is:', -loss.mean())
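
For reference, the GridSearchCV route that the comments below compare against would look roughly like this (a sketch, not in the original post; GridSearchCV refits a fresh model for every fold/C pair, while LogisticRegressionCV can reuse work along the Cs path, which is where its speed advantage comes from):

# A hypothetical GridSearchCV equivalent of the LogisticRegressionCV runs below
param_grid = {'C': [1, 10, 100, 1000]}
grid = GridSearchCV(LogisticRegression(solver='liblinear', multi_class='ovr'),
                    param_grid, cv=5, scoring='neg_log_loss')
grid.fit(X_train, y_train)
print('best C:', grid.best_params_['C'], 'cv logloss:', -grid.best_score_)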

3. LogisticRegressionCV with L1 Regularization

from sklearn.linear_model import LogisticRegressionCV

Cs = [1, 10, 100, 1000]

# Many samples (60k+ in the full dataset), high dimensionality (93 features); with L1 regularization
# the saga solver is an option (new in scikit-learn 0.19)
# LogisticRegressionCV is faster than GridSearchCV

lrcv_L1 = LogisticRegressionCV(Cs=Cs, cv=5, scoring='neg_log_loss', penalty='l1', solver='liblinear', multi_class='ovr')

lrcv_L1.fit(X_train, y_train)    


lrcv_L1.scores_

# scores_: a dict with the classes as keys and, as values, the grid of scores
# obtained during cross-validation; each value has shape (n_folds, len(Cs))
n_Cs = len(Cs)
n_classes = len(lrcv_L1.scores_)   # one entry per class present in the data
scores = np.zeros((n_classes, n_Cs))

for j in range(n_classes):
    scores[j, :] = np.mean(lrcv_L1.scores_[j], axis=0)   # average over folds

mean_logloss = -np.mean(scores, axis=0)   # average over classes, negated back to logloss
pyplot.plot(np.log10(Cs), mean_logloss)
pyplot.xlabel('log(C)')
pyplot.ylabel('neg-logloss')
pyplot.show()

# print('C is:', lrcv_L1.C_)   # for a multiclass problem there is one C per class

lrcv_L1.coef_
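
L1 regularization drives many coefficients exactly to zero; a quick check of that sparsity and of the selected C values (my addition, using the fitted model above):

# Fraction of zero coefficients per class, and the C chosen for each class
print('sparsity per class:', np.mean(lrcv_L1.coef_ == 0, axis=1))
print('C per class:', lrcv_L1.C_)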

4. LogisticRegressionCV with L2 Regularization

from sklearn.linear_model import LogisticRegressionCV

Cs = [1, 10, 100, 1000]

# Many samples (60k+ in the full dataset), high dimensionality (93 features); with L2 regularization
# the default solver is lbfgs; liblinear is used here for comparison with GridSearchCV

lr_cv_L2 = LogisticRegressionCV(Cs=Cs, cv=5, scoring='neg_log_loss', penalty='l2', solver='liblinear', multi_class='ovr')
lr_cv_L2.fit(X_train, y_train)   

lr_cv_L2.scores_

# scores_: a dict with the classes as keys and, as values, the grid of scores
# obtained during cross-validation; each value has shape (n_folds, len(Cs))
n_Cs = len(Cs)
n_classes = len(lr_cv_L2.scores_)
scores = np.zeros((n_classes, n_Cs))

for j in range(n_classes):
    scores[j, :] = np.mean(lr_cv_L2.scores_[j], axis=0)

mean_logloss = -np.mean(scores, axis=0)
pyplot.plot(np.log10(Cs), mean_logloss)
pyplot.xlabel('log(C)')
pyplot.ylabel('neg-logloss')
pyplot.show()

# print('C is:', lr_cv_L2.C_)   # one C per class in a multiclass problem
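
To put numbers behind the speed claim, a simple wall-clock comparison of the two tuning routes could look like this (a sketch; the exact timings depend on the machine):

import time

t0 = time.time()
LogisticRegressionCV(Cs=Cs, cv=5, scoring='neg_log_loss', penalty='l2',
                     solver='liblinear', multi_class='ovr').fit(X_train, y_train)
print('LogisticRegressionCV: %.1f s' % (time.time() - t0))

t0 = time.time()
GridSearchCV(LogisticRegression(penalty='l2', solver='liblinear'),
             {'C': Cs}, cv=5, scoring='neg_log_loss').fit(X_train, y_train)
print('GridSearchCV: %.1f s' % (time.time() - t0))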


from sklearn.linear_model import LogisticRegressionCV

Cs = [1, 10, 100, 1000]

# Many samples (60k+ in the full dataset), high dimensionality (93 features); with L2 regularization,
# use the default lbfgs solver
# LogisticRegressionCV is faster than GridSearchCV
lrcv_L2 = LogisticRegressionCV(Cs=Cs, cv=5, scoring='neg_log_loss', penalty='l2', multi_class='ovr')
lrcv_L2.fit(X_train, y_train)    

lrcv_L2.scores_

# scores_: a dict with the classes as keys and, as values, the grid of scores
# obtained during cross-validation; each value has shape (n_folds, len(Cs))
n_Cs = len(Cs)
n_classes = len(lrcv_L2.scores_)
scores = np.zeros((n_classes, n_Cs))

for j in range(n_classes):
    scores[j, :] = np.mean(lrcv_L2.scores_[j], axis=0)

mean_logloss = -np.mean(scores, axis=0)
pyplot.plot(np.log10(Cs), mean_logloss)
pyplot.xlabel('log(C)')
pyplot.ylabel('neg-logloss')
pyplot.show()
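
As a final sanity check (my addition), the refit L2 model can be scored with the log_loss metric imported at the top; note that the training-set loss is optimistic, and the cross-validated curves above remain the honest estimate:

print('C per class:', lrcv_L2.C_)
print('train logloss:', log_loss(y_train, lrcv_L2.predict_proba(X_train)))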

Reposted from blog.csdn.net/evolution23/article/details/85028423