利用stacking模型融合,并做预测

利用stacking模型融合,并做预测

  • 导入的包
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# 引入用到的分类算法
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import scale
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings
from mlxtend.classifier import StackingClassifier


# 引入要用到的评价函数
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
  • 导入随机森林测试的结果
# Suppress every warning category for the rest of the script.
warnings.filterwarnings("ignore")

# Load the results of the random-forest test; the file is read as GBK text.
data_all = pd.read_csv(
    '/home/infisa/wjht/project/DataWhale/output/data_forest',
    encoding='gbk',
)
  • 划分训练集和测试集和标准化数据

# Every column except the 'status' label is used as a model input.
features = [col for col in data_all.columns if col != 'status']
X, y = data_all[features], data_all['status']

# Hold out 30% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2018
)

# Standardize the features to zero mean and unit variance. The scaler is
# fitted on the training split only and then applied to both splits.
standardScaler = StandardScaler().fit(X_train)
X_train_fit = standardScaler.transform(X_train)
X_test_fit = standardScaler.transform(X_test)
  • 依次训练不同分类器
# Base classifiers. All of them are constructed with the same random_state.
_SEED = 2018

lr = LogisticRegression(random_state=_SEED)        # logistic regression
dt = DecisionTreeClassifier(random_state=_SEED)    # decision tree
svm = LinearSVC(random_state=_SEED)                # linear SVM
rfc = RandomForestClassifier(                      # random forest, 100 trees
    n_estimators=100,
    random_state=_SEED,
)
gbc = GradientBoostingClassifier(random_state=_SEED)  # GBDT
xgbc = XGBClassifier(random_state=_SEED)           # XGBoost
lgbc = LGBMClassifier(random_state=_SEED)          # LightGBM
  • 使用stacking模型融合并做预测
# Stack the seven base classifiers; `lr` is passed both as one of the base
# learners and as the meta-classifier.
sclf = StackingClassifier(
    classifiers=[lr, dt, svm, rfc, gbc, xgbc, lgbc],
    meta_classifier=lr,
)

# Fit on the standardized training split, then predict hard labels and the
# second probability column (index 1) for the standardized test split.
sclf.fit(X_train_fit, y_train)
y_predict = sclf.predict(X_test_fit)
pre_proba = sclf.predict_proba(X_test_fit)[:, 1]

# Label-based metrics are computed from the hard predictions ...
for label, metric in (
    ("准确率", accuracy_score),
    ("精确率", precision_score),
    ("召回率", recall_score),
    ("F1-score", f1_score),
):
    print(label, metric(y_test, y_predict))
# ... while AUC is computed from the probability scores.
print("AUC", roc_auc_score(y_test, pre_proba))
  • 预测结果
# Metric values pasted from a run (presumably the printout of the stacking
# evaluation in this script — not verifiable from here). This is a bare string
# literal that is never assigned or used.
'''
准确率 0.7757533286615277
精确率 0.6054054054054054
召回率 0.31197771587743733
F1-score 0.411764705882353
AUC 0.6678664204563236
'''
  • 问题
    对于如何把5折交叉验证与stacking结合起来还不太明白,尝试时出现了如下报错:
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

猜你喜欢

转载自blog.csdn.net/weixin_41710583/article/details/86178969