Sklearn流水线交叉验证以及超参数网格交叉评估基础案例实战-大数据ML样本集案例实战

版权声明:本套技术专栏是作者(秦凯新)平时工作的总结和升华,通过从真实商业环境抽取案例进行总结和分享,并给出商业应用的调优建议和集群环境容量规划等内容,请持续关注本套博客。QQ邮箱地址:[email protected],如有任何技术交流,可随时联系。

1 基本数据探索

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# The iris.data file carries no header row. With read_csv's default header=0
# the first sample is consumed as column names and lost, which is why the
# shape recorded further down in this article is (149, 5) rather than the
# full 150 rows. Passing header=None together with explicit names keeps
# every sample and labels the columns in a single step.
X = pd.read_csv(
    'C:\\ML\\MLData\\iris.data',
    header=None,
    names=['sepal_length_cm', 'sepal_width_cm', 'petal_length_cm', 'petal_width_cm', 'class'],
)

# Quick look at the data: the first rows, then 10 randomly drawn rows.
X.head()
X.sample(n=10)
复制代码

# Dimensions of the loaded frame as a (rows, columns) tuple.
X.shape
(149, 5)

# Per-column dtypes of the loaded frame.
X.dtypes
sepal_length_cm    float64
sepal_width_cm     float64
petal_length_cm    float64
petal_width_cm     float64
class               object
dtype: object

# Summary statistics of the frame's columns.
X.describe()
复制代码

2 数据可视化探索分析

  • 箱线图(box):查看异常点

      # Draw the box plots as separate subplots arranged on a 1x4 grid,
      # then display the figure.
      X.plot.box(subplots=True, layout=(1, 4), figsize=(12, 5))
      plt.show()
    复制代码

  • 直方图(hist):查看各特征取值的区间分布

      # Histogram of each column; the axis tick labels are shrunk to size 1
      # so that only the shape of every distribution stands out.
      X.hist(
          figsize=(12, 5),
          xlabelsize=1,
          ylabelsize=1,
      )
      plt.show()
    复制代码

  • 密度图

      # Kernel density estimate of each plotted column, drawn as separate
      # subplots on a 1x4 grid.
      X.plot.density(subplots=True, layout=(1, 4), figsize=(12, 5))
      plt.show()
    复制代码

  • 相关系数热力图:查看特征之间的相关关系

      # `col_name` was referenced below without ever being defined; list the
      # four numeric feature columns explicitly. Restricting the correlation
      # to these columns also keeps the string-valued 'class' column out of
      # corr(), which recent pandas versions refuse to handle silently.
      col_name = ['sepal_length_cm', 'sepal_width_cm', 'petal_length_cm', 'petal_width_cm']
      corr = X[col_name].corr()

      fig = plt.figure(figsize=(10, 10))
      ax = fig.add_subplot(111)
      # Pin the colour scale to the full [-1, 1] range of a correlation.
      cax = ax.matshow(corr, vmin=-1, vmax=1, interpolation="none")
      fig.colorbar(cax)
      # One tick per feature so every row and column carries its name.
      ticks = np.arange(len(col_name))
      ax.set_xticks(ticks)
      ax.set_yticks(ticks)
      ax.set_xticklabels(col_name)
      ax.set_yticklabels(col_name)
      plt.show()
    复制代码

3 数据比例划分

    from sklearn.model_selection import KFold
    from sklearn.model_selection import train_test_split
    
    # The frame loaded in section 1 is bound to `X`; `iris_data` was never
    # defined, so alias it here instead of failing with a NameError.
    iris_data = X

    # Feature matrix: the four measurement columns as a NumPy array.
    all_inputs = iris_data[['sepal_length_cm', 'sepal_width_cm',
                             'petal_length_cm', 'petal_width_cm']].values

    # Target vector: the class label of every sample.
    all_classes = iris_data['class'].values

    # Keep 75% of the samples for training and hold out the rest for testing;
    # the fixed seed makes the split reproducible.
    (training_inputs,
     testing_inputs,
     training_classes,
     testing_classes) = train_test_split(all_inputs, all_classes, train_size=0.75, random_state=1)
复制代码

4 多分类模型集中评估

    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.model_selection import KFold
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import cross_val_score
    
    # Candidate classifiers, each built with its default hyper-parameters and
    # paired with the short label used in the printed report.
    models = [
        ("AB", AdaBoostClassifier()),
        ("GBM", GradientBoostingClassifier()),
        ("RF", RandomForestClassifier()),
        ("ET", ExtraTreesClassifier()),
        ("SVC", SVC()),
        ("KNN", KNeighborsClassifier()),
        ("LR", LogisticRegression()),
        ("GNB", GaussianNB()),
        ("LDA", LinearDiscriminantAnalysis()),
    ]

    names = []
    results = []

    # Score every candidate with 5-fold cross-validated accuracy on the
    # training split and report the mean and standard deviation of the folds.
    for name, model in models:
        result = cross_val_score(
            model,
            training_inputs,
            training_classes,
            scoring="accuracy",
            cv=5,
        )
        names.append(name)
        results.append(result)
        print("{}  Mean:{:.4f}(Std{:.4f})".format(name, result.mean(), result.std()))
        
        AB  Mean:0.9097(Std0.0290)
        GBM  Mean:0.9370(Std0.0361)
        RF  Mean:0.9461(Std0.0442)
        ET  Mean:0.9370(Std0.0361)
        SVC  Mean:0.9640(Std0.0340)
        KNN  Mean:0.9374(Std0.0454)
        LR  Mean:0.9379(Std0.0353)
        GNB  Mean:0.9556(Std0.0391)
        LDA  Mean:0.9735(Std0.0360) 
复制代码

5 流水线交叉验证

    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    # Each entry pairs a report label with a two-step pipeline: standardise
    # the features, then fit a tree ensemble.
    pipeline = [
        ("ScalerET", Pipeline([("Scaler", StandardScaler()),
                               ("ET", ExtraTreesClassifier())])),
        ("ScalerGBM", Pipeline([("Scaler", StandardScaler()),
                                ("GBM", GradientBoostingClassifier())])),
        ("ScalerRF", Pipeline([("Scaler", StandardScaler()),
                               ("RF", RandomForestClassifier())])),
    ]

    # KFold only consults random_state when shuffle=True, and scikit-learn
    # >= 0.24 raises ValueError if random_state is given without shuffling.
    # The splits are deterministic without it, so the argument is dropped;
    # this keeps the same unshuffled folds the original call produced.
    # The splitter does not depend on the model, so it is built once.
    kfold = KFold(n_splits=5)

    names = []
    results = []
    for name, model in pipeline:
        result = cross_val_score(model, training_inputs, training_classes,
                                 cv=kfold, scoring="accuracy")
        results.append(result)
        names.append(name)
        # NOTE: the label says "Error", but the values are accuracy scores
        # (scoring="accuracy"); the text is kept unchanged on purpose.
        print("{}:  Error Mean:{:.4f} (Error Std:{:.4f})".format(
            name, result.mean(), result.std()))

ScalerET:   Error Mean:0.9372 (Error Std:0.0358)
ScalerGBM:  Error Mean:0.9462 (Error Std:0.0332)
ScalerRF:   Error Mean:0.9553 (Error Std:0.0275)
复制代码

6 超参数网格交叉评估

    from sklearn.model_selection import GridSearchCV
    # Search space for the SVC: 10 values of the regularisation strength C
    # crossed with 4 kernels, i.e. 40 candidate settings.
    param_grid = {
        "C": [0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.3, 1.5, 1.7, 2.0],
        "kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
    }
    model = SVC()
    # KFold only consults random_state when shuffle=True, and scikit-learn
    # >= 0.24 raises ValueError if random_state is given without shuffling.
    # The unshuffled splits are already deterministic, so it is dropped.
    kfold = KFold(n_splits=5)

    # Evaluate every parameter combination by cross-validated accuracy.
    grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring="accuracy", cv=kfold)
    grid_result = grid.fit(training_inputs, training_classes)

    # Report the winning setting first, then the score of every candidate.
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
        
    Best: 0.972973 using {'C': 0.9, 'kernel': 'linear'}
    
    0.954955 (0.027681) with: {'C': 0.1, 'kernel': 'linear'}
    0.927928 (0.021620) with: {'C': 0.1, 'kernel': 'poly'}
    0.945946 (0.016821) with: {'C': 0.1, 'kernel': 'rbf'}
    0.351351 (0.049646) with: {'C': 0.1, 'kernel': 'sigmoid'}
    0.963964 (0.017933) with: {'C': 0.3, 'kernel': 'linear'}
    0.954955 (0.028629) with: {'C': 0.3, 'kernel': 'poly'}
    0.954955 (0.027681) with: {'C': 0.3, 'kernel': 'rbf'}
    0.351351 (0.049646) with: {'C': 0.3, 'kernel': 'sigmoid'}
    0.963964 (0.017933) with: {'C': 0.5, 'kernel': 'linear'}
    0.954955 (0.028629) with: {'C': 0.5, 'kernel': 'poly'}
    0.963964 (0.017933) with: {'C': 0.5, 'kernel': 'rbf'}
    0.351351 (0.049646) with: {'C': 0.5, 'kernel': 'sigmoid'}
    0.963964 (0.017933) with: {'C': 0.7, 'kernel': 'linear'}
    0.963964 (0.033773) with: {'C': 0.7, 'kernel': 'poly'}
    0.963964 (0.017933) with: {'C': 0.7, 'kernel': 'rbf'}
    0.342342 (0.045336) with: {'C': 0.7, 'kernel': 'sigmoid'}
    0.972973 (0.021914) with: {'C': 0.9, 'kernel': 'linear'}
    0.963964 (0.033773) with: {'C': 0.9, 'kernel': 'poly'}
    0.963964 (0.017933) with: {'C': 0.9, 'kernel': 'rbf'}
    0.351351 (0.049646) with: {'C': 0.9, 'kernel': 'sigmoid'}
    0.972973 (0.021914) with: {'C': 1.0, 'kernel': 'linear'}
    0.963964 (0.033773) with: {'C': 1.0, 'kernel': 'poly'}
    0.963964 (0.017933) with: {'C': 1.0, 'kernel': 'rbf'}
    0.351351 (0.049646) with: {'C': 1.0, 'kernel': 'sigmoid'}
    0.972973 (0.021914) with: {'C': 1.3, 'kernel': 'linear'}
    0.963964 (0.033773) with: {'C': 1.3, 'kernel': 'poly'}
    0.963964 (0.017933) with: {'C': 1.3, 'kernel': 'rbf'}
    0.351351 (0.049646) with: {'C': 1.3, 'kernel': 'sigmoid'}
    0.972973 (0.021914) with: {'C': 1.5, 'kernel': 'linear'}
    0.963964 (0.033773) with: {'C': 1.5, 'kernel': 'poly'}
    0.963964 (0.017933) with: {'C': 1.5, 'kernel': 'rbf'}
    0.351351 (0.049646) with: {'C': 1.5, 'kernel': 'sigmoid'}
    0.972973 (0.021914) with: {'C': 1.7, 'kernel': 'linear'}
    0.954955 (0.028629) with: {'C': 1.7, 'kernel': 'poly'}
    0.963964 (0.017933) with: {'C': 1.7, 'kernel': 'rbf'}
    0.351351 (0.049646) with: {'C': 1.7, 'kernel': 'sigmoid'}
    0.963964 (0.017933) with: {'C': 2.0, 'kernel': 'linear'}
    0.954955 (0.028629) with: {'C': 2.0, 'kernel': 'poly'}
    0.954955 (0.027681) with: {'C': 2.0, 'kernel': 'rbf'}
    0.351351 (0.049646) with: {'C': 2.0, 'kernel': 'sigmoid'}
复制代码

总结

本文没有使用华丽的技术,重点在于把多分类模型集中评估、流水线交叉验证以及超参数网格交叉评估这几种常见场景整合到同一个案例中。

版权声明:本套技术专栏是作者(秦凯新)平时工作的总结和升华,通过从真实商业环境抽取案例进行总结和分享,并给出商业应用的调优建议和集群环境容量规划等内容,请持续关注本套博客。QQ邮箱地址:[email protected],如有任何技术交流,可随时联系。

秦凯新 于深圳

猜你喜欢

转载自juejin.im/post/5c1f0ce0f265da61620d629e
今日推荐