Comparing the learning curves and running times of several scikit-learn models (machine learning)


✌ Hands-on case: learning curves of multiple models

1. ✌ Import related libraries

from sklearn.ensemble import RandomForestClassifier # 随机森林模型
from sklearn.tree import DecisionTreeClassifier # 决策树
from sklearn.linear_model import LogisticRegression # 逻辑回归
from sklearn.svm import SVC # 支持向量机
from sklearn.naive_bayes import GaussianNB # 朴素贝叶斯
import lightgbm as lgb # lightgbm模型

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import learning_curve # 用于画学习曲线
from sklearn.model_selection import ShuffleSplit # 分割数据集

from time import time # 导入时间模块
import datetime

from sklearn.datasets import load_digits # 手写数字数据集

2. ✌ Define the plotting function

def plot_learning_curve(estimator,title,x,y,ax,ylim=None,cv=None,n_jobs=None):
    """Draw the learning curve of one estimator onto a given subplot.

    The curve is computed with ``learning_curve`` and the mean train and
    test scores are plotted against the number of training examples.

    Args:
        estimator: The model passed through to ``learning_curve``.
        title: Title shown above the subplot.
        x: Feature matrix passed to ``learning_curve``.
        y: Labels passed to ``learning_curve``.
        ax: The axes object to draw on.
        ylim: Optional pair unpacked into ``ax.set_ylim``; when ``None``
            the y-axis range is left untouched.
        cv: Cross-validation setting forwarded to ``learning_curve``.
        n_jobs: Parallelism setting forwarded to ``learning_curve``.

    Returns:
        The same ``ax`` that was passed in, after drawing.
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, x, y, cv=cv, n_jobs=n_jobs
    )

    ax.set_title(title)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_xlabel("training examples")
    ax.set_ylabel("score")
    ax.grid()

    # x-axis: number of training samples; y-axis: score averaged over the
    # cross-validation folds (axis=1 of the score arrays).
    mean_train = np.mean(train_scores, axis=1)
    mean_test = np.mean(test_scores, axis=1)
    ax.plot(train_sizes, mean_train, 'o-', color='r', label='train score')
    ax.plot(train_sizes, mean_test, 'o-', color='g', label='test score')
    ax.legend(loc='best')
    return ax

3. ✌ Prepare data

# Load the handwritten-digits dataset and split it into the feature matrix
# and the label vector.
data = load_digits()
x, y = data.data, data.target

# Title of each subplot, listed in the same order as the estimators below.
title = [
    'Naive Bayes',
    'DecisionTree',
    'SVM',
    'RandomForest',
    'Logistic',
    'lgb',
]

# The estimators whose learning curves are compared.
model = [
    GaussianNB(),
    DecisionTreeClassifier(),
    SVC(gamma=0.001),
    RandomForestClassifier(n_estimators=50),
    LogisticRegression(C=0.1, solver='lbfgs'),
    lgb.LGBMClassifier(),
]

# Splitter used for cross-validation: 50 shuffled splits, each holding out
# 20% of the data, with a fixed seed.
cv = ShuffleSplit(n_splits=50, test_size=0.2, random_state=0)

4. ✌ Call the plotting function in a loop

# Create the figure with a 2x3 grid of subplots, then flatten the grid so
# each subplot can be paired directly with one model.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()

for ax, title_, estimator in zip(axes, title, model):
    start = time()  # wall-clock time before fitting and drawing this model
    plot_learning_curve(estimator, title_, x, y, ax=ax, ylim=[0.7, 1.05], n_jobs=4, cv=cv)

    # Format the elapsed duration as MM:SS:microseconds by plain arithmetic.
    # Passing a duration to datetime.fromtimestamp() would interpret it as a
    # point in local time, so the minutes field would be shifted in time
    # zones whose UTC offset is not a whole number of hours.
    elapsed_us = int((time() - start) * 10**6)
    elapsed_s, micros = divmod(elapsed_us, 10**6)
    minutes, seconds = divmod(elapsed_s, 60)
    print("{:15s}{:02d}:{:02d}:{:06d}".format(title_, minutes, seconds, micros))

plt.show()

Insert picture description here
Insert picture description here

Guess you like

Origin blog.csdn.net/m0_47256162/article/details/113763241