集成学习Day6 sklearn分类模型的评估、优化与人脸分类实例

目录

集成学习Day6 sklearn分类模型的评估、优化与人脸分类实例

1. 模型评估与优化

（1）超参数选择

A 使用网格搜索选择超参数

B 使用随机搜索选择超参数

（2）模型评估（以乳腺癌数据集二分类为例）

A 混淆矩阵

B ROC曲线

2. 基于SVM的人脸分类

（1）数据集

（2）数据处理——PCA降维、数据集划分

（3）网格搜索寻找最优超参数

（4）训练、预测

（5）结果评估

1. 模型评估与优化

（1）超参数选择

A 使用网格搜索选择超参数

from sklearn import datasets
import pandas as pd 
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC 
import time 

iris = datasets.load_iris()
X = iris.data  
y = iris.target  
feature = iris.feature_names

start_time = time.time()
pipe_svc = make_pipeline(StandardScaler(), SVC(random_state=1))
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
param_grid = [{
    
    'svc__C':param_range, 'svc__kernel':['linear']}, \
    {
    
    'svc__C':param_range,'svc__gamma':param_range,'svc__kernel':['rbf']}]
gs = GridSearchCV(estimator=pipe_svc, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=-1)
gs.fit(X, y)
end_time = time.time()
print("网格搜索经历的时间为： %.3f S" %float(end_time - start_time))
print(gs.best_score_)
print(gs.best_params_)

在这里插入图片描述

B 使用随机搜索选择超参数

from sklearn import datasets
import pandas as pd 
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC 
import time 

iris = datasets.load_iris()
X = iris.data  
y = iris.target  
feature = iris.feature_names

start_time = time.time()
pipe_svc = make_pipeline(StandardScaler(), SVC(random_state=1))
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
param_grid = [{
    
    'svc__C':param_range,'svc__kernel':['linear']},\
    {
    
    'svc__C':param_range,'svc__gamma':param_range,'svc__kernel':['rbf']}]
gs = RandomizedSearchCV(estimator=pipe_svc, param_distributions=param_grid, scoring='accuracy',cv=10, n_jobs=-1)
gs = gs.fit(X,y)
end_time = time.time()
print("随机网格搜索经历的时间为：%.3f S" % float(end_time-start_time))
print(gs.best_score_)
print(gs.best_params_)

在这里插入图片描述

（2）模型评估（以乳腺癌数据集二分类为例）

A 混淆矩阵

使用SVM对乳腺癌数据集进行二分类，并用混淆矩阵显示结果。

from sklearn import datasets
import pandas as pd 
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split # split data
from sklearn.metrics import confusion_matrix
import time 
import matplotlib.pylab as plt

df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data",header=None)
'''
乳腺癌数据集：569个恶性和良性肿瘤细胞的样本，M为恶性，B为良性
'''
X = df.iloc[:, 2:].values
y = df.iloc[:, 1].values 
le = LabelEncoder() # Encode M-B into 0-1
y = le.fit_transform(y)
le.transform(['M', 'B'])
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, stratify=y,random_state=1)
pipe_svc = make_pipeline(StandardScaler(), SVC(random_state=1))
pipe_svc.fit(X_train, y_train)
y_pred = pipe_svc.predict(X_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)

for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label')
plt.show()

在这里插入图片描述

B ROC曲线

from sklearn import datasets
import pandas as pd 
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split # split data
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import make_scorer,f1_score
import time 
import matplotlib.pylab as plt


# grid search 

df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data",header=None)
'''
乳腺癌数据集：569个恶性和良性肿瘤细胞的样本，M为恶性，B为良性
'''
X = df.iloc[:, 2:].values
y = df.iloc[:, 1].values 
le = LabelEncoder() # Encode M-B into 0-1
y = le.fit_transform(y)
le.transform(['M', 'B'])
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, stratify=y,random_state=1)
pipe_svc = make_pipeline(StandardScaler(), SVC(random_state=1))
pipe_svc.fit(X_train, y_train)

start_time = time.time()
pipe_svc = make_pipeline(StandardScaler(), SVC(random_state=1))
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
param_grid = [{
    
    'svc__C':param_range, 'svc__kernel':['linear']}, \
    {
    
    'svc__C':param_range,'svc__gamma':param_range,'svc__kernel':['rbf']}]

scorer = make_scorer(f1_score,pos_label=0)
gs = GridSearchCV(estimator=pipe_svc,param_grid=param_grid,scoring=scorer,cv=10)
y_pred = gs.fit(X_train,y_train).decision_function(X_test)
fpr,tpr,threshold = roc_curve(y_test, y_pred) ## 计算真阳率和假阳率
roc_auc = auc(fpr,tpr) ## 计算auc的值
plt.figure()
lw = 2
plt.figure(figsize=(7,5))
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) ## 假阳率为横坐标，真阳率为纵坐标做曲线
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([-0.05, 1.0])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic ')
plt.legend(loc="lower right")
plt.show()

在这里插入图片描述

2. 基于SVM的人脸分类

LFW数据集全称为Labeled Faces in the Wild, 是一个应用于人脸识别问题的数据集，里面每张图像都有对应的人名label，是不同的人在多种不同情况下拍的照片，在本实验中选择频次超过70的人名对应的1277张图像作为数据集。

（1）数据集

from sklearn.datasets import fetch_lfw_people
import matplotlib.pyplot as plt 
import seaborn as sns

faces = fetch_lfw_people(min_faces_per_person=60)
print("数据集类别为："+ str(faces.target_names))
print("数据集维度为: "+ str(faces.images.shape))

sns.set()
fig, ax = plt.subplots(3, 5)
fig.subplots_adjust(left=0.0625, right=1.2, wspace=1)
for i, axi in enumerate(ax.flat):
    axi.imshow(faces.images[i], cmap='bone')
    axi.set(xticks=[], yticks=[], xlabel=faces.target_names[faces.target[i]])

plt.show()

在这里插入图片描述

从结果可以看到，数据集包含1277张图像，包含七个类别：‘Ariel Sharon’，‘Colin Powell’，‘Donald Rumsfeld’，‘George W Bush’，‘Gerhard Schroeder’，‘Junichiro Koizumi’，‘Tony Blair’，这个问题为一个七分类问题。

（2）数据处理——PCA降维、数据集划分

（3）网格搜索寻找最优超参数

（4）训练、预测

（5）结果评估

在这里插入图片描述

sklearn代码：

from sklearn.datasets import fetch_lfw_people
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split # split data
from sklearn.model_selection import cross_val_score # k-cross 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# 数据集读取
faces = fetch_lfw_people(min_faces_per_person=60)
print("数据集类别为："+ str(faces.target_names))
print("数据集维度为: "+ str(faces.images.shape))

sns.set()
fig, ax = plt.subplots(3, 5)
fig.subplots_adjust(left=0.0625, right=1.2, wspace=1)
for i, axi in enumerate(ax.flat):
    axi.imshow(faces.images[i], cmap='bone')
    axi.set(xticks=[], yticks=[], xlabel=faces.target_names[faces.target[i]])

# plt.show()


# 降维
pca = PCA(n_components=150, whiten=True, random_state=42)
svc = SVC(kernel='rbf', class_weight='balanced')
model = make_pipeline(pca, svc)

# 数据集划分成训练集和测试集
x_train, x_test, y_train, y_test = train_test_split(faces.data, faces.target, random_state=42)

# 网络搜索交叉检验来寻找最优参数组合。通过不断调整C（松弛变量）和参数gamma（控制径向基函数核的大小），确定最优模型
param_grid = {
    
    'svc__C': [1,5,10,50], 'svc__gamma':[0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid)

grid.fit(x_train, y_train)
print(grid.best_params_)

# 预测
model = grid.best_estimator_
y_fit = model.predict(x_test)
# 比较
fig, ax = plt.subplots(4, 6)
for i, axi in enumerate(ax.flat):
    axi.imshow(x_test[i].reshape(62, 47), cmap='bone')
    axi.set(xticks=[], yticks=[])
    axi.set_ylabel(faces.target_names[y_fit[i]].split()[-1], color='black' if y_fit[i] == y_test[i] else 'red')
fig.suptitle('Predicted Names; Incorect Lables in Red', size=14)
plt.show()
print(classification_report(y_test, y_fit, target_names=faces.target_names))
mat = confusion_matrix(y_test, y_fit)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=faces.target_names,
            yticklabels=faces.target_names)
plt.xlabel('true label')
plt.ylabel('predicted label')
plt.show()

参考：
1. DataWhale组对学习-集成学习
 2. LFW数据集
 3. 基于SVM的人脸识别

集成学习Day6 sklearn分类模型的评估、优化与基于SVM的人脸分类实例

集成学习Day6 sklearn分类模型的评估、优化与人脸分类实例

目录

1. 模型评估与优化

（1）超参数选择

A 使用网格搜索选择超参数

B 使用随机搜索选择超参数

（2）模型评估（以乳腺癌数据集二分类为例）

A 混淆矩阵

B ROC曲线

2. 基于SVM的人脸分类

（1）数据集

（2）数据处理——PCA降维、数据集划分

（3）网格搜索寻找最优超参数

（4）训练、预测

（5）结果评估

猜你喜欢