[Experimental exercise] SVM-based classification of the Iris data set (Python implementation)

topic

Use a support vector machine (SVM) to classify the Iris data set.

data set

iris.names describes the attributes of the data set;

iris.data is the actual data set; it contains three classes, with 50 samples per class.

Requirements

Training set: select 80% of the Iris data set, i.e. 120 samples, with 40 samples from each class.

Test set: the remaining 30 samples that are not in the training set.

Specific SVM method: Free to choose according to the situation.

Evaluation indicators: Select classification-related evaluation indicators to measure the classification results.

Directory Structure

    |----Exp3\
    |    |----Data\
    |    |    |----iris.data
    |    |    |----iris.names
    |    |----main.py
    |    |----Plot.py
    |    |----Result\
    |    |    |----iris-cla.png
    |    |    |----iris-petal-cla.png
    |    |    |----iris-sepal-cal.png
    |    |    |----iris-sepal-petal-cla.png

main.py 

# -*- coding: utf-8 -*- #
"""
@Project    :MachineLearningLesson
@File       :main.py 
@Author     :ZAY
@Time       :2023/6/4 15:44
@Annotation : " "
"""

import os
import torch
import sklearn
import numpy as np
from sklearn import svm
from sklearn.metrics import accuracy_score,auc,roc_curve,precision_recall_curve,f1_score, precision_score, recall_score
# Exp.Exp3 is where this project is stored; change it to match your own project path
from Exp.Exp3.Plot import plotSepalShow,plotPetalShow,plotSPShow 

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Map the textual class label to its numeric class id
def Iris_label(s):
    """Convert an Iris species name into its integer class id.

    Used as a ``np.loadtxt`` converter for the label column.  Depending on
    the NumPy version and the ``encoding`` argument, a converter receives
    either ``bytes`` or ``str``, so both are accepted here.

    :param s: species name, e.g. ``b'Iris-setosa'`` or ``'Iris-setosa'``
    :return: 0 for setosa, 1 for versicolor, 2 for virginica
    :raises KeyError: if the name is not one of the three known species
    """
    it = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
    if isinstance(s, bytes):
        # latin1 never fails to decode, so an unknown label still surfaces
        # as KeyError rather than UnicodeDecodeError.
        s = s.decode('latin1')
    if isinstance(s, str):
        s = s.strip()
    return it[s]

def Iris_Sepal_Cla(data_x, label_y):
    """Train and evaluate an RBF-kernel SVM on the two sepal features.

    Keeps columns 0 and 1 of ``data_x``, splits the samples 80/20, fits the
    classifier, prints the training and test accuracy, and finally hands the
    fitted model to ``plotSepalShow`` for visualisation.

    :param data_x: 2-D feature array; only columns 0 and 1 are used
    :param label_y: 2-D column array of class ids, one row per sample
    :return: None
    """
    # Import the submodule explicitly: a bare ``import sklearn`` does not
    # guarantee that ``sklearn.model_selection`` has been loaded.
    from sklearn.model_selection import train_test_split

    data_x = data_x[:, 0:2]
    # Stratify on the labels so every class contributes the same share to the
    # training set (40 of its 50 samples), as the exercise requires.
    train_data, test_data, train_label, test_label = train_test_split(data_x, label_y,
                                                                       random_state = 1,
                                                                       train_size = 0.8,
                                                                       test_size = 0.2,
                                                                       stratify = label_y.ravel())
    # Train the SVM classifier https://blog.csdn.net/TeFuirnever/article/details/99646257
    classifier = svm.SVC(C = 0.5, kernel = 'rbf', gamma = 10, decision_function_shape = 'ovr')
    # fit() expects a 1-D label vector, the split keeps the (n, 1) column shape
    classifier.fit(train_data, train_label.ravel())

    train_label_pre = classifier.predict(train_data)
    test_label_pre = classifier.predict(test_data)
    print('花萼训练集acc:', accuracy_score(train_label, train_label_pre))
    print('花萼测试集acc:', accuracy_score(test_label, test_label_pre))

    plotSepalShow(test_data, test_label, data_x, label_y, classifier)

def Iris_Petal_Cla(data_x, label_y):
    """Train and evaluate an RBF-kernel SVM on the two petal features.

    Keeps columns 2 and 3 of ``data_x``, splits the samples 80/20, fits the
    classifier, prints the training and test accuracy, and finally hands the
    fitted model to ``plotPetalShow`` for visualisation.

    :param data_x: 2-D feature array; only columns 2 and 3 are used
    :param label_y: 2-D column array of class ids, one row per sample
    :return: None
    """
    # Import the submodule explicitly: a bare ``import sklearn`` does not
    # guarantee that ``sklearn.model_selection`` has been loaded.
    from sklearn.model_selection import train_test_split

    # SVM classification on the petal length / petal width features
    data_x = data_x[:, 2:4]

    # Stratify on the labels so every class contributes the same share to the
    # training set (40 of its 50 samples), as the exercise requires.
    train_data, test_data, train_label, test_label = train_test_split(data_x, label_y,
                                                                       random_state = 1,
                                                                       train_size = 0.8,
                                                                       test_size = 0.2,
                                                                       stratify = label_y.ravel())
    classifier = svm.SVC(C = 0.5, kernel = 'rbf', gamma = 10, decision_function_shape = 'ovr')
    # fit() expects a 1-D label vector, the split keeps the (n, 1) column shape
    classifier.fit(train_data, train_label.ravel())

    train_label_pre = classifier.predict(train_data)
    test_label_pre = classifier.predict(test_data)
    print('花瓣训练集acc:', accuracy_score(train_label, train_label_pre))
    print('花瓣测试集acc:', accuracy_score(test_label, test_label_pre))

    plotPetalShow(test_data, test_label, data_x, label_y, classifier)


if __name__ == "__main__":

    # Column 4 holds the species name, so Iris_label turns it into a numeric
    # class id.  encoding = 'bytes' pins the converter input to bytes, which is
    # what the lookup table in Iris_label is keyed on; without it NumPy >= 2.0
    # passes str to converters.
    data = np.loadtxt("./Data/iris.data", dtype = float, delimiter = ',', converters = {4: Iris_label},
                      encoding = 'bytes')
    # The first four columns are the features (x), the last one is the label (y)
    data_x, label_y = np.split(data, indices_or_sections = (4,), axis = 1)

    # SVM classification on the two sepal features
    Iris_Sepal_Cla(data_x, label_y)

    # SVM classification on the two petal features
    Iris_Petal_Cla(data_x, label_y)

 plotSepalShow.py plotPetalShow.py

# -*- coding: utf-8 -*- #
"""
@Project    :MachineLearningLesson
@File       :plot.py 
@Author     :ZAY
@Time       :2023/6/5 21:41
@Annotation : " "
"""

# 确定坐标轴范围
import matplotlib
import matplotlib.pyplot as plt
import numpy as np


def _plot_two_feature_regions(test_data, test_label, data_x, label_y, classifier,
                              xlabel, ylabel, title, save_path):
    """Draw the decision regions of a two-feature classifier and save the figure.

    Shared implementation of plotSepalShow and plotPetalShow, which differ
    only in their axis labels, title and output file.

    :param test_data: 2-column array of test samples (drawn with a black edge)
    :param test_label: 2-D column array of class ids for ``test_data``
    :param data_x: 2-column array of all samples; also fixes the axis range
    :param label_y: 2-D column array of class ids for ``data_x``
    :param classifier: fitted model exposing ``predict`` on 2-column input
    :param xlabel: x-axis label
    :param ylabel: y-axis label
    :param title: figure title
    :param save_path: file the figure is written to
    :return: None
    """
    import os  # local import: the module header does not import os

    x1_min, x1_max = data_x[:, 0].min(), data_x[:, 0].max()  # range of the first feature
    x2_min, x2_max = data_x[:, 1].min(), data_x[:, 1].max()  # range of the second feature
    x1, x2 = np.mgrid[x1_min:x1_max:200j, x2_min:x2_max:200j]  # 200 x 200 sampling grid
    grid_test = np.column_stack((x1.ravel(), x2.ravel()))  # (40000, 2) grid points

    # Default font able to render the Chinese labels
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']

    # One colour per class id: light for the regions, dark for the samples
    cm_light = matplotlib.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
    cm_dark = matplotlib.colors.ListedColormap(['g', 'r', 'b'])

    grid_hat = classifier.predict(grid_test).reshape(x1.shape)  # predicted class per grid point

    # Start from a fresh figure: with a non-interactive backend plt.show()
    # does not discard the previous plot, so later calls would draw over it.
    plt.figure()
    # vmin/vmax pin class ids 0, 1, 2 to the three colours.  Without them each
    # call normalises to the ids it happens to receive, so a grid or test set
    # that lacks one class would be drawn with shifted colours.
    plt.pcolormesh(x1, x2, grid_hat, cmap = cm_light, vmin = 0, vmax = 2)  # decision regions
    plt.scatter(data_x[:, 0], data_x[:, 1], c = label_y[:, 0], s = 30, cmap = cm_dark,
                vmin = 0, vmax = 2)  # all samples
    plt.scatter(test_data[:, 0], test_data[:, 1], c = test_label[:, 0], s = 30, edgecolors = 'k', zorder = 2,
                cmap = cm_dark, vmin = 0, vmax = 2)  # outline the test samples
    plt.xlabel(xlabel, fontsize = 13)
    plt.ylabel(ylabel, fontsize = 13)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.title(title)
    # savefig does not create missing directories
    os.makedirs(os.path.dirname(save_path) or '.', exist_ok = True)
    plt.savefig(save_path)
    plt.show()


def plotSepalShow(test_data, test_label, data_x, label_y, classifier):
    """Plot the decision regions of the sepal length/width classifier.

    Saves the figure to ``./Result/iris-sepal-cal.png`` and shows it.
    Parameters are forwarded unchanged to ``_plot_two_feature_regions``.
    """
    _plot_two_feature_regions(test_data, test_label, data_x, label_y, classifier,
                              xlabel = '花萼长度', ylabel = '花萼宽度',
                              title = '基于SVM鸢尾花萼长宽度二特征分类',
                              save_path = './Result/iris-sepal-cal.png')


def plotPetalShow(test_data, test_label, data_x, label_y, classifier):
    """Plot the decision regions of the petal length/width classifier.

    Saves the figure to ``./Result/iris-petal-cla.png`` and shows it.
    Parameters are forwarded unchanged to ``_plot_two_feature_regions``.
    """
    _plot_two_feature_regions(test_data, test_label, data_x, label_y, classifier,
                              xlabel = '花瓣长度', ylabel = '花瓣宽度',
                              title = '基于SVM鸢尾花瓣长宽度二特征分类',
                              save_path = './Result/iris-petal-cla.png')

Test Results

advanced optimization:

def Iris_Sepal_Petal_Cla(data_x, label_y):
    """Train and evaluate a linear SVM on sepal length, petal length and petal width.

    Keeps columns 0, 2 and 3 of ``data_x``, splits the samples 80/20, fits the
    classifier, prints the training and test accuracy, and finally hands the
    fitted model to ``plotSPShow`` for a 3-D visualisation.

    :param data_x: 2-D feature array; columns 0, 2 and 3 are used
    :param label_y: 2-D column array of class ids, one row per sample
    :return: None
    """
    # Import the submodule explicitly: a bare ``import sklearn`` does not
    # guarantee that ``sklearn.model_selection`` has been loaded.
    from sklearn.model_selection import train_test_split

    data_x = data_x[:, [0, 2, 3]]

    # Stratify on the labels so every class contributes the same share to the
    # training set (40 of its 50 samples), as the exercise requires.
    train_data, test_data, train_label, test_label = train_test_split(data_x, label_y,
                                                                       random_state = 1,
                                                                       train_size = 0.8,
                                                                       test_size = 0.2,
                                                                       stratify = label_y.ravel())
    # Linear kernel: gamma only applies to the rbf/poly/sigmoid kernels, so it
    # is not passed here.
    classifier = svm.SVC(C = 0.5, kernel = 'linear', decision_function_shape = 'ovr')
    # fit() expects a 1-D label vector, the split keeps the (n, 1) column shape
    classifier.fit(train_data, train_label.ravel())

    train_label_pre = classifier.predict(train_data)
    test_label_pre = classifier.predict(test_data)
    print('花萼和花瓣训练集acc:', accuracy_score(train_label, train_label_pre))
    print('花萼和花瓣测试集acc:', accuracy_score(test_label, test_label_pre))

    plotSPShow(classifier, data_x, label_y)

def plotSPShow(clf, x, y):
    """Draw the 3-D decision regions of a three-feature classifier and save the figure.

    A 50 x 50 x 50 grid spanning the range of the three columns of ``x`` is
    classified with ``clf`` and drawn as a faint point cloud; the samples are
    drawn on top as triangles.  Column 1 of ``x`` goes on the x-axis, column 2
    on the y-axis and column 0 on the z-axis.  The figure is written to
    ``./Result/iris-sepal-petal-cla.png`` and then shown.

    :param clf: fitted model exposing ``predict`` on 3-column input
    :param x: 3-column feature array (sepal length, petal length, petal width
              according to the title and the caller in this project)
    :param y: class ids for ``x``; squeezed to 1-D before colouring
    :return: None
    """
    import os  # local import: the module header does not import os

    # Default font able to render the Chinese title and labels; needed here
    # as well because this function may be the only plot that is drawn.
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']

    x0_min, x0_max = x[:, 0].min(), x[:, 0].max()  # range of column 0
    x1_min, x1_max = x[:, 1].min(), x[:, 1].max()  # range of column 1
    x2_min, x2_max = x[:, 2].min(), x[:, 2].max()  # range of column 2
    x0, x1, x2 = np.mgrid[x0_min:x0_max:50j, x1_min:x1_max:50j, x2_min:x2_max:50j]  # 3-D sampling grid
    # Flatten the three coordinate volumes and pair them up: one grid point per row
    grid_test = np.stack((x0.flat, x1.flat, x2.flat), axis=1)
    print('grid_test:\n', grid_test)

    grid_hat = clf.predict(grid_test)  # predicted class id per grid point (1-D, same order as the rows above)
    print('grid_hat:\n', grid_hat)

    # One colour per class id: light for the regions, dark for the samples
    cm_light = matplotlib.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
    cm_dark = matplotlib.colors.ListedColormap(['g', 'r', 'b'])

    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')

    # s: marker size
    # c: colour, a single value or a sequence
    # depthshade: shade the markers to give an impression of depth
    # marker: marker style
    # alpha: opacity between 0 and 1
    # vmin/vmax pin class ids 0, 1, 2 to the three colours; without them a
    # grid on which only two classes are predicted would be drawn with
    # shifted colours.
    ax.scatter(xs=x1.ravel(), ys=x2.ravel(), zs=x0.ravel(), zdir='z', s=10, c=grid_hat, depthshade=True,
               cmap=cm_light, alpha=0.01, vmin=0, vmax=2)
    ax.scatter(xs=x[:,1], ys=x[:,2], zs=x[:,0], zdir='z', s=30, c=np.squeeze(y), depthshade=True,
               cmap=cm_dark, marker="^", vmin=0, vmax=2)
    ax.set_xlabel('花瓣长度')
    ax.set_ylabel('花瓣宽度')
    ax.set_zlabel('花萼长度')
    plt.title('基于SVM鸢尾花萼长度和花瓣长宽度三特征分类')
    # savefig does not create missing directories
    os.makedirs('./Result', exist_ok=True)
    plt.savefig('./Result/iris-sepal-petal-cla.png')
    plt.show()


if __name__ == "__main__":

    # Column 4 holds the species name, so Iris_label turns it into a numeric
    # class id.  encoding = 'bytes' pins the converter input to bytes, which is
    # what the lookup table in Iris_label is keyed on; without it NumPy >= 2.0
    # passes str to converters.
    data = np.loadtxt("./Data/iris.data", dtype = float, delimiter = ',', converters = {4: Iris_label},
                      encoding = 'bytes')
    # The first four columns are the features (x), the last one is the label (y)
    data_x, label_y = np.split(data, indices_or_sections = (4,), axis = 1)

    # SVM classification on three features: sepal length, petal length, petal width
    Iris_Sepal_Petal_Cla(data_x, label_y)

Plot.py collects all of the plotting functions shown above in a single module

# -*- coding: utf-8 -*- #
"""
@Project    :MachineLearningLesson
@File       :plot.py 
@Author     :ZAY
@Time       :2023/6/5 21:41
@Annotation : " "
"""

# 确定坐标轴范围
import matplotlib
import matplotlib.pyplot as plt
import numpy as np


def _plot_two_feature_regions(test_data, test_label, data_x, label_y, classifier,
                              xlabel, ylabel, title, save_path):
    """Draw the decision regions of a two-feature classifier and save the figure.

    Shared implementation of plotSepalShow and plotPetalShow, which differ
    only in their axis labels, title and output file.

    :param test_data: 2-column array of test samples (drawn with a black edge)
    :param test_label: 2-D column array of class ids for ``test_data``
    :param data_x: 2-column array of all samples; also fixes the axis range
    :param label_y: 2-D column array of class ids for ``data_x``
    :param classifier: fitted model exposing ``predict`` on 2-column input
    :param xlabel: x-axis label
    :param ylabel: y-axis label
    :param title: figure title
    :param save_path: file the figure is written to
    :return: None
    """
    import os  # local import: the module header does not import os

    x1_min, x1_max = data_x[:, 0].min(), data_x[:, 0].max()  # range of the first feature
    x2_min, x2_max = data_x[:, 1].min(), data_x[:, 1].max()  # range of the second feature
    x1, x2 = np.mgrid[x1_min:x1_max:200j, x2_min:x2_max:200j]  # 200 x 200 sampling grid
    grid_test = np.column_stack((x1.ravel(), x2.ravel()))  # (40000, 2) grid points

    # Default font able to render the Chinese labels
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']

    # One colour per class id: light for the regions, dark for the samples
    cm_light = matplotlib.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
    cm_dark = matplotlib.colors.ListedColormap(['g', 'r', 'b'])

    grid_hat = classifier.predict(grid_test).reshape(x1.shape)  # predicted class per grid point

    # Start from a fresh figure: with a non-interactive backend plt.show()
    # does not discard the previous plot, so later calls would draw over it.
    plt.figure()
    # vmin/vmax pin class ids 0, 1, 2 to the three colours.  Without them each
    # call normalises to the ids it happens to receive, so a grid or test set
    # that lacks one class would be drawn with shifted colours.
    plt.pcolormesh(x1, x2, grid_hat, cmap = cm_light, vmin = 0, vmax = 2)  # decision regions
    plt.scatter(data_x[:, 0], data_x[:, 1], c = label_y[:, 0], s = 30, cmap = cm_dark,
                vmin = 0, vmax = 2)  # all samples
    plt.scatter(test_data[:, 0], test_data[:, 1], c = test_label[:, 0], s = 30, edgecolors = 'k', zorder = 2,
                cmap = cm_dark, vmin = 0, vmax = 2)  # outline the test samples
    plt.xlabel(xlabel, fontsize = 13)
    plt.ylabel(ylabel, fontsize = 13)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.title(title)
    # savefig does not create missing directories
    os.makedirs(os.path.dirname(save_path) or '.', exist_ok = True)
    plt.savefig(save_path)
    plt.show()


def plotSepalShow(test_data, test_label, data_x, label_y, classifier):
    """Plot the decision regions of the sepal length/width classifier.

    Saves the figure to ``./Result/iris-sepal-cal.png`` and shows it.
    Parameters are forwarded unchanged to ``_plot_two_feature_regions``.
    """
    _plot_two_feature_regions(test_data, test_label, data_x, label_y, classifier,
                              xlabel = '花萼长度', ylabel = '花萼宽度',
                              title = '基于SVM鸢尾花萼长宽度二特征分类',
                              save_path = './Result/iris-sepal-cal.png')


def plotPetalShow(test_data, test_label, data_x, label_y, classifier):
    """Plot the decision regions of the petal length/width classifier.

    Saves the figure to ``./Result/iris-petal-cla.png`` and shows it.
    Parameters are forwarded unchanged to ``_plot_two_feature_regions``.
    """
    _plot_two_feature_regions(test_data, test_label, data_x, label_y, classifier,
                              xlabel = '花瓣长度', ylabel = '花瓣宽度',
                              title = '基于SVM鸢尾花瓣长宽度二特征分类',
                              save_path = './Result/iris-petal-cla.png')

def plotSPShow(clf, x, y):
    """Draw the 3-D decision regions of a three-feature classifier and save the figure.

    A 50 x 50 x 50 grid spanning the range of the three columns of ``x`` is
    classified with ``clf`` and drawn as a faint point cloud; the samples are
    drawn on top as triangles.  Column 1 of ``x`` goes on the x-axis, column 2
    on the y-axis and column 0 on the z-axis.  The figure is written to
    ``./Result/iris-sepal-petal-cla.png`` and then shown.

    :param clf: fitted model exposing ``predict`` on 3-column input
    :param x: 3-column feature array (sepal length, petal length, petal width
              according to the title and the caller in this project)
    :param y: class ids for ``x``; squeezed to 1-D before colouring
    :return: None
    """
    import os  # local import: the module header does not import os

    # Default font able to render the Chinese title and labels; needed here
    # as well because this function may be the only plot that is drawn.
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']

    x0_min, x0_max = x[:, 0].min(), x[:, 0].max()  # range of column 0
    x1_min, x1_max = x[:, 1].min(), x[:, 1].max()  # range of column 1
    x2_min, x2_max = x[:, 2].min(), x[:, 2].max()  # range of column 2
    x0, x1, x2 = np.mgrid[x0_min:x0_max:50j, x1_min:x1_max:50j, x2_min:x2_max:50j]  # 3-D sampling grid
    # Flatten the three coordinate volumes and pair them up: one grid point per row
    grid_test = np.stack((x0.flat, x1.flat, x2.flat), axis=1)
    print('grid_test:\n', grid_test)

    grid_hat = clf.predict(grid_test)  # predicted class id per grid point (1-D, same order as the rows above)
    print('grid_hat:\n', grid_hat)

    # One colour per class id: light for the regions, dark for the samples
    cm_light = matplotlib.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
    cm_dark = matplotlib.colors.ListedColormap(['g', 'r', 'b'])

    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')

    # s: marker size
    # c: colour, a single value or a sequence
    # depthshade: shade the markers to give an impression of depth
    # marker: marker style
    # alpha: opacity between 0 and 1
    # vmin/vmax pin class ids 0, 1, 2 to the three colours; without them a
    # grid on which only two classes are predicted would be drawn with
    # shifted colours.
    ax.scatter(xs=x1.ravel(), ys=x2.ravel(), zs=x0.ravel(), zdir='z', s=10, c=grid_hat, depthshade=True,
               cmap=cm_light, alpha=0.01, vmin=0, vmax=2)
    ax.scatter(xs=x[:,1], ys=x[:,2], zs=x[:,0], zdir='z', s=30, c=np.squeeze(y), depthshade=True,
               cmap=cm_dark, marker="^", vmin=0, vmax=2)
    ax.set_xlabel('花瓣长度')
    ax.set_ylabel('花瓣宽度')
    ax.set_zlabel('花萼长度')
    plt.title('基于SVM鸢尾花萼长度和花瓣长宽度三特征分类')
    # savefig does not create missing directories
    os.makedirs('./Result', exist_ok=True)
    plt.savefig('./Result/iris-sepal-petal-cla.png')
    plt.show()

Test Results

Please private message for experimental data and complete code 

Guess you like

Origin blog.csdn.net/Next_SummerAgain/article/details/131056996