Iris data set classification with a decision tree

Decision tree

A decision tree is a basic method for both classification and regression. When used for classification it is called a classification tree; when used for regression, a regression tree. This post focuses on classification trees.
A decision tree consists of nodes and directed edges. There are two kinds of nodes: internal nodes, each of which tests a feature or attribute (for example, "petal length ≤ 2.45 cm"), and leaf nodes, each of which represents a class.
The decision tree algorithm recursively selects the optimal feature and splits the training data on that feature, so that each resulting subset is classified as well as possible. Under the information-gain criterion, feature selection works as follows: for the training set (or a subset of it), compute the information gain of each feature, compare the values, and select the feature with the largest gain.
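
As a minimal sketch (not from the original post), the information-gain computation might look like this in Python; the names entropy and information_gain and the toy arrays are illustrative assumptions.

import numpy as np

def entropy(y):
    # Shannon entropy H(Y) = -sum_k p_k * log2(p_k) for integer-coded labels
    p = np.bincount(y) / len(y)
    p = p[p > 0]
    return -np.sum(p * np.log2(p))

def information_gain(x, y):
    # Gain(Y, X) = H(Y) - sum_v P(X=v) * H(Y | X=v) for a discrete feature x
    gain = entropy(y)
    for v in np.unique(x):
        mask = (x == v)
        gain -= mask.mean() * entropy(y[mask])
    return gain

# Toy check: a feature that separates the labels perfectly has gain equal to H(Y)
x_toy = np.array([0, 0, 1, 1])
y_toy = np.array([0, 0, 1, 1])
print(information_gain(x_toy, y_toy))  # 1.0 for a 50/50 label split

At each node, the feature with the largest gain becomes the split; this is the same criterion that criterion='entropy' selects in the code below.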

Data set

(Figure: a preview of the iris.data file, which the code below loads)
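
For reference, iris.data from the UCI repository has no header row; each line holds the four numeric features followed by the class name, e.g.:

5.1,3.5,1.4,0.2,Iris-setosa
7.0,3.2,4.7,1.4,Iris-versicolor
6.3,3.3,6.0,2.5,Iris-virginica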

Code

Classification at different depths

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Feature names: sepal length, sepal width, petal length, petal width
iris_feature_E = 'sepal length', 'sepal width', 'petal length', 'petal width'
# Chinese feature names, used as plot labels (rendered with the SimHei font)
iris_feature = u'花萼长度', u'花萼宽度', u'花瓣长度', u'花瓣宽度'
iris_class = 'Iris-setosa', 'Iris-versicolor', 'Iris-virginica'

if __name__ == "__main__":
    # Use the SimHei font so the Chinese plot labels render correctly
    mpl.rcParams['font.sans-serif'] = [u'SimHei']
    mpl.rcParams['axes.unicode_minus'] = False

    '''Load the data'''
    # A raw string avoids backslash-escape problems in the Windows path
    data = pd.read_csv(r'F:\pythonlianxi\shuju\iris.data', header=None)
    #print(data)
    # Feature matrix: the first four columns
    x = data[range(4)]
    # Label vector: encode the class names as integer codes
    y = pd.Categorical(data[4]).codes
    # For visualization, keep only the first two feature columns
    x = x.iloc[:, :2]
    # Split the samples and labels into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=1)
    #print(y_test.shape)
    print('Training the model...')
    #
    '''Decision tree'''
    # Decision tree hyperparameters (for reference; see the commented example below):
    # min_samples_split = 10: a node may be split only if it contains more than 10 samples
    # min_samples_leaf = 10: a split is kept only if every child node it creates contains more than 10 samples
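    # A hedged example (not run in the original post) of passing these
    # pre-pruning parameters to the classifier:
    # model = DecisionTreeClassifier(criterion='entropy',
    #                                min_samples_split=10, min_samples_leaf=10)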
    # Build the decision tree model
    model = DecisionTreeClassifier(criterion='entropy')
    model.fit(x_train, y_train)
    # Predict on the test data
    y_test_hat = model.predict(x_test)

    # Number of sampling points along each axis
    N, M = 50, 50
    x1_min, x2_min = x.min()
    x1_max, x2_max = x.max()
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, M)
    # Generate the grid of sampling points
    x1, x2 = np.meshgrid(t1, t2)
    # Flatten the grid into an (N*M, 2) array of test points
    x_show = np.stack((x1.flat, x2.flat), axis=1)

    # Colormaps for the decision regions and for the sample points
    cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
    # Predict the class of every grid point
    y_show_hat = model.predict(x_show)
    # Reshape to match the grid
    y_show_hat = y_show_hat.reshape(x1.shape)
    # print(y_show_hat)

    '''Plotting'''
    plt.figure(facecolor='w')
    plt.pcolormesh(x1, x2, y_show_hat, cmap=cm_light)  # predicted decision regions
    plt.scatter(x_test[0], x_test[1], c=y_test.ravel(), edgecolors='k', s=150, zorder=10, cmap=cm_dark, marker='*')  # test data
    plt.scatter(x[0], x[1], c=y.ravel(), edgecolors='k', s=40, cmap=cm_dark)  # all data
    plt.xlabel(iris_feature[0], fontsize=15)
    plt.ylabel(iris_feature[1], fontsize=15)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.grid(True)
    plt.title(u'鸢尾花数据的决策树分类', fontsize=17)  # 'Decision tree classification of the iris data'
    plt.show()
    #
    '''Evaluate on the test set'''
    # Prediction results on the test set
    y_test = y_test.reshape(-1)
    print(y_test_hat)
    print(y_test)
    result = (y_test_hat == y_test)   # True: correct prediction; False: wrong prediction
    # Accuracy is the mean of the boolean results
    acc = np.mean(result)
    print('Accuracy: %.2f%%' % (100 * acc))

    '''Overfitting'''
    # Overfitting: error rate as a function of tree depth
    # Sweep maximum depths from 1 to 14
    depth = np.arange(1, 15)
    err_list = []
    # Train and evaluate a tree at each depth
    for d in depth:
        clf = DecisionTreeClassifier(criterion='entropy', max_depth=d)
        clf.fit(x_train, y_train)
        y_test_hat = clf.predict(x_test)  # predict on the test data
        result = (y_test_hat == y_test)  # True: correct prediction; False: wrong prediction
        if d == 1:
            print (result)
        err = 1 - np.mean(result)
        err_list.append(err)
        print(d, ' error rate: %.2f%%' % (100 * err))
    
    '''Plot error rate vs. depth'''
    plt.figure(facecolor='w')
    plt.plot(depth, err_list, 'ro-', lw=2)
    plt.xlabel(u'决策树深度', fontsize=15)  # 'decision tree depth'
    plt.ylabel(u'错误率', fontsize=15)  # 'error rate'
    plt.title(u'决策树深度与过拟合', fontsize=17)  # 'decision tree depth and overfitting'
    plt.grid(True)
    plt.show()

Experiment analysis

(Figures: the decision-region plot on sepal length vs. sepal width, with all samples as dots and test samples as stars; the printed predicted and true test labels.)

With only the first two features, the test accuracy is 62.22%. The depth sweep shows the error rate is lowest at a tree depth of about 3; deeper trees fit the training data more closely but generalize worse (overfitting).
(Figures: the per-depth error rates printed to the console; the error-rate curve as a function of decision tree depth.)
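
For comparison, here is a short sketch (not part of the original post) that trains on all four features at the depth suggested by the error curve; it reuses the data and y variables and the imports from the script above, and the accuracy it prints will depend on the split.

# Sketch: all four features, max_depth=3 (assumes `data` and `y` from above)
x_all = data[range(4)]
xa_train, xa_test, ya_train, ya_test = train_test_split(
    x_all, y, train_size=0.7, random_state=1)
clf3 = DecisionTreeClassifier(criterion='entropy', max_depth=3)
clf3.fit(xa_train, ya_train)
acc_all = np.mean(clf3.predict(xa_test) == ya_test)
print('Accuracy (4 features, depth 3): %.2f%%' % (100 * acc_all))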

Origin blog.csdn.net/weixin_42567027/article/details/107487428