[Machine Learning] Decision Trees (ID3, C4.5, and CART classification/regression trees): a Python 3 implementation

This post contains the core parts of the three algorithms.

I haven't found particularly good test data, but that is not a big problem for working through the ideas behind each algorithm.

The only pruning implemented so far is post-pruning of the CART regression tree.
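
For reference, the split criteria used by the code below are:

ID3 (entropy / information gain): H(D) = -sum(p_i * log(p_i)); the feature with the largest gain H(D) - H(D|A) is chosen.

C4.5 (information gain ratio): (H(D) - H(D|A)) / H_A(D), where H_A(D) is the entropy of the partition itself.

CART classification tree (Gini index): Gini(D) = 1 - sum(p_i^2); the split minimizing the weighted Gini of the two child sets is chosen.

CART regression tree (squared error): the split minimizing the total squared error of the two child sets is chosen.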

import numpy as np
from collections import Counter

class DecisionTree():
    def __init__(self, algorithm='ID3'):
        # algorithm: 'ID3', 'C4.5', 'CARTcla' (CART classification tree) or 'CARTreg' (CART regression tree)
        self.algorithm = algorithm

    def cal_entroy(self, dataSet):
        '''
        Compute the empirical entropy of a dataset (np.array whose last column is the class label).
        :param dataSet: m*(n+1) array, m samples, n features plus a label column
        :return: empirical entropy of the dataset
        '''
        m = dataSet.shape[0]  # number of samples
        labels = Counter(dataSet[:, -1].reshape(m).tolist())  # class labels and their counts
        entroy = 0  # initialize the empirical entropy
        for amount in labels.values():
            prob = amount / m  # class probability p_i
            entroy -= prob * np.log(prob)  # H = -sum(p_i * log(p_i)); natural log, the base does not change which split wins
        return entroy

    def cal_gini(self, dataSet):
        '''
        Compute the Gini index of a dataset (np.array whose last column is the class label).
        :param dataSet: m*(n+1) array, m samples, n features plus a label column
        :return: Gini index of the dataset
        '''
        m = dataSet.shape[0]
        labels = Counter(dataSet[:, -1].reshape(m).tolist())
        gini = 1
        for amount in labels.values():
            prob = amount / m
            gini -= prob**2  # Gini = 1 - sum(p_i ** 2)
        return gini

    def cal_se(self, dataSet):
        '''
        Compute the total squared error of the dataset's target column.
        np.var gives the mean squared deviation; multiplying by the number of samples
        yields the total squared error (sum of squared deviations from the mean).
        :param dataSet: m*(n+1) array, m samples, n features plus a target column
        :return: total squared error of the dataset
        '''
        return np.var(dataSet[:, -1]) * dataSet.shape[0] if dataSet.shape[0] > 0 else 0

    def split_dataSet(self, dataSet, feature, value):
        '''
        Split the dataset on feature `feature` at value `value`.
        For ID3/C4.5 the split is by equality and the feature column is dropped;
        for CART the data is split into two subsets (by equality for the classification
        tree, by <= / > for the regression tree) and the feature column is kept.
        :param dataSet: m*(n+1) array, m samples, n features plus a label column
        :param feature: index of the feature to split on
        :param value: a value of that feature
        :return: for ID3/C4.5, the subset with dataSet[feature] == value, without the feature column;
                 for CART, the (left, right) subsets
        '''
        if self.algorithm == 'ID3' or self.algorithm == 'C4.5':
            # keep the samples whose feature equals the given value, then drop that feature column
            mask = dataSet[:, feature] == value
            return np.delete(dataSet[mask], feature, axis=1)
        else:  # CART: split into two subsets
            if self.algorithm == 'CARTcla':  # CART classification tree, discrete feature values
                left = dataSet[np.nonzero(dataSet[:, feature] == value)[0], :]
                right = dataSet[np.nonzero(dataSet[:, feature] != value)[0], :]
            else:  # CART regression tree, continuous feature values
                left = dataSet[np.nonzero(dataSet[:, feature] <= value)[0], :]
                right = dataSet[np.nonzero(dataSet[:, feature] > value)[0], :]
            return left, right


    def cal_entroy_gain(self, baseEnt, dataSet, feature):
        '''
        Compute the information gain, used by ID3.
        :param baseEnt: empirical entropy of the original dataset
        :param dataSet: m*(n+1) array, m samples, n features plus a label column
        :param feature: index of the feature to split on
        :return: information gain obtained by splitting on the given feature
        '''
        newEnt = 0
        values = np.unique(dataSet[:, feature])  # possible values of the feature
        for value in values:
            splitData = self.split_dataSet(dataSet, feature, value)
            prob = splitData.shape[0] / dataSet.shape[0]
            newEnt += prob * self.cal_entroy(splitData)  # conditional entropy H(D|A) = sum(|Di|/|D| * H(Di))
        return baseEnt - newEnt

    def cal_entroy_gain_rate(self, baseEnt, dataSet, feature):
        '''
        Compute the information gain ratio, used by C4.5.
        :param baseEnt: empirical entropy of the original dataset
        :param dataSet: m*(n+1) array, m samples, n features plus a label column
        :param feature: index of the feature to split on
        :return: information gain ratio obtained by splitting on the given feature
        '''
        newEnt, splitEnt = 0, 0
        values = np.unique(dataSet[:, feature])
        for value in values:
            splitData = self.split_dataSet(dataSet, feature, value)
            prob = splitData.shape[0] / dataSet.shape[0]
            newEnt += prob * self.cal_entroy(splitData)  # conditional entropy H(D|A)
            splitEnt -= prob * np.log(prob)  # split entropy H_A(D)
        if splitEnt == 0:  # the feature takes a single value and carries no information
            return 0
        return (baseEnt - newEnt) / splitEnt

    def cal_split_gini(self, dataSet, feature):
        '''
        For a given feature, find the split value that minimizes the weighted Gini index
        of the two resulting subsets. Used by the CART classification tree.
        :param dataSet: m*(n+1) array, m samples, n features plus a label column
        :param feature: index of the feature to split on
        :return: the minimum Gini index and its corresponding split value
        '''
        values = np.unique(dataSet[:, feature])
        minGini, minValue = np.inf, 0
        for value in values:
            left, right = self.split_dataSet(dataSet, feature, value)
            newGini = left.shape[0] / dataSet.shape[0] * self.cal_gini(left) + right.shape[0] / dataSet.shape[0] * self.cal_gini(right)
            if newGini < minGini:
                minGini = newGini
                minValue = value
        return minGini, minValue

    def cal_split_se(self, dataSet, feature):
        '''
        For a given feature, find the split value that minimizes the total squared error
        of the two resulting subsets. Used by the CART regression tree.
        :param dataSet: m*(n+1) array, m samples, n features plus a target column
        :param feature: index of the feature to split on
        :return: the minimum total squared error and its corresponding split value
        '''
        values = np.unique(dataSet[:, feature])
        minSe, minValue = np.inf, 0
        for value in values:
            left, right = self.split_dataSet(dataSet, feature, value)
            newSe = self.cal_se(left) + self.cal_se(right)
            if newSe < minSe:
                minSe = newSe
                minValue = value
        return minSe, minValue

    def choose_best_feature(self, dataSet):
        '''
        Choose the feature that best splits the data according to the selected algorithm.
        :param dataSet: m*(n+1) array, m samples, n features plus a label column
        :return: for ID3 and C4.5, the index of the best feature;
                 for the CART classification and regression trees, the index of the best feature and its split value
        '''
        n = dataSet.shape[1] - 1  # number of features
        baseEnt = self.cal_entroy(dataSet) if self.algorithm in ('ID3', 'C4.5') else 0  # base entropy is only needed by ID3/C4.5
        deltaGini, deltaInfo = np.inf, -np.inf  # the former is used by CART, the latter by ID3 and C4.5
        bestFeature, bestValue = -1, 0  # index and value of the best feature found so far
        for feature in range(n):
            if self.algorithm == 'ID3':
                newDeltaInfo = self.cal_entroy_gain(baseEnt, dataSet, feature)
                if newDeltaInfo > deltaInfo:
                    bestFeature = feature
                    deltaInfo = newDeltaInfo
            elif self.algorithm == 'C4.5':
                newDeltaInfo = self.cal_entroy_gain_rate(baseEnt, dataSet, feature)
                if newDeltaInfo > deltaInfo:
                    bestFeature = feature
                    deltaInfo = newDeltaInfo
            elif self.algorithm == 'CARTcla':
                newGini, value = self.cal_split_gini(dataSet, feature)
                if newGini < deltaGini:
                    deltaGini = newGini
                    bestValue = value
                    bestFeature = feature
            else:  # CART regression tree
                newSe, value = self.cal_split_se(dataSet, feature)
                if newSe < deltaGini:
                    deltaGini = newSe
                    bestValue = value
                    bestFeature = feature
        if self.algorithm == 'ID3' or self.algorithm == 'C4.5':
            return bestFeature
        else:
            return bestFeature, bestValue

    def training(self, dataSet, featureLabel=None):
        '''
        Train the model, i.e. build the decision tree. A dict is used as the tree data structure.
        ID3 and C4.5 build N-ary trees; CART builds a binary tree.
        :param dataSet: m*(n+1) array, m samples, n features plus a label column
        :param featureLabel: list of feature names corresponding to the column indices; if not given, the column indices themselves are used.
        :return: the decision tree as a nested dict
        '''
        dataSet = np.array(dataSet)
        targets = dataSet[:, -1]
        if np.unique(targets).shape[0] == 1:  # only one class left in the label column, return it
            return targets[0]
        if dataSet.shape[1] == 1:  # no features left to split on, return the majority class
            return Counter(targets.tolist()).most_common(1)[0][0]
        if featureLabel is None: featureLabel = [i for i in range(dataSet.shape[1] - 1)]  # fall back to the column indices if no name list is given

        if self.algorithm == 'ID3' or self.algorithm == 'C4.5':
            bestFeature = self.choose_best_feature(dataSet)  # index of the best splitting feature
            bestFeatureLabel = featureLabel[bestFeature]  # its name
            featureLabel_copy = featureLabel.copy()  # avoid modifying the caller's list
            featureLabel_copy.pop(bestFeature)  # the name list is passed to the subtrees, so drop this entry (otherwise the indices would no longer line up with the remaining features)
            mytree = {bestFeatureLabel: {}}  # create the root node
            values = np.unique(dataSet[:, bestFeature])
            for value in values:  # build one subtree per value of the best feature
                sublabel = featureLabel_copy[:]  # name list for the subtree
                mytree[bestFeatureLabel][value] = self.training(self.split_dataSet(dataSet, bestFeature, value), sublabel)
        else:
            bestFeature, bestValue = self.choose_best_feature(dataSet)
            bestFeatureLabel = featureLabel[bestFeature]
            mytree = {}
            mytree['FeatLabel'] = bestFeatureLabel  # the feature chosen at this node
            mytree['FeatValue'] = bestValue  # the split value of that feature
            lSet, rSet = self.split_dataSet(dataSet, bestFeature, bestValue)
            mytree['left'] = self.training(lSet, featureLabel)  # build the left subtree
            mytree['right'] = self.training(rSet, featureLabel)  # build the right subtree
        return mytree

    def predict(self, tree, testData, featureLabel=None):
        '''
        Predict a single sample with a trained decision tree. To predict a whole dataset,
        call this once per sample and combine the results.
        :param tree: trained decision tree
        :param testData: a single sample, a 1-D array of its n feature values
        :param featureLabel: list of feature names corresponding to the column indices; if not given, the column indices themselves are used.
        :return: the predicted value
        '''
        if not isinstance(tree, dict):  # stopping condition: a leaf node has been reached, return its value
            return tree
        if featureLabel is None: featureLabel = [i for i in range(len(testData))]
        if self.algorithm == 'ID3' or self.algorithm == 'C4.5':
            bestFeatureLabel = list(tree.keys())[0]  # name of the feature this node splits on
            bestFeature = featureLabel.index(bestFeatureLabel)  # its column index
            subTree = tree[bestFeatureLabel]  # subtrees keyed by feature value
            valueOffeat = subTree[testData[bestFeature]]  # follow the branch matching the sample's feature value
            return self.predict(valueOffeat, testData, featureLabel)
        else:
            bestFeatureLabel = tree['FeatLabel']
            bestFeature = featureLabel.index(bestFeatureLabel)
            if self.algorithm == 'CARTcla':  # CART classification tree
                if testData[bestFeature] == tree['FeatValue']:
                    return self.predict(tree['left'], testData, featureLabel)
                else:
                    return self.predict(tree['right'], testData, featureLabel)
            else:  # CART regression tree
                if testData[bestFeature] <= tree['FeatValue']:
                    return self.predict(tree['left'], testData, featureLabel)
                else:
                    return self.predict(tree['right'], testData, featureLabel)

    def prune(self, tree, testData):
        '''
        Post-prune a trained CART regression tree using a test set.
        :param tree: trained decision tree
        :param testData: test data, m*(n+1) array including the target column
        :return: the pruned decision tree
        '''
        def istree(tree):  # is this node still a subtree (i.e. not a leaf)?
            return isinstance(tree, dict)

        def getmean(tree):  # collapse a subtree to the mean of its leaf values
            if istree(tree['left']): tree['left'] = getmean(tree['left'])
            if istree(tree['right']): tree['right'] = getmean(tree['right'])
            return (tree['left'] + tree['right']) / 2

        if self.algorithm == 'CARTreg':
            if testData.shape[0] == 0:  # no test data reaches this node: collapse the subtree to the mean of its leaves
                return getmean(tree)
            if istree(tree['left']) or istree(tree['right']):
                # note: FeatLabel is used as a column index here, which assumes training used the default featureLabel (column indices)
                left, right = self.split_dataSet(testData, tree['FeatLabel'], tree['FeatValue'])
            if istree(tree['left']): tree['left'] = self.prune(tree['left'], left)  # prune the left subtree
            if istree(tree['right']): tree['right'] = self.prune(tree['right'], right)  # prune the right subtree
            if not istree(tree['left']) and not istree(tree['right']):  # both children are leaves: decide whether to merge them
                left, right = self.split_dataSet(testData, tree['FeatLabel'], tree['FeatValue'])
                errorNomerge = np.sum(np.power(left[:, -1] - tree['left'], 2)) + np.sum(np.power(right[:, -1] - tree['right'], 2))
                treeMean = (tree['left'] + tree['right']) / 2
                errorMerge = np.sum(np.power(testData[:, -1] - treeMean, 2))
                if errorMerge <= errorNomerge:  # merge if it gives a smaller (or equal) error on the test data
                    return treeMean
                else: return tree
            return tree



def test():
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    dataSet2 = np.array([[1, 5.56], [2, 5.70], [3, 5.91], [4, 6.40], [5, 6.80], [6, 7.05], [7, 8.90], [8, 8.70], [9, 9.00], [10, 9.05]])
    feature_label = ['No Surfacing', 'Flippers']

    DT = DecisionTree(algorithm='ID3')
    DT2 = DecisionTree(algorithm='CARTreg')
    print(DT.training(dataSet, feature_label))
    print(DT2.training(dataSet2))
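
To round things off, here is a minimal usage sketch (not part of the original post) showing how the trained trees could be queried with predict and post-pruned with prune. The single test sample and the small hold-out set below are made up purely for illustration. Note that np.array() upcasts the mixed int/string toy data to strings, so the ID3 test sample has to use string values to match the tree's branch keys.

def demo_predict_and_prune():
    # ID3: train on the toy classification data and predict one sample
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    feature_label = ['No Surfacing', 'Flippers']
    DT = DecisionTree(algorithm='ID3')
    tree = DT.training(dataSet, feature_label)
    sample = np.array(['1', '0'])  # string values, matching the string-typed branch keys
    print(DT.predict(tree, sample, feature_label))  # expected: 'no'

    # CART regression tree: train, then post-prune with a made-up hold-out set
    dataSet2 = np.array([[1, 5.56], [2, 5.70], [3, 5.91], [4, 6.40], [5, 6.80],
                         [6, 7.05], [7, 8.90], [8, 8.70], [9, 9.00], [10, 9.05]])
    DT2 = DecisionTree(algorithm='CARTreg')
    tree2 = DT2.training(dataSet2)
    holdout = np.array([[2.5, 5.8], [6.5, 7.2], [8.5, 8.9]])  # hypothetical hold-out data
    print(DT2.prune(tree2, holdout))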

Reposted from blog.csdn.net/zhenghaitian/article/details/83747745