Decision tree summary - the construction process of ID3, C4.5 and CART (6)

The original code is available at: github.com/Erikfather/…

Interpretation of the shared code (used by ID3 and, later, by C4.5 and CART):

Information entropy

The formula for calculating information entropy is:

$$\mathrm{Ent}(D) = -\sum_{k=1}^{|\mathcal{Y}|} p_k \log_2 p_k$$

We take the dataset, pull out the class label in the last column of every row, and count the labels. Suppose there are 5 samples in class 0 and 7 in class 1; the information entropy is then:
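Plugging in the numbers above as a quick hand check (my own arithmetic, not part of the original post), with $p_0 = 5/12$ and $p_1 = 7/12$:

$$\mathrm{Ent}(D) = -\tfrac{5}{12}\log_2\tfrac{5}{12} - \tfrac{7}{12}\log_2\tfrac{7}{12} \approx 0.526 + 0.454 \approx 0.980$$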

from math import log  # log(p, 2) gives the base-2 logarithm used below

# Compute the information entropy of a dataset (the class label is the last column of each row)
def cal_entropy(dataset):
    numEntries = len(dataset)
    labelCounts = {}
    # count how many samples fall into each class
    for featVec in dataset:
        currentlabel = featVec[-1]
        if currentlabel not in labelCounts.keys():
            labelCounts[currentlabel] = 0
        labelCounts[currentlabel] += 1
    # Ent = -sum(p * log2(p)) over all classes
    Ent = 0.0
    for key in labelCounts:
        p = float(labelCounts[key]) / numEntries
        Ent = Ent - p * log(p, 2)  # log base 2
    return Ent
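A quick sanity check of cal_entropy, using made-up rows that match the 5-vs-7 example above (the rows themselves are not from the original dataset):

# one dummy feature, 5 samples of class '0' and 7 of class '1'
toy = [[0, '0']] * 5 + [[0, '1']] * 7
print(cal_entropy(toy))  # ~0.980, matching the hand calculation above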

Divide the dataset

Suppose a feature takes the values 0, 1 and 2. To split on the value 0, this function selects the rows whose value in that feature column equals 0, removes that column from each selected row, and returns the reduced dataset.

# Split the dataset: keep the rows whose value in column `axis` equals `value`, then drop that column
def splitdataset(dataset, axis, value):
    retdataset = []  # the sub-dataset to return
    for featVec in dataset:  # pick out the rows matching the split value
        if featVec[axis] == value:
            reducedfeatVec = featVec[:axis]  # remove the axis-th feature...
            reducedfeatVec.extend(featVec[axis + 1:])  # ...and stitch the remaining columns back together
            retdataset.append(reducedfeatVec)
    return retdataset
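A small illustration with made-up rows (not from the original dataset):

rows = [[0, 'sunny', 'yes'],
        [1, 'rainy', 'no'],
        [0, 'rainy', 'yes']]
# split on feature 0 with value 0: keep rows 1 and 3, drop column 0
print(splitdataset(rows, 0, 0))  # [['sunny', 'yes'], ['rainy', 'yes']]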

The ID3 flow (calculation process)

The full original code is in the repository linked above.

Picking the best feature
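As a reminder, the information gain that the function below maximises is the standard one, where $D^v$ is the subset of $D$ whose value of feature $a$ is $v$:

$$\mathrm{Gain}(D, a) = \mathrm{Ent}(D) - \sum_{v=1}^{V} \frac{|D^v|}{|D|}\,\mathrm{Ent}(D^v)$$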
# ID3 algorithm: choose the split feature by information gain
def ID3_chooseBestFeatureToSplit(dataset):
    # number of features (the last column is the class label)
    numFeatures = len(dataset[0]) - 1

    # entropy of the whole dataset (needed below to compute the information gain)
    baseEnt = cal_entropy(dataset)

    # ID3 selects the split attribute with the largest information gain
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):  # iterate over all features

        # all values of this single feature (deduplicated with a set)
        featList = [example[i] for example in dataset]
        uniqueVals = set(featList)  # unique values of the feature
        newEnt = 0.0

        # split the dataset on each value and sum the weighted entropies of the subsets
        for value in uniqueVals:  # entropy of each split
            subdataset = splitdataset(dataset, i, value)
            p = len(subdataset) / float(len(dataset))
            newEnt += p * cal_entropy(subdataset)

        # information gain
        infoGain = baseEnt - newEnt
        print(u"ID3: information gain of feature %d is %.3f" % (i, infoGain))

        # keep the best feature seen so far
        if (infoGain > bestInfoGain):
            bestInfoGain = infoGain  # best information gain so far
            bestFeature = i

    # finally return the feature with the largest information gain
    return bestFeature
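A tiny smoke test (the rows below are invented for illustration): feature 1 perfectly separates the labels, so it should be chosen.

toy = [['youth', 'no',  'no'],
       ['youth', 'yes', 'yes'],
       ['old',   'yes', 'yes'],
       ['old',   'no',  'no']]
print(ID3_chooseBestFeatureToSplit(toy))  # -> 1 (information gain 1.0 vs 0.0 for feature 0)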
Building the tree (control flow)
----------------------------------------------------------
The tree is built recursively: pick a feature, split on it, and build each subtree the same way.
----------------------------------------------------------

# Build a decision tree with the ID3 algorithm
def ID3_createTree(dataset, labels, test_dataset):
    # collect the class labels of all samples
    classList = [example[-1] for example in dataset]

    # recursion stop conditions: all labels are identical, or only the label column is left
    if classList.count(classList[0]) == len(classList):
        # all classes identical, stop splitting
        return classList[0]
    if len(dataset[0]) == 1:
        # all features used up: return the most common class
        return majorityCnt(classList)

    # pick the best feature (returns a column index)
    bestFeat = ID3_chooseBestFeatureToSplit(dataset)
    # map the index back to a human-readable feature name
    bestFeatLabel = labels[bestFeat]
    print(u"current best feature: " + bestFeatLabel)

    # initialise the ID3 tree node
    ID3Tree = {bestFeatLabel: {}}
    # remove the chosen feature from the label list
    del (labels[bestFeat])
    # all values this feature takes in the current dataset
    featValues = [example[bestFeat] for example in dataset]
    uniqueVals = set(featValues)

    # pre-pruning: decide whether splitting on this feature is worth it at all
    if pre_pruning:
        ans = []
        for index in range(len(test_dataset)):
            ans.append(test_dataset[index][-1])
        result_counter = Counter()
        for vec in dataset:
            result_counter[vec[-1]] += 1
        # result_counter.most_common(1) looks like [(label, count)]: take the majority label
        leaf_output = result_counter.most_common(1)[0][0]

        # baseline accuracy: predict leaf_output (the majority class) for the whole test set
        root_acc = cal_acc(test_output=[leaf_output] * len(test_dataset), label=ans)
        outputs = []
        ans = []

        # Split on each value of the chosen feature and check whether accuracy improves.
        # For every branch, the majority label of the training subset becomes that branch's
        # prediction; the test samples falling into the branch are scored against it, and the
        # overall accuracy after the split is compared with the baseline above.
        for value in uniqueVals:
            cut_testset = splitdataset(test_dataset, bestFeat, value)
            cut_dataset = splitdataset(dataset, bestFeat, value)
            for vec in cut_testset:
                ans.append(vec[-1])
            result_counter = Counter()
            for vec in cut_dataset:
                result_counter[vec[-1]] += 1
            leaf_output = result_counter.most_common(1)[0][0]
            outputs += [leaf_output] * len(cut_testset)
        cut_acc = cal_acc(test_output=outputs, label=ans)

        # if accuracy improves after splitting on this feature, keep splitting;
        # otherwise turn this node into a leaf and return the majority class
        if cut_acc <= root_acc:
            return leaf_output

    # main body of the tree construction (recursive)
    # after branching on the chosen feature, build a subtree on each subset
    for value in uniqueVals:
        subLabels = labels[:]
        ID3Tree[bestFeatLabel][value] = ID3_createTree(
            splitdataset(dataset, bestFeat, value),
            subLabels,
            splitdataset(test_dataset, bestFeat, value))

    # post-pruning: after the subtree is built, check whether collapsing it into a leaf helps
    if post_pruning:
        # the hard-coded featLabels are the feature names of the loan dataset
        # (age group, has a job, owns a house, credit rating)
        tree_output = classifytest(ID3Tree,
                                   featLabels=['年龄段', '有工作', '有自己的房子', '信贷情况'],
                                   testDataSet=test_dataset)
        ans = []
        for vec in test_dataset:
            ans.append(vec[-1])
        root_acc = cal_acc(tree_output, ans)
        result_counter = Counter()
        for vec in dataset:
            result_counter[vec[-1]] += 1
        leaf_output = result_counter.most_common(1)[0][0]
        # accuracy if this whole subtree were replaced by a single majority-class leaf
        cut_acc = cal_acc([leaf_output] * len(test_dataset), ans)

        if cut_acc >= root_acc:
            return leaf_output

    return ID3Tree
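The function above also relies on a few pieces not shown in this excerpt: the module-level pre_pruning / post_pruning switches, Counter from collections, and a majorityCnt helper. A minimal sketch of how it might be driven, with both pruning switches off and made-up data (the names and rows here are assumptions, not the original loan dataset):

from collections import Counter

pre_pruning, post_pruning = False, False  # assumed module-level switches

def majorityCnt(classList):
    # assumed helper: return the most frequent class label
    return Counter(classList).most_common(1)[0][0]

train = [['youth', 'no',  'no'],
         ['youth', 'yes', 'yes'],
         ['old',   'yes', 'yes'],
         ['old',   'no',  'no']]
test = [['youth', 'no', 'no'],
        ['old', 'yes', 'yes']]
tree = ID3_createTree(train, ['age', 'has_job'], test)
print(tree)  # e.g. {'has_job': {'no': 'no', 'yes': 'yes'}}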
Helper functions
# Feed a single sample through the constructed decision tree and get its predicted label
def classify(inputTree, featLabels, testVec):
    """
    Input: decision tree, feature names, one test sample
    Output: predicted class
    Description: walk the decision tree
    """
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    classLabel = '0'
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if type(secondDict[key]).__name__ == 'dict':
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel

# Same as classify, but for a whole test set
def classifytest(inputTree, featLabels, testDataSet):
    """
    Input: decision tree, feature names, test dataset
    Output: predicted classes
    Description: walk the decision tree for every sample
    """
    classLabelAll = []
    for testVec in testDataSet:
        classLabelAll.append(classify(inputTree, featLabels, testVec))
    return classLabelAll


# Accuracy between the predicted labels and the true labels of the test data
def cal_acc(test_output, label):
    """
    :param test_output: predicted labels for the test set
    :param label: the true labels
    :return: the accuracy
    """
    assert len(test_output) == len(label)
    count = 0
    for index in range(len(test_output)):
        if test_output[index] == label[index]:
            count += 1

    return float(count / len(test_output))
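Continuing the toy sketch above (the tree and test variables are the assumed ones from that example), the helpers can then score the tree on the held-out rows:

preds = classifytest(tree, featLabels=['age', 'has_job'], testDataSet=test)
truth = [row[-1] for row in test]
print(cal_acc(preds, truth))  # 1.0 on the toy data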

Interpretation of the C4.5 code:
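C4.5 replaces the information gain criterion with the gain ratio; the quantities computed in the loop below are the standard ones:

$$\mathrm{Gain\_ratio}(D, a) = \frac{\mathrm{Gain}(D, a)}{\mathrm{IV}(a)}, \qquad \mathrm{IV}(a) = -\sum_{v=1}^{V} \frac{|D^v|}{|D|}\log_2\frac{|D^v|}{|D|}$$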

# C4.5 algorithm: choose the split attribute by "gain ratio"; otherwise the implementation mirrors ID3
def C45_chooseBestFeatureToSplit(dataset):
    numFeatures = len(dataset[0]) - 1
    baseEnt = cal_entropy(dataset)
    bestInfoGain_ratio = 0.0
    bestFeature = -1
    for i in range(numFeatures):  # iterate over all features
        featList = [example[i] for example in dataset]
        uniqueVals = set(featList)  # unique values of the feature
        newEnt = 0.0
        IV = 0.0
        for value in uniqueVals:  # entropy of each split
            subdataset = splitdataset(dataset, i, value)
            p = len(subdataset) / float(len(dataset))
            newEnt += p * cal_entropy(subdataset)
            IV = IV - p * log(p, 2)
        infoGain = baseEnt - newEnt
        if (IV == 0):  # guard against division by zero when the feature takes a single value
            continue

        # the one extra step compared with ID3: divide the gain by the intrinsic value
        infoGain_ratio = infoGain / IV  # gain ratio of this feature
        print(u"C4.5: gain ratio of feature %d is %.3f" % (i, infoGain_ratio))
        if (infoGain_ratio > bestInfoGain_ratio):  # keep the largest gain ratio
            bestInfoGain_ratio = infoGain_ratio
            bestFeature = i  # feature with the largest gain ratio
    return bestFeature
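On the same made-up toy rows used earlier, C4.5 picks the same feature, since feature 1 still dominates after dividing by its intrinsic value:

print(C45_chooseBestFeatureToSplit(toy))  # -> 1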

Interpretation of the CART code:

The formula for calculating the Gini value, and the weighted Gini index used to choose the split feature, is:

$$\mathrm{Gini}(D) = 1 - \sum_{k=1}^{|\mathcal{Y}|} p_k^2, \qquad \mathrm{Gini\_index}(D, a) = \sum_{v=1}^{V} \frac{|D^v|}{|D|}\,\mathrm{Gini}(D^v)$$

# CART algorithm: pick the best feature by the weighted Gini index
# Note: this implementation assumes a binary task whose class labels are the strings '0' and '1'
def CART_chooseBestFeatureToSplit(dataset):
    numFeatures = len(dataset[0]) - 1
    bestGini = 999999.0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataset]
        uniqueVals = set(featList)
        gini = 0.0
        for value in uniqueVals:
            subdataset = splitdataset(dataset, i, value)
            p = len(subdataset) / float(len(dataset))
            # fraction of class-'0' samples in this subset
            subp = len(splitdataset(subdataset, -1, '0')) / float(len(subdataset))
            # accumulate the weighted Gini over all values of this feature (must stay inside the loop)
            gini += p * (1.0 - pow(subp, 2) - pow(1 - subp, 2))
        print(u"CART: Gini index of feature %d is %.3f" % (i, gini))
        if (gini < bestGini):
            bestGini = gini
            bestFeature = i
    return bestFeature
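Because the implementation counts class-'0' samples via splitdataset(subdataset, -1, '0'), it needs the class labels to literally be the strings '0' and '1'. A toy call under that assumption (rows invented for illustration):

toy01 = [['youth', 'no',  '0'],
         ['youth', 'yes', '1'],
         ['old',   'yes', '1'],
         ['old',   'no',  '0']]
print(CART_chooseBestFeatureToSplit(toy01))  # -> 1: splitting on feature 1 gives a weighted Gini of 0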


Original post: juejin.im/post/7085631482022068231