Decision Tree ID3 and C4.5 Algorithm Implementation

Copyright notice: this is an original article by the blog author and may not be reposted without permission. https://blog.csdn.net/caicaiatnbu/article/details/82148269
  • Prerequisites:

     Basic principles of decision trees; this article follows Dr. Li Hang's book Statistical Learning Methods

     The concept of entropy

     Information gain

     Information gain ratio: corrects the bias of information gain toward features with many distinct values when splitting the training data (see the definitions just below)
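
     For reference, these are the standard definitions used above (in the book's notation, where dataset D has classes C_k and feature A splits D into subsets D_1, ..., D_n):

     H(D) = -\sum_{k=1}^{K} \frac{|C_k|}{|D|} \log_2 \frac{|C_k|}{|D|}

     g(D, A) = H(D) - H(D \mid A), \qquad H(D \mid A) = \sum_{i=1}^{n} \frac{|D_i|}{|D|} H(D_i)

     g_R(D, A) = \frac{g(D, A)}{H_A(D)}, \qquad H_A(D) = -\sum_{i=1}^{n} \frac{|D_i|}{|D|} \log_2 \frac{|D_i|}{|D|}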

  • Two key questions when building a decision tree from the training set:
  1. When creating a new node, how do we choose the feature and the corresponding split condition?   ID3 / C4.5
  2. When do we stop creating new nodes? In other words, which nodes count as leaf nodes? When all labels are identical, or when no features are left to split on
  • Advantages of decision trees:
  1. Input data does not need to be normalized, so relatively little work is needed in the data-cleaning stage;
  2. Insensitive to missing values, and can handle irrelevant features;
  3. Efficient and fast;
  • Disadvantages of decision trees:
  1. Continuous features usually have to be discretized;
  2. Does not perform very well on data with strongly correlated features;
  • Follow-ups
  1. This article does not discuss decision tree pruning; to be covered in a later update
  2. This article does not cover the classification and regression tree (CART) model; to be covered in a later update
  • Code below:

--------- adapted with reference to wepo's implementation ---------

The dataset is the loan-application sample data from Table 5.1, Chapter 5 of Statistical Learning Methods.

Age: 1 -> young    2 -> middle-aged    3 -> elderly

Has a job:  0 -> no   1 -> yes

Owns a house:   0 -> no   1 -> yes

Credit rating: 1 -> fair    2 -> good    3 -> excellent

Class: yes -> loan approved     no -> loan rejected
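
With 9 approved and 6 rejected samples, the empirical entropy of the whole dataset is H(D) = -(9/15)\log_2(9/15) - (6/15)\log_2(6/15) \approx 0.971. Per Example 5.2 of the book, the information gains of the four features are approximately 0.083 (age), 0.324 (job), 0.420 (house) and 0.363 (credit), so the root split should be on "owns a house". These numbers are a handy sanity check for the code below.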

Code:
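
# id3_c45.py  (file name inferred from the import in main.py below)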

import numpy as np

class decisionTree:
    """
    使用方法: clf = decisioTree(), 参数mode可选ID3 or C4.5
    - 训练,调用fit方法: clf.fit(X, y) X,y均为np.ndarray类型
    - 预测,调用predict方法:clf.predict(x) X为np.ndarray
    - 可视化决策树结构,调用show()方法
    """
    def __init__(self, mode = 'ID3'):
        self.tree = None

        if mode == 'ID3' or mode == 'C4.5':
            self.mode = mode
        else:
            raise Exception('mode should be C4.5 or ID3')

    def calcEntropy(self, y):
        """
        :param y: 数据集的标签
        :return: 熵值
        """
        num = y.shape[0]
        # count the occurrences of each label in y, stored in the dict labelCounts
        labelCounts = {}
        for label in y:
            if label not in labelCounts.keys():
                labelCounts[label] = 0
            labelCounts[label] += 1

        # compute the entropy
        entropy = 0.
        for key in labelCounts:
            prob = 1. * labelCounts[key] / num
            entropy = entropy - prob * np.log2(prob)
        return entropy

    def splitDataSet(self, X, y, index, value):
        """
        :param X:
        :param y:
        :param index:
        :param value:
        :return: 返回数据集中特征下标为index,特征值等于value的子数据集
        """
        ret = []
        featVec = X[:, index]
        X = X[:, [i for i in range(X.shape[1]) if i != index]]

        for i in range(len(featVec)):
            if featVec[i] == value:
                ret.append(i)
        return X[ret, :], y[ret]

    def chooseBestFeatureToSplit_ID3(self, X, y):
        """
        :param X:
        :param y:
        :return:
        """
        # number of features
        numFeatures = X.shape[1]
        # entropy of the original dataset
        oldEntropy = self.calcEntropy(y)
        # largest information gain seen so far
        bestInfoGain = 0.
        # index of the feature that achieves the largest information gain
        bestFeatureIndex = -1

        # compute the information gain of every feature and keep the largest in bestInfoGain
        for i in range(numFeatures):
            featList = X[:, i]
            uniqueVals = set(featList)
            newEntropy = 0.
            # for each value of the i-th feature, take the corresponding subset and compute its entropy,
            # then accumulate the conditional entropy newEntropy obtained by splitting on feature i

            for value in uniqueVals:
                sub_X, sub_y = self.splitDataSet(X, y, i, value)
                prob = 1. * len(sub_y) / len(y)
                newEntropy = newEntropy + prob * self.calcEntropy(sub_y)

            # compute the information gain and select the best splitting feature accordingly
            infoGain = oldEntropy - newEntropy
            if infoGain > bestInfoGain:
                bestInfoGain = infoGain
                bestFeatureIndex = i
        return bestFeatureIndex

    def chooseBestFeatureToSplit_C45(self, X, y):
        """
        C4.5: choose the splitting feature with the largest information gain ratio.
        :return: index of the best splitting feature
        """
        numFeatures = X.shape[1]
        oldEntropy = self.calcEntropy(y)
        bestGainRatio = 0.
        bestFeatureIndex = -1
        # compute the information gain ratio of each feature
        for i in range(numFeatures):
            featList = X[:, i]
            uniqueVals = set(featList)
            newEntropy = 0.
            splitInformation = 0.

            for value in uniqueVals:
                sub_X, sub_y = self.splitDataSet(X, y, i, value)
                prob = len(sub_y) * 1. / len(y)
                newEntropy = newEntropy + prob * self.calcEntropy(sub_y)
                splitInformation -= prob * np.log2(prob)

            # skip features whose values are all identical (split information is 0)
            if splitInformation == 0.0:
                continue
            infoGain = oldEntropy - newEntropy
            gainRatio = infoGain / splitInformation
            if gainRatio > bestGainRatio:
                bestGainRatio = gainRatio
                bestFeatureIndex = i
        return bestFeatureIndex

    def majority(self, labelList):
        """
        :param labelList:
        :return: 返回labelList中出现次数最多的label
        """
        labelCount = {}
        for vote in labelList:
            if vote not in labelCount.keys():
                labelCount[vote] = 0
            labelCount[vote] += 1
        sortedClassCount = sorted(labelCount.items(), key=lambda x: x[1], reverse=True)
        return sortedClassCount[0][0]

    def createTree(self, X, y, featureIndex):
        """Recursively build the tree; featureIndex holds the names of the remaining features."""
        labelList = list(y)
        # if all labels are identical, stop splitting and return that label
        if labelList.count(labelList[0]) == len(labelList):
            return labelList[0]

        # if no features are left to split on, stop and return the most frequent label
        if len(featureIndex) == 0:
            return self.majority(labelList)

        # otherwise, choose the best splitting feature
        if self.mode == 'ID3':
            bestFeatIndex = self.chooseBestFeatureToSplit_ID3(X, y)
        else:
            bestFeatIndex = self.chooseBestFeatureToSplit_C45(X, y)

        bestFeatStr = featureIndex[bestFeatIndex]
        featureIndex = list(featureIndex)
        featureIndex.remove(bestFeatStr)
        featureIndex  = tuple(featureIndex)
        # the tree is stored as a dict: the best splitting feature is the key, and its value is again a (sub)tree stored as a dict

        myTree = {bestFeatStr:{}}
        featValues = X[:, bestFeatIndex]
        uniqueVals = set(featValues)
        for value in uniqueVals:
            # recursively build a subtree for each value
            sub_X, sub_y = self.splitDataSet(X, y, bestFeatIndex, value)
            myTree[bestFeatStr][value] = self.createTree(sub_X, sub_y, featureIndex)
        return myTree

    def fit(self, X, y):
        """Build the decision tree from training features X and labels y."""
        if not (isinstance(X, np.ndarray) and isinstance(y, np.ndarray)):
            try:
                X = np.array(X)
                y = np.array(y)
            except Exception:
                raise TypeError("numpy.ndarray required for X, y")

        featureIndex = tuple(['x' + str(i) for i in range(X.shape[1])])
        #featureIndex = tuple(['Age', 'Job', 'House', 'Credit'])
        print(featureIndex)
        self.tree = self.createTree(X, y, featureIndex)
        return self

    def predict(self, X):
        if self.tree is None:
            raise NotFittedError("Estimator not fitted, call 'fit' first")

        if not isinstance(X, np.ndarray):
            try:
                X = np.array(X)
            except Exception:
                raise TypeError("numpy.ndarray required for X")

        def classify(tree, sample):
            featSides = list(tree.keys())
            featIndex = featSides[0]
            secondDict = tree[featIndex]
            # feature names have the form 'x<i>'; strip the leading 'x' to index the sample
            key = sample[int(featIndex[1:])]
            valueOfkey = secondDict[key]


            if isinstance(valueOfkey, dict):
                label = classify(valueOfkey, sample)
            else:
                label = valueOfkey
            return label

        if len(X.shape) == 1:
            return classify(self.tree, X)
        else:
            results = []
            for i in range(X.shape[0]):
                results.append(classify(self.tree, X[i]))
            return np.array(results)

    def show(self):
        if self.tree is None:
            raise NotFittedError("Estimator not fitted, call 'fit' first")

        import treePlotter
        treePlotter.createPlot(self.tree)


class NotFittedError(Exception):
    """Raised when predict() or show() is called before fit()."""
    pass
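
As a quick sanity check (a hypothetical snippet, not part of the original post), calcEntropy should reproduce the empirical entropy of about 0.971 computed for the loan data above:

# check_entropy.py -- hypothetical sanity check; assumes the class above is saved as id3_c45.py
import numpy as np
from id3_c45 import decisionTree

y = np.array(['no'] * 6 + ['yes'] * 9)   # 6 rejected and 9 approved applications, as in Table 5.1
clf = decisionTree(mode='ID3')
print(clf.calcEntropy(y))                # expected output: roughly 0.971
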
# treePlotter.py
'''
Created on Oct 14, 2010

@author: Peter Harrington

From the book <<Machine learning in action>>
'''
import matplotlib.pyplot as plt

decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")

def getNumLeafs(myTree):
    numLeafs = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if isinstance(secondDict[key], dict):  # test whether the child is a dict (subtree); otherwise it is a leaf node
            numLeafs += getNumLeafs(secondDict[key])
        else:   numLeafs +=1
    return numLeafs

def getTreeDepth(myTree):
    maxDepth = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if isinstance(secondDict[key], dict):  # test whether the child is a dict (subtree); otherwise it is a leaf node
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:   thisDepth = 1
        if thisDepth > maxDepth: maxDepth = thisDepth
    return maxDepth

def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    createPlot.ax1.annotate(nodeTxt, xy=parentPt,  xycoords='axes fraction',
             xytext=centerPt, textcoords='axes fraction',
             va="center", ha="center", bbox=nodeType, arrowprops=arrow_args )
    
def plotMidText(cntrPt, parentPt, txtString):
    xMid = (parentPt[0]-cntrPt[0])/2.0 + cntrPt[0]
    yMid = (parentPt[1]-cntrPt[1])/2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString, va="center", ha="center", rotation=30)

def plotTree(myTree, parentPt, nodeTxt):  # the first key tells you which feature was split on
    numLeafs = getNumLeafs(myTree)  # this determines the x width of this tree
    #depth = getTreeDepth(myTree)
    firstStr = list(myTree.keys())[0]     # the text label for this node should be this
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs))/2.0/plotTree.totalW, plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0/plotTree.totalD
    for key in secondDict.keys():
        if isinstance(secondDict[key], dict):  # if the child is a dict it is a subtree, so recurse
            plotTree(secondDict[key],cntrPt,str(key))        #recursion
        else:   #it's a leaf node print the leaf node
            plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0/plotTree.totalD
#if you do get a dictionary you know it's a tree, and the first element will be another dict

def createPlot(inTree):
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)    #no ticks
    #createPlot.ax1 = plt.subplot(111, frameon=False) #ticks for demo purposes
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5,1.0), '')
    plt.show()




# main.py
from id3_c45 import decisionTree

if __name__ == '__main__':
    # Toy data
    X = [[1, 0, 0, 1],
         [1, 0, 0, 2],
         [1, 1, 0, 2],
         [1, 1, 1, 1],
         [1, 0, 0, 1],
         [2, 0, 0, 1],
         [2, 0, 0, 2],
         [2, 1, 1, 2],
         [2, 0, 1, 3],
         [2, 0, 1, 3],
         [3, 0, 1, 3],
         [3, 0, 1, 2],
         [3, 1, 0, 2],
         [3, 1, 0, 3],
         [3, 0, 0, 1]]
    y = ['no', 'no', 'yes', 'yes', 'no', 'no', 'no', 'yes', 'yes', 'yes',
         'yes', 'yes', 'yes', 'yes', 'no']

    clf = decisionTree(mode='ID3')
    clf.fit(X, y)
    clf.show()
    print(clf.predict(X))

    clf_ = decisionTree(mode='C4.5')
    clf_.fit(X, y)
    clf_.show()
    print(clf_.predict(X))
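
If everything works as in Example 5.3 of the book, both modes first split on x2 (owns a house) and then on x1 (has a job), so the learned tree is a nested dict equivalent to {'x2': {0: {'x1': {0: 'no', 1: 'yes'}}, 1: 'yes'}}, and both predict(X) calls should return the training labels exactly.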
