决策树（C4.5 算法）的 Python 实现：按信息增益率选择划分特征。

import numpy as np
import math
def createDataSet():
    """Build the toy weather dataset.

    Returns:
        (dataSet, labels): ``dataSet`` is a list of rows, each holding four
        integer-coded feature values followed by the class label "Y"/"N";
        ``labels`` names the four feature columns in order.
    """
    dataSet = [
        [0, 0, 0, 0, "N"],
        [0, 0, 0, 1, "N"],
        [1, 0, 0, 0, "Y"],
        [2, 1, 0, 0, "Y"],
        [2, 2, 1, 0, "Y"],
        [2, 2, 1, 1, "N"],
        [1, 2, 1, 1, "Y"],
    ]
    labels = ["outlook", "temperature", "humidity", "windy"]
    return dataSet, labels
#给定数据计算熵
def calcShannonEnt(dataSet):
    """Compute the Shannon entropy of the class labels in ``dataSet``.

    :param dataSet: list of rows whose LAST element is the class label
    :return: entropy in bits, sum of -p*log2(p) over the label distribution
    """
    total = len(dataSet)
    # Tally how many rows carry each class label.
    labelCounts = {}
    for row in dataSet:
        label = row[-1]
        labelCounts[label] = labelCounts.get(label, 0) + 1
    # Accumulate -p*log2(p) over the observed label frequencies.
    entropy = 0.0
    for count in labelCounts.values():
        p = float(count) / total
        entropy -= p * math.log(p, 2)
    return entropy
#划分数据
def splitDataSet(dataSet, axis, value):
    """Filter ``dataSet`` to rows where feature ``axis`` equals ``value``.

    :param dataSet: list of feature rows (label last)
    :param axis: index of the feature to filter on
    :param value: required value of that feature
    :return: new rows matching the filter, with column ``axis`` removed
    """
    # Keep matching rows; drop the split column so it is not reused below.
    return [row[:axis] + row[axis + 1:]
            for row in dataSet
            if row[axis] == value]
#选择最大的信息增益率的特征
def chooseBestFeatureToSplit(dataSet):
    """Select the feature with the highest information gain ratio (C4.5).

    :param dataSet: list of rows, feature values first and class label last
    :return: index of the best feature, or -1 if no feature has a positive
             gain ratio (e.g. every candidate split is useless)

    Bug fix: the original ``return`` was indented inside the feature loop,
    so only feature 0 was ever evaluated.  The ``prob == 0`` guard is also
    dropped: every value comes from the data itself, so each subset is
    non-empty and ``prob`` is always > 0.
    """
    numFeatures = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)  # entropy before any split

    bestInfoGainRatio = 0.0
    bestFeature = -1

    for i in range(numFeatures):
        # Distinct values taken by feature i (e.g. {male, female}).
        uniqueVals = set(example[i] for example in dataSet)

        newEntropy = 0.0  # conditional entropy after splitting on feature i
        splitInfo = 0.0   # intrinsic entropy of feature i itself

        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = float(len(subDataSet)) / len(dataSet)
            newEntropy += prob * calcShannonEnt(subDataSet)
            splitInfo -= prob * math.log(prob, 2)

        infoGain = baseEntropy - newEntropy
        if splitInfo == 0:
            # Feature has a single value: gain ratio would divide by zero.
            continue
        infoGainRatio = infoGain / splitInfo
        if infoGainRatio > bestInfoGainRatio:
            bestInfoGainRatio = infoGainRatio
            bestFeature = i
    # Moved out of the loop so every feature is considered.
    return bestFeature
#统计下最多的类是哪个
def majorityCnt(classList):
    """Return the class label occurring most often in ``classList``.

    :param classList: non-empty list of class labels
    :return: the most frequent label

    Bug fix: the original sort key was ``lambda x: dict[x]`` — indexing the
    *builtin* ``dict`` type instead of the local counter, which fails at
    runtime.  The count dict is now used as the key.
    """
    classCounts = {}
    for label in classList:
        classCounts[label] = classCounts.get(label, 0) + 1
    # Pick the label with the largest count.
    return max(classCounts, key=classCounts.get)

    #构建树
def createTree(dataSet, labels):
    """Recursively build a C4.5 decision tree.

    :param dataSet: rows of feature values with the class label last
    :param labels: names of the feature columns still present in ``dataSet``
    :return: nested dict ``{featureName: {featureValue: subtree}}``, or a
             bare class label at a leaf

    Fixes over the original: it ``del``-ed entries from the caller's
    ``labels`` list (a surprising side effect) — this version never mutates
    its arguments; and it now falls back to the majority class when
    ``chooseBestFeatureToSplit`` finds no useful feature (returns -1),
    instead of splitting on an arbitrary column.
    """
    classList = [example[-1] for example in dataSet]  # class label per row
    # All samples share one class: return that label as a leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # No features left (only the label column remains): majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeature = chooseBestFeatureToSplit(dataSet)
    if bestFeature == -1:
        # No split yields a positive gain ratio; stop with the majority class.
        return majorityCnt(classList)
    bestFeatureLabel = labels[bestFeature]
    myTree = {bestFeatureLabel: {}}
    # Feature names for the subtrees, with the chosen column removed.
    # Built as a fresh list so the caller's ``labels`` is left untouched.
    subLabels = labels[:bestFeature] + labels[bestFeature + 1:]
    featValues = set(example[bestFeature] for example in dataSet)
    for value in featValues:
        # One branch per observed value of the chosen feature.
        myTree[bestFeatureLabel][value] = createTree(
            splitDataSet(dataSet, bestFeature, value), subLabels)
    return myTree
if __name__ == '__main__':
    # Build the sample weather dataset and print the learned decision tree.
    data, featureNames = createDataSet()
    tree = createTree(data, featureNames)
    print(tree)

转载自：https://blog.csdn.net/weixin_40642306/article/details/83411205