Python for Beginners: Machine Learning - the Decision Tree Algorithm

I. Finding the Best Feature to Split On

1. Compute each feature's information gain, i.e. the dataset's entropy difference before and after the split, and split the dataset
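For reference, the two quantities the code below computes are the Shannon entropy of a dataset D and the information gain of a feature A (this summary is mine, not part of the original post):

    H(D) = -Σ_k p_k · log2(p_k)
    Gain(D, A) = H(D) - Σ_v (|D_v| / |D|) · H(D_v)

where p_k is the fraction of rows belonging to class k, and D_v is the subset of rows on which feature A takes the value v.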

Code:

from math import log  # needed for log(prob, 2); the original snippet omits this import

def calcshannonEnt(dataSet):
    # Shannon entropy of the dataset; the class label sits in the last column of each row.
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries  # frequency of class `key`
        shannonEnt -= prob * log(prob, 2)            # H = -sum(p * log2(p))
    return shannonEnt
#   entropy of the dataset
def splitDataSet(dataSet, axis, value):
    # Keep the rows whose feature `axis` equals `value`, dropping that column.
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reduceFeatVec = featVec[:axis]            # everything before the split column
            reduceFeatVec.extend(featVec[axis + 1:])  # everything after it
            retDataSet.append(reduceFeatVec)
    return retDataSet
#   split the dataset
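A quick sanity check of splitDataSet on a tiny hand-made dataset (the sample data here is my own illustration, not from the original post):

myDat = [[1, 1, 'yes'],
         [1, 1, 'yes'],
         [1, 0, 'no'],
         [0, 1, 'no'],
         [0, 1, 'no']]  # two binary features, class label in the last column

print(splitDataSet(myDat, 0, 1))  # rows with feature 0 == 1, that column removed:
                                  # [[1, 'yes'], [1, 'yes'], [0, 'no']]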

def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1      # the last column is the class label, not a feature
    baseEntropy = calcshannonEnt(dataSet)  # entropy of the dataset before any split
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]  # column i across all rows
        uniqueVals = set(featList)  # a set holds no duplicates, so this is every possible value of feature i
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcshannonEnt(subDataSet)  # entropy after splitting on feature i: each subset's entropy weighted by its frequency
        infoGain = baseEntropy - newEntropy  # information gain of feature i
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
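On the same toy dataset, the function picks feature 0, whose information gain is the larger of the two (the numbers below apply to that illustrative data only):

print(calcshannonEnt(myDat))            # ≈ 0.971 for 2 'yes' vs 3 'no'
print(chooseBestFeatureToSplit(myDat))  # 0: feature 0's gain (≈0.42) beats feature 1's (≈0.17)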

II. Building the Tree
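The builder below falls back on a helper, majorityCnt, when the features are exhausted but a leaf is still impure. The original post never shows it; a minimal sketch consistent with that usage (return the most frequent class label) would be:

import operator

def majorityCnt(classList):
    # Tally each class label and return the one that occurs most often.
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]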

def createTree(dataSet, labels):  # build the decision tree recursively
    classList = [example[-1] for example in dataSet]  # the last column (class label) of every row
    if classList.count(classList[0]) == len(classList):  # every row has the same class: stop recursing and return it
        return classList[0]
    if len(dataSet[0]) == 1:  # all features used up, yet the data still isn't a single-class group
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)  # index of the best feature to split on
    bestFeatLabel = labels[bestFeat]  # name of that feature
    myTree = {bestFeatLabel: {}}  # the tree is stored as nested dictionaries
    del labels[bestFeat]  # remove the label that has just entered the tree
    featValues = [example[bestFeat] for example in dataSet]  # every value the best feature takes in the dataset
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy, so sibling branches don't share one mutated label list
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)  # grow a subtree under each feature value
    return myTree

def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree.keys())[0]    # root feature name; in Python 3, keys() is a view and must be wrapped in list() before indexing
    secondDict = inputTree[firstStr]        # the branches hanging off the root feature
    featIndex = featLabels.index(firstStr)  # where that feature sits in the test vector
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]
    if isinstance(valueOfFeat, dict):  # internal node: keep descending
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else:                              # leaf node: the predicted class
        classLabel = valueOfFeat
    return classLabel
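Tying it together on the toy dataset from above (the feature names are hypothetical, chosen only for this example):

labels = ['no surfacing', 'flippers']  # hypothetical names for the two feature columns
myTree = createTree(myDat, labels[:])  # pass a copy: createTree deletes entries from its labels argument
print(myTree)  # {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
print(classify(myTree, labels, [1, 0]))  # 'no'
print(classify(myTree, labels, [1, 1]))  # 'yes'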

Reprinted from blog.csdn.net/qq_40602790/article/details/81295975