# C4.5 decision tree algorithm implemented in Python
import numpy as np
import math
def createDataSet():
    """Build the small hard-coded sample data set used by the demo.

    Returns:
        A ``(dataSet, labels)`` tuple. ``dataSet`` is a list of seven rows,
        each a list of four integer-coded feature values followed by the
        class label (``"Y"`` or ``"N"``). ``labels`` lists the names of the
        four feature columns in column order.
    """
    rows = (
        (0, 0, 0, 0, "N"),
        (0, 0, 0, 1, "N"),
        (1, 0, 0, 0, "Y"),
        (2, 1, 0, 0, "Y"),
        (2, 2, 1, 0, "Y"),
        (2, 2, 1, 1, "N"),
        (1, 2, 1, 1, "Y"),
    )
    feature_names = ["outlook", "temperature", "humidity", "windy"]
    # Every call hands out freshly built list objects.
    return [list(row) for row in rows], feature_names
def calcShannonEnt(dataSet):
    """Return the base-2 Shannon entropy of the class labels in *dataSet*.

    The class label of a row is its last element. An empty data set has no
    labels to sum over and therefore yields ``0.0``.

    :param dataSet: sequence of rows whose last element is the class label
    :return: entropy of the class distribution, as a float
    """
    total = len(dataSet)

    # Tally how often each class label occurs.
    tally = {}
    for row in dataSet:
        cls = row[-1]
        tally[cls] = tally.get(cls, 0) + 1

    entropy = 0.0
    for count in tally.values():
        p = float(count) / total
        entropy -= p * math.log(p, 2)
    return entropy
def splitDataSet(dataSet, axis, value):
    '''
    Select the rows that hold a given value in a given feature column.

    :param dataSet: the data to filter (rows are expected to be lists)
    :param axis: index of the feature column to test
    :param value: the feature value a row must have in that column
    :return: new list of the matching rows, each with the tested column
             removed; the input rows are left unmodified
    '''
    return [
        row[:axis] + row[axis + 1:]
        for row in dataSet
        if row[axis] == value
    ]
def chooseBestFeatureToSplit(dataSet):
    """Pick the feature column with the highest information gain ratio.

    For every feature column (all columns except the last, which is the
    class label) the data is partitioned by feature value. The gain ratio is
    the information gain of that partition divided by its split information.

    :param dataSet: non-empty list of rows; the last element of each row is
                    the class label
    :return: index of the best feature column, or ``-1`` when no feature has
             a gain ratio strictly greater than zero
    """
    row_count = len(dataSet)
    feature_count = len(dataSet[0]) - 1
    parent_entropy = calcShannonEnt(dataSet)

    winner = -1
    winner_ratio = 0.0
    for column in range(feature_count):
        column_values = [row[column] for row in dataSet]
        conditional_entropy = 0.0
        split_info = 0.0
        for val in set(column_values):
            part = splitDataSet(dataSet, column, val)
            weight = float(len(part)) / row_count
            conditional_entropy += weight * calcShannonEnt(part)
            # An empty partition contributes nothing and log(0) is undefined.
            if weight != 0:
                split_info -= weight * math.log(weight, 2)

        # A feature with zero split information cannot be scored
        # (the ratio would divide by zero), so it is skipped.
        if split_info == 0:
            continue

        ratio = (parent_entropy - conditional_entropy) / split_info
        if ratio > winner_ratio:
            winner, winner_ratio = column, ratio
    return winner
def majorityCnt(classList):
    """Return the class label that occurs most often in *classList*.

    The original sort key indexed the builtin ``dict`` type instead of the
    local count dictionary, so the function failed for any input holding
    more than one distinct class. The key now reads the real counts.

    :param classList: non-empty sequence of hashable class labels
    :return: the most frequent label; on a tie, the tied label that first
             appeared latest in *classList* wins (stable ascending sort,
             last element taken)
    :raises IndexError: if *classList* is empty
    """
    counts = {}
    for label in classList:
        counts[label] = counts.get(label, 0) + 1
    # sorted() is stable, so equal counts keep first-seen order and the
    # last element is the most frequent label.
    return sorted(counts, key=counts.get)[-1]
def createTree(dataSet, labels):
    """Recursively build a C4.5 decision tree as nested dictionaries.

    :param dataSet: non-empty list of rows; each row holds the feature
                    values followed by the class label as its last element
    :param labels: feature names, one per feature column of *dataSet*, in
                   column order; this list is NOT modified
    :return: a class label for a leaf, otherwise a dict of the form
             ``{feature_name: {feature_value: subtree_or_class_label}}``
    """
    classList = [row[-1] for row in dataSet]

    # All rows share one class: this node is a pure leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the class column is left: no feature remains to split on.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)

    bestFeature = chooseBestFeatureToSplit(dataSet)
    if bestFeature < 0:
        # No feature offers a positive gain ratio. Using -1 as an index
        # would silently split on the class column under the last feature's
        # name, so fall back to a majority-vote leaf instead.
        return majorityCnt(classList)

    bestFeatureLabel = labels[bestFeature]
    # Build the reduced label list without mutating the caller's list
    # (the original code used ``del labels[bestFeature]``).
    remainingLabels = labels[:bestFeature] + labels[bestFeature + 1:]

    branches = {}
    for value in set(row[bestFeature] for row in dataSet):
        subset = splitDataSet(dataSet, bestFeature, value)
        branches[value] = createTree(subset, remainingLabels)
    return {bestFeatureLabel: branches}
if __name__ == '__main__':
    # Demo entry point: build the sample data, grow the tree, print it.
    samples, feature_names = createDataSet()
    decision_tree = createTree(samples, feature_names)
    print(decision_tree)