# 版权声明:本文为博主原创文章,能帮到你很开心(^_^)~,欢迎转载,转载请注明出处~【博客园新博客地址,欢迎大家来踩:https://www.cnblogs.com/GrPhoenix/】 https://blog.csdn.net/qq_36396104/article/details/84574680
# 老师给的题目:
# 代码实现【两种算法合在一个文件里】:
from numpy import *
def createDataSet():
    """Return the toy 'activity' data set and its attribute names.

    Each row is [weather, temperature, humidity, wind speed, class]; the
    last column ('yes'/'no') says whether the activity takes place.
    """
    samples = [
        [ 1,  1, 1, 0, 'no'],
        [ 1,  1, 1, 1, 'no'],
        [ 0,  1, 1, 0, 'yes'],
        [-1,  0, 1, 0, 'yes'],
        [-1, -1, 0, 0, 'yes'],
        [-1, -1, 0, 1, 'no'],
        [ 0, -1, 0, 1, 'yes'],
        [ 1,  0, 1, 0, 'no'],
        [ 1, -1, 0, 0, 'yes'],
        [-1,  0, 0, 0, 'yes'],
        [ 1,  0, 0, 1, 'yes'],
        [ 0,  0, 1, 1, 'yes'],
        [ 0,  1, 0, 0, 'yes'],
        [-1,  0, 1, 1, 'no'],
    ]
    featureNames = ['weather', 'temperature', 'humidity', 'wind speed', 'activity']
    return samples, featureNames
def calcEntropy(dataSet):
    """Shannon entropy of the class labels (last column) of dataSet."""
    total = len(dataSet)
    counts = {}
    for row in dataSet:
        counts[row[-1]] = counts.get(row[-1], 0) + 1
    # H = -sum(p * log2(p)) over the class distribution
    return -sum((c / total) * log2(c / total) for c in counts.values())
def calcEntropyForFeature(featureList):
    """Shannon entropy of a flat list of feature values (C4.5 split info)."""
    total = len(featureList)
    counts = {}
    for value in featureList:
        counts[value] = counts.get(value, 0) + 1
    # H = -sum(p * log2(p)) over the value distribution of the feature
    return -sum((c / total) * log2(c / total) for c in counts.values())
def chooseBestFeatureID3(dataSet, labels):
    """Pick the index of the feature with the largest information gain (ID3).

    dataSet -- rows of feature values with the class label in the LAST column
    labels  -- attribute names; the last entry names the class column
    Returns the column index (0-based) of the best splitting feature.

    Bug fixed: the original iterated range(len(labels)), which includes the
    class column itself. Splitting on the class always yields pure subsets
    (maximal gain), so the "best feature" was the label — and classify()
    later indexed past the end of a 4-value test vector. Only the
    len(dataSet[0]) - 1 feature columns are candidates.
    """
    bestFeature = 0
    initialEntropy = calcEntropy(dataSet)
    biggestEntropyG = 0
    totalN = len(dataSet)
    numFeatures = len(dataSet[0]) - 1  # exclude the class column
    for i in range(numFeatures):
        currentEntropy = 0
        subSet = splitDataSetByFeature(i, dataSet)
        # Weighted average entropy of the partitions induced by feature i
        for key in subSet:
            prob = len(subSet[key]) / totalN
            currentEntropy += prob * calcEntropy(subSet[key])
        entropyGain = initialEntropy - currentEntropy
        if biggestEntropyG < entropyGain:
            biggestEntropyG = entropyGain
            bestFeature = i
    return bestFeature
def chooseBestFeatureC45(dataSet, labels):
    """Pick the index of the feature with the largest gain ratio (C4.5).

    dataSet -- rows of feature values with the class label in the LAST column
    labels  -- attribute names; the last entry names the class column
    Returns the column index (0-based) of the best splitting feature.

    Bugs fixed: (1) the original iterated range(len(labels)), which includes
    the class column itself — splitting on the class maximizes gain, so the
    label was always chosen; (2) a feature with a single distinct value has
    split info 0 and caused ZeroDivisionError — such features cannot split
    the data and are skipped.
    """
    bestFeature = 0
    initialEntropy = calcEntropy(dataSet)
    biggestEntropyGR = 0
    totalN = len(dataSet)
    numFeatures = len(dataSet[0]) - 1  # exclude the class column
    for i in range(numFeatures):
        feature = [data[i] for data in dataSet]
        entropyFeature = calcEntropyForFeature(feature)
        if entropyFeature == 0:
            continue  # single-valued feature: no split, gain ratio undefined
        currentEntropy = 0
        subSet = splitDataSetByFeature(i, dataSet)
        # Weighted average entropy of the partitions induced by feature i
        for key in subSet:
            prob = len(subSet[key]) / totalN
            currentEntropy += prob * calcEntropy(subSet[key])
        entropyGain = initialEntropy - currentEntropy
        entropyGainRatio = entropyGain / entropyFeature
        if biggestEntropyGR < entropyGainRatio:
            biggestEntropyGR = entropyGainRatio
            bestFeature = i
    return bestFeature
def splitDataSetByFeature(i, dataSet):
    """Partition dataSet by the values in column i, dropping that column.

    Returns {feature_value: [rows with column i removed]}.
    """
    partitions = {}
    for row in dataSet:
        reduced = row[:i] + row[i + 1:]  # row without column i
        partitions.setdefault(row[i], []).append(reduced)
    return partitions
def checkIsOneCateg(newDataSet):
    """Return True when every row of newDataSet carries the same class label."""
    return len({row[-1] for row in newDataSet}) == 1
def majorityCateg(newDataSet):
    """Return the most frequent class label in newDataSet.

    Ties go to the label encountered first, matching a stable descending sort.
    """
    counts = {}
    for row in newDataSet:
        counts[row[-1]] = counts.get(row[-1], 0) + 1
    return max(counts, key=counts.get)
def createDecisionTreeID3(decisionTree, dataSet, tmplabels):
    """Recursively build an ID3 tree in place inside decisionTree (a dict).

    tmplabels is copied so the caller's label list is never mutated.
    Leaves are class labels; internal nodes are {feature: {value: subtree}}.
    """
    labels = list(tmplabels)
    bestFeature = chooseBestFeatureID3(dataSet, labels)
    currentLabel = labels[bestFeature]
    decisionTree[currentLabel] = {}
    subSet = splitDataSetByFeature(bestFeature, dataSet)
    del labels[bestFeature]
    for key, newDataSet in subSet.items():
        if checkIsOneCateg(newDataSet):
            # Pure partition: emit a leaf with the shared class label.
            decisionTree[currentLabel][key] = newDataSet[0][-1]
        elif len(newDataSet[0]) == 1:
            # No feature columns left: vote the majority class.
            decisionTree[currentLabel][key] = majorityCateg(newDataSet)
        else:
            branch = {}
            decisionTree[currentLabel][key] = branch
            createDecisionTreeID3(branch, newDataSet, labels)
def createDecisionTreeC45(decisionTree, dataSet, tmplabels):
    """Recursively build a C4.5 tree in place inside decisionTree (a dict).

    tmplabels is copied so the caller's label list is never mutated.
    Leaves are class labels; internal nodes are {feature: {value: subtree}}.
    """
    labels = list(tmplabels)
    bestFeature = chooseBestFeatureC45(dataSet, labels)
    currentLabel = labels[bestFeature]
    decisionTree[currentLabel] = {}
    subSet = splitDataSetByFeature(bestFeature, dataSet)
    del labels[bestFeature]
    for key, newDataSet in subSet.items():
        if checkIsOneCateg(newDataSet):
            # Pure partition: emit a leaf with the shared class label.
            decisionTree[currentLabel][key] = newDataSet[0][-1]
        elif len(newDataSet[0]) == 1:
            # No feature columns left: vote the majority class.
            decisionTree[currentLabel][key] = majorityCateg(newDataSet)
        else:
            branch = {}
            decisionTree[currentLabel][key] = branch
            createDecisionTreeC45(branch, newDataSet, labels)
def classify(inputTree, featLabels, testVec):
    """Classify testVec by walking inputTree.

    inputTree  -- {feature_name: {feature_value: subtree_or_label}}
    featLabels -- attribute names used to map feature names to testVec indices
    testVec    -- list of feature values, aligned with featLabels

    Returns the leaf class label, or None when testVec carries a feature
    value with no matching branch in the tree.

    Bug fixed: the original left classLabel unassigned when no branch
    matched, raising UnboundLocalError on return.
    """
    firstStr = next(iter(inputTree))          # root node's feature name
    secondDict = inputTree[firstStr]          # {feature_value: subtree_or_label}
    featIndex = featLabels.index(firstStr)    # position of that feature in testVec
    classLabel = None                         # default: unseen feature value
    for key in secondDict:
        if testVec[featIndex] == key:
            if isinstance(secondDict[key], dict):
                # Internal node: keep descending.
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                # Leaf: this is the class label.
                classLabel = secondDict[key]
    return classLabel
if __name__ == '__main__':
    # ---- ID3: build the tree, then re-classify every training sample ----
    dataSetID3, labelsID3 = createDataSet()
    testData1 = [1, 1, 1, 0]
    testData2 = [1, -1, 0, 0]
    bestFeatureID3 = chooseBestFeatureID3(dataSetID3, labelsID3)
    decisionTreeID3 = {}
    createDecisionTreeID3(decisionTreeID3, dataSetID3, labelsID3)
    print("ID3 decision tree: ", decisionTreeID3)
    for sample in dataSetID3:
        predicted = classify(decisionTreeID3, labelsID3, sample[0:4])
        print(sample[0:4], ", classified as by ID3: ", predicted)

    # ---- C4.5: same drill with the gain-ratio criterion ----
    dataSetC45, labelsC45 = createDataSet()
    bestFeatureC45 = chooseBestFeatureC45(dataSetC45, labelsC45)
    decisionTreeC45 = {}
    createDecisionTreeC45(decisionTreeC45, dataSetC45, labelsC45)
    print("C4.5 decision tree: ", decisionTreeC45)
    for sample in dataSetC45:
        predicted = classify(decisionTreeC45, labelsC45, sample[0:4])
        print(sample[0:4], ", classified as by C4.5: ", predicted)