# NOTE: I originally debugged this code by following other people's instructions and never tidied it up, so it is somewhat messy.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import log
import math
import operator
import sys
# !pip install treePlotter
def createDataSet():
    """Return the toy sample set together with its feature names.

    Each sample is a list of two binary feature values followed by a class
    label ('yes' or 'no'). The second return value names the two feature
    columns, in column order.
    """
    feature_names = ['no surfacing', 'flippers']
    # Every row is its own list object, so callers may modify rows
    # independently of one another.
    positive_rows = [[1, 1, 'yes'] for _ in range(2)]
    negative_rows = [[1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
    return positive_rows + negative_rows, feature_names
# Build the toy dataset and echo it (notebook-style cell).
dataset,labels = createDataSet()
dataset,labels
# The tuple below appears to be the pasted notebook output of the cell above;
# as Python it is an expression statement whose value is discarded.
([[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']],
['no surfacing', 'flippers'])
# 计算信息熵,就是根据提供的数据集的标签计算信息熵
def calcShannonEnt(dataset):
    """Compute the Shannon entropy of the class labels in ``dataset``.

    The class label is the last element of every row. The natural logarithm
    is used, so the result is expressed in nats (not bits).

    Args:
        dataset: Sequence of rows; only the last element of each row is read.

    Returns:
        The entropy as a float. It is 0.0 for an empty dataset and when all
        rows share the same label.
    """
    num_entries = len(dataset)

    # Count how many rows carry each class label.
    label_counts = {}
    for feat_vec in dataset:
        current_label = feat_vec[-1]
        label_counts[current_label] = label_counts.get(current_label, 0) + 1

    shannon_ent = 0.0
    for count in label_counts.values():
        prob = count / num_entries
        # Natural log: equivalent to the former log(prob, math.exp(1)).
        shannon_ent -= prob * log(prob)
    return shannon_ent
# num_entries = len(dataset) # 样本数
# label_counts = {} # 创建一个字典:key是最后一列数值,即标签
# for feat_vec in dataset: # 遍历整个数据集,每次取一行
# current_label = feat_vec[-1] # 取该行最后一列的值
# if current_label not in label_counts.keys():
# label_counts[current_label] = 0
# label_counts[current_label] +=1
# print(feat_vec,label_counts)
"""
[1, 1, 'yes'] {'yes': 1}
[1, 1, 'yes'] {'yes': 2}
[1, 0, 'no'] {'yes': 2, 'no': 1}
[0, 1, 'no'] {'yes': 2, 'no': 2}
[0, 1, 'no'] {'yes': 2, 'no': 3}
"""
# shannonEnt = 0.0 # 初始化信息熵
# for key in label_counts:
# prob = float(label_counts[key])/num_entries
# print(prob)
# shannonEnt -= prob * log(prob,math.exp(1)) # log base e 计算信息熵
# print(shannonEnt)
"""
0.4
0.6
0.6730116670092565
"""
'\n0.4\n0.6\n0.6730116670092565\n'
# Entropy of the toy dataset. The bare float below appears to be the pasted
# notebook output; it matches the natural-log (base e) entropy computed above.
calcShannonEnt(dataset)
0.6730116670092565
# 按给定的特征划分数据,返回dataset(但不包括该特征),第axis列,值=value 的所有数据和标签
def split_dataset(dataset, axis, value):
    """Select the rows whose column ``axis`` equals ``value`` and drop that column.

    Args:
        dataset: Sequence of list rows.
        axis: Index of the feature column to test and remove.
        value: Feature value a row must have in column ``axis`` to be kept.

    Returns:
        A new list of new row lists, each without the ``axis`` column. The
        input rows are left untouched.
    """
    matching_rows = (row for row in dataset if row[axis] == value)
    subset = []
    for row in matching_rows:
        # The slice copies the row, so extending it never touches the input.
        trimmed = row[:axis]
        trimmed.extend(row[axis + 1:])
        subset.append(trimmed)
    return subset
# ret_dataset = []
# # 遍历数据集,并抽取按axis的当前value特征进划分的数据集(不包括axis列的值)
# for feat_vec in dataset:
# # print(feat_vec)
# if feat_vec[0] == 0:
# reduced_featvec = feat_vec[:0]
# #print(reduced_featvec,'1')
# reduced_featvec.extend(feat_vec[1:])
# #print(reduced_featvec,'2')
# ret_dataset.append(reduced_featvec)
#print('---------')
#print(ret_dataset) # [[1, 'no'], [1, 'no']]
# ret_dataset = []
# for feat_vec in dataset:
# # print(feat_vec)
# if feat_vec[0] == 1:
# reduced_featvec = feat_vec[:0]
# #print(reduced_featvec,'1')
# reduced_featvec.extend(feat_vec[1:])
# #print(reduced_featvec,'2')
# ret_dataset.append(reduced_featvec)
# print('---------')
# print(ret_dataset) # [[1, 'yes'], [1, 'yes'], [0, 'no']]
# 选取当前数据集下,用于划分数据集的最优特征
def choose_best_feature_to_split(dataset):
    """Return the index of the feature whose split gives the largest information gain.

    Every column except the last one (the class label) is tried. The gain of
    a column is the entropy of ``dataset`` minus the size-weighted entropy of
    the subsets obtained by splitting on each distinct value of that column.

    Returns:
        The winning column index, or -1 when no column yields a strictly
        positive gain. On ties the lowest index wins.
    """
    feature_count = len(dataset[0]) - 1
    entropy_before = calcShannonEnt(dataset)
    total_rows = float(len(dataset))

    winner = -1
    winner_gain = 0.0
    for column in range(feature_count):
        distinct_values = set([row[column] for row in dataset])
        entropy_after = 0.0
        for candidate in distinct_values:
            subset = split_dataset(dataset, column, candidate)
            weight = len(subset) / total_rows
            entropy_after += weight * calcShannonEnt(subset)
        gain = entropy_before - entropy_after
        if gain > winner_gain:
            winner, winner_gain = column, gain
    return winner
# 代码调试
# num_features = len(dataset[0]) - 1 # 获取当前数据集的特征个数
# print('当前数据集的特征个数',num_features)
# base_entropy = calcShannonEnt(dataset) # 计算当前数据集的信息熵
# print('当前数据集的信息熵',base_entropy)
# best_infogain = 0.0;best_feature = -1 # 初始化最优信息增益和最优特征
# for i in range(num_features):
# featlist = [example[i] for example in dataset] # #获取数据集中当前特征下的所有值
# print(f'特征{i},当前特征数据{featlist}')
# unique_vals = set(featlist) # 获取特征值,例如outlook下的sunny,overcast,rainy
# print(f'特征值为:{unique_vals}')
# new_entropy = 0.0
# for value in unique_vals: # 计算每种划分方式的信息熵
# sub_dataset = split_dataset(dataset,i,value)
# print(f'计算当前特征信息熵的子数据集:{sub_dataset}')
# prob = len(sub_dataset)/float(len(dataset))
# print(f'当前特征子数据集占比;{prob}')
# new_entropy += prob * calcShannonEnt(sub_dataset)
# infogain = base_entropy - new_entropy # 计算信息增益
# print('信息增益:{infogain}')
"""
当前数据集的特征个数 2
当前数据集的信息熵 0.6730116670092565
特征0,当前特征数据[1, 1, 1, 0, 0]
特征值为:{0, 1}
计算当前特征信息熵的子数据集:[[1, 'no'], [1, 'no']]
当前特征子数据集占比;0.4
计算当前特征信息熵的子数据集:[[1, 'yes'], [1, 'yes'], [0, 'no']]
当前特征子数据集占比;0.6
特征1,当前特征数据[1, 1, 0, 1, 1]
特征值为:{0, 1}
计算当前特征信息熵的子数据集:[[1, 'no']]
当前特征子数据集占比;0.2
计算当前特征信息熵的子数据集:[[1, 'yes'], [1, 'yes'], [0, 'no'], [0, 'no']]
当前特征子数据集占比;0.8
"""
"\n当前数据集的特征个数 2\n当前数据集的信息熵 0.6730116670092565\n特征0,当前特征数据[1, 1, 1, 0, 0]\n特征值为:{0, 1}\n计算当前特征信息熵的子数据集:[[1, 'no'], [1, 'no']]\n当前特征子数据集占比;0.4\n计算当前特征信息熵的子数据集:[[1, 'yes'], [1, 'yes'], [0, 'no']]\n当前特征子数据集占比;0.6\n特征1,当前特征数据[1, 1, 0, 1, 1]\n特征值为:{0, 1}\n计算当前特征信息熵的子数据集:[[1, 'no']]\n当前特征子数据集占比;0.2\n计算当前特征信息熵的子数据集:[[1, 'yes'], [1, 'yes'], [0, 'no'], [0, 'no']]\n当前特征子数据集占比;0.8\n"
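##### This function takes a list of class labels and builds a dictionary keyed by the
##### unique values in that list; each entry stores how often that class label occurs.
##### The class label with the highest count is returned.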
def majorityCnt(classlist):
    """Return the class label that occurs most often in ``classlist``.

    Args:
        classlist: Iterable of hashable class labels.

    Returns:
        The most frequent label. When several labels are tied, the one that
        appears first in ``classlist`` is returned.

    Raises:
        ValueError: If ``classlist`` is empty.
    """
    class_count = {}
    # Count the votes in classlist (the previous version looped over the
    # still-empty counter dict, so nothing was ever counted).
    for vote in classlist:
        class_count[vote] = class_count.get(vote, 0) + 1
    # max() walks the dict in insertion order and keeps the first maximum,
    # matching a stable descending sort by count.
    return max(class_count, key=class_count.get)
# 生成决策树主方法
def createTree(dataset, labels):
    """Recursively build an information-gain decision tree as nested dicts.

    Args:
        dataset: Non-empty list of list rows; the last element of every row
            is the class label, the preceding elements are feature values.
        labels: Feature names, one per feature column, in column order.
            This list is not modified.

    Returns:
        A class label when the node is a leaf. Otherwise a dict of the form
        ``{feature_name: {feature_value: subtree_or_label, ...}}``.
    """
    classlist = [example[-1] for example in dataset]

    # Pure node: every sample has the same class, so return that single
    # label (not the whole list of labels).
    if classlist.count(classlist[0]) == len(classlist):
        return classlist[0]
    # All features used up but classes still mixed: fall back to a majority vote.
    if len(dataset[0]) == 1:
        return majorityCnt(classlist)

    best_feat = choose_best_feature_to_split(dataset)
    # -1 means no feature gives a positive information gain. Splitting on
    # index -1 would split on the class column itself, so vote instead.
    if best_feat < 0:
        return majorityCnt(classlist)

    best_feat_label = labels[best_feat]
    # Build the reduced label list without touching the caller's list.
    remaining_labels = labels[:best_feat] + labels[best_feat + 1:]

    my_tree = {best_feat_label: {}}
    unique_vals = set(example[best_feat] for example in dataset)
    for value in unique_vals:
        # Each branch gets its own copy of the remaining labels.
        my_tree[best_feat_label][value] = createTree(
            split_dataset(dataset, best_feat, value), remaining_labels[:]
        )
    return my_tree
def classify(input_tree, feat_labels, test_vec):
    """Classify ``test_vec`` by walking a nested-dict decision tree.

    Args:
        input_tree: Tree of the form ``{feature_name: {value: subtree_or_label}}``.
        feat_labels: Feature names in the column order used by ``test_vec``.
        test_vec: Feature values of the sample to classify.

    Returns:
        The class label stored at the leaf that the sample reaches.

    Raises:
        ValueError: If the tree's feature name is not in ``feat_labels``.
        KeyError: If the sample's feature value has no branch in the tree.
    """
    # The single key of a tree node is the name of the feature it splits on.
    first_str = list(input_tree)[0]
    second_dict = input_tree[first_str]
    feat_index = feat_labels.index(first_str)
    value_of_feat = second_dict[test_vec[feat_index]]
    if isinstance(value_of_feat, dict):
        # Inner node: keep descending.
        return classify(value_of_feat, feat_labels, test_vec)
    return value_of_feat
def store_tree(input_tree, filename):
    """Serialize ``input_tree`` to ``filename`` with pickle.

    Args:
        input_tree: The object to store (any picklable object).
        filename: Path of the file to write; an existing file is overwritten.
    """
    import pickle

    # pickle writes bytes, so the file must be opened in binary mode. The
    # context manager closes it even if dumping fails.
    with open(filename, 'wb') as fw:
        pickle.dump(input_tree, fw)
def grab_tree(filename):
    """Load and return a pickled object from ``filename``.

    Args:
        filename: Path of a file containing pickled data.

    Returns:
        The unpickled object.
    """
    import pickle

    # SECURITY: unpickling can execute arbitrary code; only load files that
    # come from a trusted source.
    # pickle reads bytes, so open in binary mode; the context manager makes
    # sure the handle is closed.
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
#代码调试
# Load the samples from play.tennies.txt: one sample per line, values
# separated by single spaces, class label last. Blank lines are skipped and
# the file is closed as soon as it has been read.
with open('play.tennies.txt', encoding='utf-8') as fr:
    lenses = [line.strip().split(' ') for line in fr if line.strip()]
lenses_labels = ['outlook', 'temperature', 'humidity', 'windy']
# Build the decision tree and echo it (notebook-style cell).
lenses_tree = createTree(lenses, lenses_labels)
lenses_tree
{'outlook': {'sunny': {'humidity': {'high': ['no', 'no', 'no'],
'normal': ['yes', 'yes']}},
'rainy': {'windy': {'FALSE': ['yes', 'yes', 'yes'], 'TRUE': ['no', 'no']}},
'overcast': ['yes', 'yes', 'yes', 'yes']}}
# 模拟建树过程
# classlist = [example[-1] for example in lenses] # 返回当前数据集下标签列所有值
# if classlist.count(classlist[0])==len(classlist):
# print (classlist) #当类别完全相同时则停止继续划分,直接返回该类的标签
# if len(lenses[0])==1: ###遍历完所有的特征时,仍然不能将数据集划分成仅包含唯一类别的分组 dataSet
# print (majorityCnt(classlist))#由于无法简单的返回唯一的类标签,这里就返回出现次数最多的类别作为返回值
# best_feat = choose_best_feature_to_split(lenses)#获取最好的特征索引
# best_feat_label = lenses_labels[best_feat]
# print(best_feat,best_feat_label)
# 0 outlook
# my_tree = {best_feat_label:{}} # 当前数据集选取最好的特征存储在bestFeat中
# print(my_tree,lenses_labels) # {'outlook': {}} ['outlook', 'temperature', 'humidity', 'windy']
# del(lenses_labels[best_feat]) # 删除已经在选取的特征
# lenses_labels # ['temperature', 'humidity', 'windy']
# feat_values = [example[best_feat] for example in lenses]
# unique_vals = set(feat_values)
# unique_vals
# {'overcast', 'rainy', 'sunny'}
# for value in unique_vals:
# sub_labels = lenses_labels[:] # 复制所有的标签,这样树就不会把现有的标签弄乱
# print(value,sub_labels)
# sunny ['temperature', 'humidity', 'windy']
# rainy ['temperature', 'humidity', 'windy']
# overcast ['temperature', 'humidity', 'windy']
import matplotlib.pyplot as plt
# Text-box and arrow styles shared by the tree-plotting helpers.
decisionNode = {"boxstyle": "sawtooth", "fc": "0.8"}  # box style for decision nodes
leafNode = {"boxstyle": "round4", "fc": "0.8"}  # box style for leaf nodes
arrow_args = {"arrowstyle": "<-"}  # passed as arrowprops when annotating
def getNumLeafs(myTree):
    """Count the leaf nodes of a nested-dict decision tree.

    Args:
        myTree: Tree of the form ``{feature_name: {value: subtree_or_leaf}}``.

    Returns:
        The number of leaves, i.e. branch values that are not dicts.

    Raises:
        IndexError: If ``myTree`` is an empty dict.
    """
    # The single key of a node is the feature name; its value maps each
    # feature value to a subtree or a leaf.
    firstStr = list(myTree)[0]
    numLeafs = 0
    for child in myTree[firstStr].values():
        # isinstance also recognises dict subclasses as subtrees.
        if isinstance(child, dict):
            numLeafs += getNumLeafs(child)
        else:
            numLeafs += 1
    return numLeafs
def getTreeDepth(myTree):
    """Return the depth of a nested-dict decision tree.

    A tree whose branches are all leaves has depth 1; every further level of
    nested decision nodes adds 1.

    Args:
        myTree: Tree of the form ``{feature_name: {value: subtree_or_leaf}}``.

    Returns:
        The number of decision levels on the longest root-to-leaf path.

    Raises:
        IndexError: If ``myTree`` is an empty dict.
    """
    firstStr = list(myTree)[0]
    maxDepth = 0
    for child in myTree[firstStr].values():
        # isinstance also recognises dict subclasses as subtrees.
        if isinstance(child, dict):
            thisDepth = 1 + getTreeDepth(child)
        else:
            thisDepth = 1
        maxDepth = max(maxDepth, thisDepth)
    return maxDepth
# 绘制带箭头的注释
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """Draw one boxed node label with an arrow between it and its parent point.

    The text ``nodeTxt`` is placed at ``centerPt`` inside a box styled by
    ``nodeType``; the annotated point is ``parentPt``. Both points are given
    in axes-fraction coordinates. Drawing happens on ``createPlot.ax1``.
    """
    target_axes = createPlot.ax1
    annotation_options = dict(
        xy=parentPt,
        xycoords='axes fraction',
        xytext=centerPt,
        textcoords='axes fraction',
        va="center",
        ha="center",
        bbox=nodeType,
        arrowprops=arrow_args,
    )
    target_axes.annotate(nodeTxt, **annotation_options)
#计算父节点和子节点的中间位置,在父节点间填充文本的信息
def plotMidText(cntrPt, parentPt, txtString):
    """Write ``txtString`` halfway between a node and its parent.

    ``cntrPt`` and ``parentPt`` are (x, y) points; the text is centred on
    their midpoint, rotated by 30 degrees, on ``createPlot.ax1``.
    """
    midpoint = [
        (parentPt[axis] - cntrPt[axis]) / 2.0 + cntrPt[axis]
        for axis in (0, 1)
    ]
    createPlot.ax1.text(
        midpoint[0], midpoint[1], txtString,
        va="center", ha="center", rotation=30,
    )
# 画决策树的准备方法
def plotTree(myTree, parentPt, nodeTxt):
    """Recursively draw ``myTree`` below ``parentPt``.

    Layout state lives in function attributes that ``createPlot`` sets
    before the first call: ``plotTree.totalW`` and ``plotTree.totalD`` (leaf
    count and depth of the whole tree) and ``plotTree.xOff`` /
    ``plotTree.yOff`` (the current drawing cursor).

    Args:
        myTree: Tree of the form ``{feature_name: {value: subtree_or_leaf}}``.
        parentPt: (x, y) of the parent node in axes-fraction coordinates.
        nodeTxt: Branch label written between the parent and this node.
    """
    numLeafs = getNumLeafs(myTree)  # width of this subtree, in leaves
    firstStr = list(myTree)[0]  # feature name shown in this decision node

    # Centre the decision node horizontally above the leaves of its subtree.
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW, plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)

    secondDict = myTree[firstStr]
    # Step one level down before drawing the children.
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    for key, child in secondDict.items():
        if isinstance(child, dict):
            # Subtree: recurse with this node as the parent.
            plotTree(child, cntrPt, str(key))
        else:
            # Leaf: advance the x cursor by one leaf slot and draw it.
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(child, (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    # Step back up so siblings of this node are drawn at the right level.
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD
#if you do get a dictonary you know it's a tree, and the first element will be another dict
# 画决策树主方法
def createPlot(inTree):
    """Draw the decision tree ``inTree`` in figure 1 and show it.

    Clears the figure, creates a frameless axes without ticks (stored as
    ``createPlot.ax1`` for the drawing helpers), initialises the layout
    attributes on ``plotTree`` and then draws the tree from the top centre.
    """
    figure = plt.figure(1, facecolor='white')
    figure.clf()
    # No ticks: the axes only serve as a canvas.
    createPlot.ax1 = plt.subplot(111, frameon=False, xticks=[], yticks=[])

    leaf_total = float(getNumLeafs(inTree))
    plotTree.totalW = leaf_total
    plotTree.totalD = float(getTreeDepth(inTree))
    # Start half a leaf slot left of the axes; the top row is at y = 1.0.
    plotTree.xOff = -0.5 / leaf_total
    plotTree.yOff = 1.0

    plotTree(inTree, (0.5, 1.0), '')
    plt.show()
#def createPlot():
# fig = plt.figure(1, facecolor='white')
# fig.clf()
# createPlot.ax1 = plt.subplot(111, frameon=False) #ticks for demo puropses
# plotNode('a decision node', (0.5, 0.1), (0.1, 0.5), decisionNode)
# plotNode('a leaf node', (0.8, 0.1), (0.3, 0.8), leafNode)
# plt.show()
def retrieveTree(i):
    """Return one of two hard-coded sample trees, selected by list index ``i``.

    Index 0 is a two-level tree; index 1 adds a third level under
    'flippers'. A fresh structure is built on every call.
    """
    shallow_tree = {
        'no surfacing': {
            0: 'no',
            1: {'flippers': {0: 'no', 1: 'yes'}},
        }
    }
    head_split = {'head': {0: 'no', 1: 'yes'}}
    deep_tree = {
        'no surfacing': {
            0: 'no',
            1: {'flippers': {0: head_split, 1: 'no'}},
        }
    }
    return [shallow_tree, deep_tree][i]
# Draw the decision tree stored in lenses_tree.
createPlot(lenses_tree)
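# Optional Jupyter notebook extensions. These are shell commands, not Python:
#   pip install jupyter_contrib_nbextensions
#   jupyter contrib nbextension install --user
#   pip install jupyter_nbextensions_configurator -i <index-url>
# NOTE: the original text ends at "-i" without giving the package index URL.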