Decision tree algorithm (hand-written from scratch)

These are notes from working through and debugging other people's code; I haven't fully organized them, so parts may be a bit messy.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import log
import math
import operator
import sys
# !pip install treePlotter
def createDataSet():
    dataset = [
        [1,1,'yes'],
        [1,1,'yes'],
        [1,0,'no'],
        [0,1,'no'],
        [0,1,'no'],
    ]
    labels = ['no surfacing','flippers']
    return dataset,labels
dataset,labels = createDataSet()
dataset,labels
([[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']],
 ['no surfacing', 'flippers'])
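
For reference, the quantity computed in the next cell is the Shannon entropy of the label column,

$$H(D) = -\sum_k p_k \log p_k,$$

where $p_k$ is the fraction of samples in class $k$. This notebook uses the natural logarithm (nats); base 2 (bits) is the more common textbook choice, and both bases always select the same split.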
# Compute the Shannon entropy of a dataset from its label column
def calcShannonEnt(dataset):
    num_entries = len(dataset) # number of samples
    label_counts = {} # dict keyed by the last column's value, i.e. the class label
    for feat_vec in dataset: # walk the dataset one row at a time
        current_label = feat_vec[-1] # the label is the last column of the row
        if current_label not in label_counts.keys():
            label_counts[current_label] = 0
        label_counts[current_label] += 1
    shannonEnt = 0.0 # initialize the entropy
    for key in label_counts:
        prob = float(label_counts[key])/num_entries
        shannonEnt -= prob * log(prob, math.exp(1)) # natural log (base e), not base 2
    return shannonEnt
# num_entries = len(dataset) # number of samples
# label_counts = {} # dict keyed by the last column's value, i.e. the class label
# for feat_vec in dataset: # walk the dataset one row at a time
#     current_label = feat_vec[-1] # the label is the last column of the row
#     if current_label not in label_counts.keys():
#         label_counts[current_label] = 0
#     label_counts[current_label] += 1
#     print(feat_vec, label_counts)
"""
[1, 1, 'yes'] {'yes': 1}
[1, 1, 'yes'] {'yes': 2}
[1, 0, 'no'] {'yes': 2, 'no': 1}
[0, 1, 'no'] {'yes': 2, 'no': 2}
[0, 1, 'no'] {'yes': 2, 'no': 3}
"""
# shannonEnt = 0.0 # initialize the entropy
# for key in label_counts:
#     prob = float(label_counts[key])/num_entries
#     print(prob)
#     shannonEnt -= prob * log(prob, math.exp(1)) # natural log (base e)
# print(shannonEnt)
"""
0.4
0.6
0.6730116670092565
"""
calcShannonEnt(dataset)
0.6730116670092565
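As a sanity check (a small sketch of my own, not from the original post): the same entropy computed with log base 2, the convention most ID3 write-ups use. The base only rescales the entropy and never changes which split wins.

```python
from math import log

def calc_shannon_ent_base2(dataset):
    # same counting logic as calcShannonEnt, but entropy in bits
    num_entries = len(dataset)
    label_counts = {}
    for feat_vec in dataset:
        label_counts[feat_vec[-1]] = label_counts.get(feat_vec[-1], 0) + 1
    return -sum((cnt / num_entries) * log(cnt / num_entries, 2)
                for cnt in label_counts.values())

print(calc_shannon_ent_base2(dataset))  # ~0.9710 bits, vs 0.6730 nats above
```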
# Split the dataset on a given feature: return every row whose column `axis`
# equals `value`, with that column removed. E.g. if outlook is column 0,
# axis=0 and value='sunny' selects the sunny rows.
def split_dataset(dataset, axis, value):
    ret_dataset = []
    # walk the dataset and collect the rows matching `value` in column `axis`
    # (the axis column itself is dropped)
    for feat_vec in dataset:
        if feat_vec[axis] == value:
            reduced_featvec = feat_vec[:axis]
            reduced_featvec.extend(feat_vec[axis+1:])
            ret_dataset.append(reduced_featvec)
    return ret_dataset
# ret_dataset = []
# # walk the dataset, keeping rows with value 0 in column 0 (column 0 removed)
# for feat_vec in dataset:
# #     print(feat_vec)
#     if feat_vec[0] == 0:
#         reduced_featvec = feat_vec[:0]
#         #print(reduced_featvec,'1')
#         reduced_featvec.extend(feat_vec[1:])
#         #print(reduced_featvec,'2')
#         ret_dataset.append(reduced_featvec)

# print('---------')
# print(ret_dataset) # [[1, 'no'], [1, 'no']]

# ret_dataset = []
# for feat_vec in dataset:
# #     print(feat_vec)
#     if feat_vec[0] == 1:
#         reduced_featvec = feat_vec[:0]
#         #print(reduced_featvec,'1')
#         reduced_featvec.extend(feat_vec[1:])
#         #print(reduced_featvec,'2')
#         ret_dataset.append(reduced_featvec)
# print('---------')
# print(ret_dataset) # [[1, 'yes'], [1, 'yes'], [0, 'no']]
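The traced behavior above can be reproduced directly with the finished function:

```python
# Splitting on feature 0 ('no surfacing'); matching rows come back with
# column 0 removed, matching the two traces above.
print(split_dataset(dataset, 0, 0))  # [[1, 'no'], [1, 'no']]
print(split_dataset(dataset, 0, 1))  # [[1, 'yes'], [1, 'yes'], [0, 'no']]
```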

# Pick the best feature to split the current dataset on (maximum information gain)
def choose_best_feature_to_split(dataset):
    num_features = len(dataset[0]) - 1 # number of features in the current dataset
    base_entropy = calcShannonEnt(dataset) # entropy of the current dataset
    best_infogain = 0.0; best_feature = -1 # initialize best gain and best feature
    for i in range(num_features):
        featlist = [example[i] for example in dataset] # all values of feature i
        unique_vals = set(featlist) # distinct values, e.g. sunny/overcast/rainy for outlook
        new_entropy = 0.0
        for value in unique_vals: # entropy of each candidate split
            sub_dataset = split_dataset(dataset, i, value)
            prob = len(sub_dataset)/float(len(dataset))
            new_entropy += prob * calcShannonEnt(sub_dataset)
        infogain = base_entropy - new_entropy # information gain
        if (infogain > best_infogain): # keep only the best gain across features
            best_infogain = infogain
            best_feature = i
    return best_feature
# Debugging walkthrough
# num_features = len(dataset[0]) - 1 # number of features in the current dataset
# print('number of features:', num_features)
# base_entropy = calcShannonEnt(dataset) # entropy of the current dataset
# print('entropy of the dataset:', base_entropy)
# best_infogain = 0.0; best_feature = -1 # initialize best gain and best feature
# for i in range(num_features):
#         featlist = [example[i] for example in dataset] # all values of feature i
#         print(f'feature {i}, values {featlist}')
#         unique_vals = set(featlist) # distinct values
#         print(f'distinct values: {unique_vals}')
#         new_entropy = 0.0
#         for value in unique_vals: # entropy of each candidate split
#             sub_dataset = split_dataset(dataset, i, value)
#             print(f'subset for this value: {sub_dataset}')
#             prob = len(sub_dataset)/float(len(dataset))
#             print(f'subset fraction: {prob}')
#             new_entropy += prob * calcShannonEnt(sub_dataset)
#         infogain = base_entropy - new_entropy # information gain
#         print(f'information gain: {infogain}')

"""
当前数据集的特征个数 2
当前数据集的信息熵 0.6730116670092565
特征0,当前特征数据[1, 1, 1, 0, 0]
特征值为:{0, 1}
计算当前特征信息熵的子数据集:[[1, 'no'], [1, 'no']]
当前特征子数据集占比;0.4
计算当前特征信息熵的子数据集:[[1, 'yes'], [1, 'yes'], [0, 'no']]
当前特征子数据集占比;0.6
特征1,当前特征数据[1, 1, 0, 1, 1]
特征值为:{0, 1}
计算当前特征信息熵的子数据集:[[1, 'no']]
当前特征子数据集占比;0.2
计算当前特征信息熵的子数据集:[[1, 'yes'], [1, 'yes'], [0, 'no'], [0, 'no']]
当前特征子数据集占比;0.8
"""
"\n当前数据集的特征个数 2\n当前数据集的信息熵 0.6730116670092565\n特征0,当前特征数据[1, 1, 1, 0, 0]\n特征值为:{0, 1}\n计算当前特征信息熵的子数据集:[[1, 'no'], [1, 'no']]\n当前特征子数据集占比;0.4\n计算当前特征信息熵的子数据集:[[1, 'yes'], [1, 'yes'], [0, 'no']]\n当前特征子数据集占比;0.6\n特征1,当前特征数据[1, 1, 0, 1, 1]\n特征值为:{0, 1}\n计算当前特征信息熵的子数据集:[[1, 'no']]\n当前特征子数据集占比;0.2\n计算当前特征信息熵的子数据集:[[1, 'yes'], [1, 'yes'], [0, 'no'], [0, 'no']]\n当前特征子数据集占比;0.8\n"
##### Takes a list of class names and builds a dict whose keys are the unique
##### values in classlist and whose values are the occurrence counts of each
##### class label. The dict is then sorted by count via operator.itemgetter,
##### and the most frequent class name is returned.
def majorityCnt(classlist):
    class_count = {}
    for vote in classlist:  # was `for vote in class_count`, which never iterates
        if vote not in class_count.keys():
            class_count[vote] = 0
        class_count[vote] += 1
    # .iteritems() is Python 2; Python 3 uses .items()
    sorted_class_count = sorted(class_count.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_class_count[0][0]
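A quick check of the majority vote, plus the standard-library equivalent (my addition, not in the original post):

```python
print(majorityCnt(['yes', 'no', 'no']))  # 'no'

# the same vote as a one-liner
from collections import Counter
print(Counter(['yes', 'no', 'no']).most_common(1)[0][0])  # 'no'
```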
# Main routine: recursively build the decision tree
def createTree(dataset, labels):
    classlist = [example[-1] for example in dataset] # all values in the label column
    if classlist.count(classlist[0]) == len(classlist):
        return classlist[0] # all labels identical: stop splitting and return that class
    if len(dataset[0]) == 1: # all features used up but the classes are still mixed
        return majorityCnt(classlist) # no single label to return, so return the majority class
    best_feat = choose_best_feature_to_split(dataset) # index of the best feature
    best_feat_label = labels[best_feat]

    # Store the tree in a plain dict; this representation is what the
    # plotting code below consumes.
    my_tree = {best_feat_label: {}} # the chosen feature becomes the root of this subtree
    del(labels[best_feat]) # remove the feature we just used
    feat_values = [example[best_feat] for example in dataset]
    unique_vals = set(feat_values)
    for value in unique_vals:
        sub_labels = labels[:] # copy the labels so recursion doesn't clobber the caller's list
        my_tree[best_feat_label][value] = createTree(split_dataset(dataset, best_feat, value), sub_labels)
    return my_tree
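On the toy dataset this reproduces the first canned tree from `retrieveTree` further below (note that `createTree` mutates `labels`, so pass a copy):

```python
my_dat, my_labels = createDataSet()
print(createTree(my_dat, my_labels[:]))
# {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
```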
def classify(input_tree, feat_labels, test_vec):
    first_str = list(input_tree.keys())[0]  # dict keys aren't indexable in Python 3
    second_dict = input_tree[first_str]
    feat_index = feat_labels.index(first_str)  # was `firststr`, a NameError
    key = test_vec[feat_index]
    value_of_feat = second_dict[key]
    if isinstance(value_of_feat, dict):  # still an internal node: recurse
        class_label = classify(value_of_feat, feat_labels, test_vec)
    else:  # leaf node: this is the prediction
        class_label = value_of_feat
    return class_label
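A usage sketch for `classify`; `feat_labels` must be the original, unmutated label list so that `.index()` finds the right column:

```python
toy_tree = {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
print(classify(toy_tree, ['no surfacing', 'flippers'], [1, 0]))  # 'no'
print(classify(toy_tree, ['no surfacing', 'flippers'], [1, 1]))  # 'yes'
```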
def store_tree(input_tree, filename):
    import pickle
    fw = open(filename, 'wb')  # pickle requires binary mode
    pickle.dump(input_tree, fw)
    fw.close()
def grab_tree(filename):
    import pickle
    fr = open(filename, 'rb')  # binary mode here too
    return pickle.load(fr)
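A round-trip sketch (the filename is just an example):

```python
toy_tree = {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
store_tree(toy_tree, 'classifier_storage.pkl')  # example filename
print(grab_tree('classifier_storage.pkl') == toy_tree)  # True
```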
# Debugging on a real dataset
fr = open('play.tennies.txt',encoding='utf-8')
lenses = [x.strip().split(' ') for x in fr.readlines()]
lenses_labels = ['outlook','temperature','humidity','windy']
# print(lenses,lenses_labels)
lenses_tree = createTree(lenses,lenses_labels)
lenses_tree
{'outlook': {'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}},
  'rainy': {'windy': {'FALSE': 'yes', 'TRUE': 'no'}},
  'overcast': 'yes'}}
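With the learned tree, a new day can be classified. This is a sketch: the label list must match the column order before `createTree` mutated it, and `'hot'` is an assumed temperature value (the tree never tests temperature, so any value works):

```python
print(classify(lenses_tree, ['outlook', 'temperature', 'humidity', 'windy'],
               ['sunny', 'hot', 'normal', 'FALSE']))  # 'yes'
```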
# Simulating the tree-building steps
# classlist = [example[-1] for example in lenses] # all values in the label column
# if classlist.count(classlist[0]) == len(classlist):
#     print(classlist[0]) # all labels identical: stop and return that class
# if len(lenses[0]) == 1: # all features used up but the classes are still mixed
#     print(majorityCnt(classlist)) # return the majority class
# best_feat = choose_best_feature_to_split(lenses) # index of the best feature
# best_feat_label = lenses_labels[best_feat]
# print(best_feat, best_feat_label)

# 0 outlook
# my_tree = {best_feat_label: {}} # the chosen feature becomes the root
# print(my_tree, lenses_labels) # {'outlook': {}} ['outlook', 'temperature', 'humidity', 'windy']
# del(lenses_labels[best_feat]) # remove the feature we just used
# lenses_labels # ['temperature', 'humidity', 'windy']

# feat_values = [example[best_feat] for example in lenses]
# unique_vals = set(feat_values)
# unique_vals

# {'overcast', 'rainy', 'sunny'}
# for value in unique_vals:
#     sub_labels = lenses_labels[:] # copy the labels so the tree doesn't clobber them
#     print(value, sub_labels)

# sunny ['temperature', 'humidity', 'windy']
# rainy ['temperature', 'humidity', 'windy']
# overcast ['temperature', 'humidity', 'windy']
import matplotlib.pyplot as plt

decisionNode = dict(boxstyle="sawtooth", fc="0.8") # text-box and arrow styles for the plot
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")

def getNumLeafs(myTree): # count the leaf nodes of the tree
    numLeafs = 0
    temp = myTree.copy()
    firstStr = list(temp.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict': # a dict means an internal node, so recurse
            numLeafs += getNumLeafs(secondDict[key])
        else:   numLeafs += 1
    return numLeafs

def getTreeDepth(myTree): # depth of the tree
    maxDepth = 0
    temp = myTree.copy()
    firstStr = list(temp.keys())[0]
    #firstStr = myTree.keys()[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict': # a dict means an internal node, so recurse
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:   thisDepth = 1
        if thisDepth > maxDepth: maxDepth = thisDepth
    return maxDepth
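Checking both helpers on the first canned tree (the same shape `retrieveTree(0)` returns below):

```python
toy_tree = {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
print(getNumLeafs(toy_tree))   # 3
print(getTreeDepth(toy_tree))  # 2
```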

# Draw a node with an arrowed annotation
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    createPlot.ax1.annotate(nodeTxt, xy=parentPt,  xycoords='axes fraction',
             xytext=centerPt, textcoords='axes fraction',
             va="center", ha="center", bbox=nodeType, arrowprops=arrow_args )

# Place the edge label halfway between a parent node and its child
def plotMidText(cntrPt, parentPt, txtString):
    xMid = (parentPt[0]-cntrPt[0])/2.0 + cntrPt[0]
    yMid = (parentPt[1]-cntrPt[1])/2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString, va="center", ha="center", rotation=30)

# Recursive helper that lays out and draws the tree
def plotTree(myTree, parentPt, nodeTxt): # the first key is the feature this subtree splits on
    numLeafs = getNumLeafs(myTree)  # width of the subtree
    depth = getTreeDepth(myTree)    # depth of the subtree
    temp = myTree.copy()
    firstStr = list(temp.keys())[0]
    #firstStr = myTree.keys()[0]    # the text label for this node
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs))/2.0/plotTree.totalW, plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0/plotTree.totalD
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict': # a dict is a subtree, so recurse
            plotTree(secondDict[key], cntrPt, str(key))
        else:   # a leaf node: draw it directly
            plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0/plotTree.totalD

# Entry point for drawing the decision tree
def createPlot(inTree):
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)    # no ticks
    #createPlot.ax1 = plt.subplot(111, frameon=False) # ticks for demo purposes
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5/plotTree.totalW; plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()

#def createPlot():
#    fig = plt.figure(1, facecolor='white')
#    fig.clf()
#    createPlot.ax1 = plt.subplot(111, frameon=False) # ticks for demo purposes
#    plotNode('a decision node', (0.5, 0.1), (0.1, 0.5), decisionNode)
#    plotNode('a leaf node', (0.8, 0.1), (0.3, 0.8), leafNode)
#    plt.show()
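How the layout bookkeeping works: `xOff` advances by one leaf-width (`1/totalW`) each time a leaf is drawn, and `yOff` steps down one level (`1/totalD`) on descent and back up on return. A tiny check of those constants on the canned toy tree (a sketch of my own):

```python
toy_tree = {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
# with 3 leaves and depth 2, leaves land at x = 1/6, 3/6, 5/6
# and node levels sit at y = 1.0, 0.5, 0.0
print(float(getNumLeafs(toy_tree)), float(getTreeDepth(toy_tree)))  # 3.0 2.0
```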

def retrieveTree(i):
    # two canned trees for testing the plotting code
    listOfTrees = [
        {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}},
        {'no surfacing': {0: 'no', 1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}}}},
    ]
    return listOfTrees[i]
createPlot(lenses_tree)

(Figure: the decision tree for the play-tennis data, rendered by createPlot.)

To set up the Jupyter notebook extensions used while writing this post, run these in a shell:

```
pip install jupyter_contrib_nbextensions
jupyter contrib nbextension install --user
pip install jupyter_nbextensions_configurator -i
```
Origin blog.csdn.net/qq_33489955/article/details/124313268