Theory
The decision-tree learning algorithm is a recursive procedure that alternates two steps: deciding whether the current node is a leaf, and splitting the node's sample set into subsets. Splits are chosen so that the samples in each resulting subset belong to a single class as far as possible; the common measures are information gain and the Gini index, i.e. one picks the split that maximizes information gain or minimizes the Gini index. This post implements the information-gain criterion.
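Concretely, with D a sample set, p_k the proportion of class k in D, and t a candidate threshold on a continuous attribute a (the setting used by the code below), the quantities involved are:

\mathrm{Ent}(D) = -\sum_{k=1}^{|\mathcal{Y}|} p_k \log_2 p_k

\mathrm{Gain}(D, a, t) = \mathrm{Ent}(D) - \frac{|D_t^-|}{|D|}\,\mathrm{Ent}(D_t^-) - \frac{|D_t^+|}{|D|}\,\mathrm{Ent}(D_t^+)

where D_t^- and D_t^+ hold the samples whose value on a falls below and above t, respectively. The implementation searches the midpoints of adjacent sorted feature values as the candidate thresholds t.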
A node is declared a leaf when:
(1) all samples at the current node carry the same class label;
(2) all samples take identical values on every attribute, so no split is possible;
(3) the sample set at the current node is empty.
Cases (1) and (2) are exactly the cases in which the maximal information gain is 0, which is how the code below detects them (the leaf is then labeled by majority vote).
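As a concrete instance (class counts as given in the book): the watermelon 3.0a dataset used below has 17 samples, 8 positive and 9 negative, so the entropy at the root is

\mathrm{Ent}(D) = -\tfrac{8}{17}\log_2\tfrac{8}{17} - \tfrac{9}{17}\log_2\tfrac{9}{17} \approx 0.998,

which is strictly positive, so the root is split further.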
Code
# -*- coding: utf-8 -*-
"""
Decision Tree
From 'Machine Learning, Zhihua Zhou' Ch4
Model: Decision Tree
Information Entropy Gain method
Dataset: P89 watermelon_3.0a (watermelon_3.0a.npz)
@author: weiyx15
"""
import numpy as np
import matplotlib.pyplot as plt
class DecisionTree:
def load_data(self, filename):
dic = np.load(filename)
self.x = dic['arr_0']
self.y = dic['arr_1']
self.m = self.x.shape[0]
self.d = self.x.shape[1]
self.k = self.y.max() + 1
def __init__(self):
self.load_data('watermelon_3.0a.npz')
def ent(self, D):
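        # information entropy of the subset D, given as a list of sample indices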
nd = len(D)
if nd == 0:
return 0
        cnt = [0] * self.k
        p = [0.0] * self.k  # separate list; 'p = cnt' would alias cnt and overwrite the counts
ent = 0
for i in D:
j = self.y[i]
cnt[j] = cnt[j] + 1
for j in range(self.k):
p[j] = cnt[j]/nd
if p[j] > 0:
ent = ent - p[j]*np.log2(p[j])
return ent
def Dsplit(self, D, ind):
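        # recursively grow the tree: try every (feature, threshold) pair on the
        # index set D and keep the split with the largest information gain;
        # ind is this node's slot in the array-based tree self.dt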
gain_max = 0
feature_max = -1
value_max = -1
decision_made = -1 # decision == -1 means not leaf node
dm0 = list()
dm1 = list()
for j in range(self.d):
for thr in self.Sep[j]:
d0 = list()
d1 = list()
for i in D:
                    if self.x[i, j] < thr:
d0.append(i)
else:
d1.append(i)
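                # gain = parent entropy minus size-weighted entropy of the two children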
gain = self.ent(D) - \
(len(d0)*self.ent(d0) + len(d1)*self.ent(d1))/len(D)
if gain > gain_max:
gain_max = gain
feature_max = j
value_max = thr
dm0 = d0
dm1 = d1
cnt = [0] * self.k
if gain_max == 0:
for i in D:
cnt[self.y[i]] = cnt[self.y[i]] + 1
decision_made = cnt.index(max(cnt))
self.dt[ind] = tree_node(D, feature_max, value_max, decision_made)
if decision_made == -1:
if dm0 != []:
self.Dsplit(dm0, 2*ind+1)
if dm1 != []:
self.Dsplit(dm1, 2*ind+2)
def train(self):
        # candidate split points: for each feature, the midpoints between
        # adjacent sorted values (bi-partition of a continuous attribute)
data_list = list()
for i in range(self.m):
data_list.append(aPieceofData(self.x[i,:],self.d,self.y[i]))
dl = [None] * self.d
for i in range(self.d):
dl[i] = sorted(data_list, key=lambda apiece : apiece.x[i])
        # one independent list per feature; '[list()] * self.d' would create
        # d references to the same list, mixing the thresholds of all features
        self.Sep = [list() for _ in range(self.d)]
for i in range(self.m):
for j in range(self.d):
if i > 0:
self.Sep[j].append((dl[j][i-1].x[j] + dl[j][i].x[j])/2)
        # build the decision tree as an array-based binary tree:
        # the children of node i live at indices 2*i+1 and 2*i+2
        # (4*m slots is a heuristic bound; a very deep tree could exceed it)
        self.dt = [None] * 4 * self.m
Droot = list(range(self.m))
self.Dsplit(Droot, 0)
def go_through(self, xt, cur):
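        # descend from node cur until a leaf (_decision != -1) is reached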
if self.dt[cur]._decision != -1:
return self.dt[cur]._decision
else:
if xt[self.dt[cur]._feature] < self.dt[cur]._value:
return self.go_through(xt, 2*cur+1)
else:
return self.go_through(xt, 2*cur+2)
def test(self, xt):
return self.go_through(xt, 0)
def plot_data(self):
        x1 = self.x[self.y == 1, :]  # positive samples
        x0 = self.x[self.y == 0, :]  # negative samples
plt.plot(x1[:,0], x1[:,1], 'b.')
plt.plot(x0[:,0], x0[:,1], 'r.')
def go_through_disp(self, cur):
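        # print the tree in pre-order: current node, then left subtree, then right subtree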
if self.dt[cur]._decision != -1:
print('Decision: ' + str(self.dt[cur]._decision))
else:
print('Feature: ' + str(self.dt[cur]._feature) + \
' Value: ' + str(self.dt[cur]._value))
self.go_through_disp(2*cur+1)
self.go_through_disp(2*cur+2)
def tree_disp(self):
self.go_through_disp(0)
class aPieceofData:
def __init__(self, x, d, y):
self.x = x
self.d = d
self.y = y
class tree_node:
def __init__(self, data, feature, value, decision):
self._data = data # D set
self._feature = feature # feature number: 0,1,2,...
self._value = value # feature split value
self._decision = decision # decision of leave node
if __name__ == '__main__':
dt = DecisionTree()
dt.train()
ans = dt.test([0.2, 0.2])
dt.tree_disp()
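As a quick sanity check (a hypothetical follow-up, not part of the original script; it assumes the script above has just been run, so dt is the trained model), the tree can be evaluated on its own training data. An unpruned tree grown until every leaf is pure or unsplittable should fit the training set perfectly, unless identical points carry different labels:

# hypothetical usage sketch: resubstitution (training-set) accuracy
correct = sum(dt.test(dt.x[i, :]) == dt.y[i] for i in range(dt.m))
print('training accuracy: %.3f' % (correct / dt.m))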
Results
Pre-order traversal of the decision tree produces the following output:
Feature: 1 Value: 0.126
Decision: 0
Feature: 0 Value: 0.3815
Decision: 0
Feature: 1 Value: 0.2045
Feature: 0 Value: 0.5185
Decision: 1
Decision: 0
Decision: 1