Theory
The decision-tree learning algorithm is a recursive procedure that alternates two steps: deciding whether the current node is a leaf, and splitting the node's sample set into subsets. Splits are chosen so that the samples in each resulting subset belong to a single class as far as possible; the common measures are information gain and the Gini index, i.e. one picks the split that maximizes information gain or minimizes the Gini index. This post implements the information-gain criterion.
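Concretely, with D a sample set, p_k the proportion of class k in D, and t a candidate threshold on a continuous attribute a (the setting used by the code below), the quantities involved are:

\mathrm{Ent}(D) = -\sum_{k=1}^{|\mathcal{Y}|} p_k \log_2 p_k

\mathrm{Gain}(D, a, t) = \mathrm{Ent}(D) - \frac{|D_t^-|}{|D|}\,\mathrm{Ent}(D_t^-) - \frac{|D_t^+|}{|D|}\,\mathrm{Ent}(D_t^+)

where D_t^- and D_t^+ hold the samples whose value on a falls below and above t, respectively. The implementation searches the midpoints of adjacent sorted feature values as the candidate thresholds t.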
A node is declared a leaf when:
(1) all samples at the current node carry the same class label;
(2) all samples take identical values on every attribute, so no split is possible;
(3) the sample set at the current node is empty.
Cases (1) and (2) are exactly the cases in which the maximal information gain is 0, which is how the code below detects them (the leaf is then labeled by majority vote).
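As a concrete instance (class counts as given in the book): the watermelon 3.0a dataset used below has 17 samples, 8 positive and 9 negative, so the entropy at the root is

\mathrm{Ent}(D) = -\tfrac{8}{17}\log_2\tfrac{8}{17} - \tfrac{9}{17}\log_2\tfrac{9}{17} \approx 0.998,

which is strictly positive, so the root is split further.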
Code
# -*- coding: utf-8 -*-
"""
Decision Tree
From 'Machine Learning, Zhihua Zhou' Ch4
Model: Decision Tree
Information Entropy Gain method
Dataset: P89 watermelon_3.0a (watermelon_3.0a.npz)
@author: weiyx15
"""
import numpy as np
import matplotlib.pyplot as plt
class DecisionTree:
def load_data(self, filename):
dic = np.load(filename)
self.x = dic['arr_0']
self.y = dic['arr_1']
self.m = self.x.shape[0]
self.d = self.x.shape[1]
self.k = self.y.max() + 1
def __init__(self):
self.load_data('watermelon_3.0a.npz')
def ent(self, D):
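        # information entropy of the subset D, given as a list of sample indices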
nd = len(D)
if nd == 0:
return 0
        cnt = [0] * self.k
        p = [0.0] * self.k  # separate list; 'p = cnt' would alias cnt and overwrite the counts
ent = 0
for i in D:
j = self.y[i]
cnt[j] = cnt[j] + 1
for j in range(self.k):
p[j] = cnt[j]/nd
if p[j] > 0:
ent = ent - p[j]*np.log2(p[j])
return ent
def Dsplit(self, D, ind):
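        # recursively grow the tree: try every (feature, threshold) pair on the
        # index set D and keep the split with the largest information gain;
        # ind is this node's slot in the array-based tree self.dt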
gain_max = 0
feature_max = -1
value_max = -1
decision_made = -1 # decision == -1 means not leaf node
dm0 = list()
dm1 = list()
for j in range(self.d):
for thr in self.Sep[j]:
d0 = list()
d1 = list()
for i in D:
                    if self.x[i, j] < thr:
d0.append(i)
else:
d1.append(i)
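                # gain = parent entropy minus size-weighted entropy of the two children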
gain = self.ent(D) - \
(len(d0)*self.ent(d0) + len(d1)*self.ent(d1))/len(D)
if gain > gain_max:
gain_max = gain
feature_max = j
value_max = thr
dm0 = d0
dm1 = d1
cnt = [0] * self.k
if gain_max == 0:
for i in D:
cnt[self.y[i]] = cnt[self.y[i]] + 1
decision_made = cnt.index(max(cnt))
self.dt[ind] = tree_node(D, feature_max, value_max, decision_made)
if decision_made == -1:
if dm0 != []:
self.Dsplit(dm0, 2*ind+1)
if dm1 != []:
self.Dsplit(dm1, 2*ind+2)
def train(self):
        # candidate split points: for each feature, the midpoints between
        # adjacent sorted values (bi-partition of a continuous attribute)
data_list = list()
for i in range(self.m):
data_list.append(aPieceofData(self.x[i,:],self.d,self.y[i]))
dl = [None] * self.d
for i in range(self.d):
dl[i] = sorted(data_list, key=lambda apiece : apiece.x[i])
        # one independent list per feature; '[list()] * self.d' would create
        # d references to the same list, mixing the thresholds of all features
        self.Sep = [list() for _ in range(self.d)]
for i in range(self.m):
for j in range(self.d):
if i > 0:
self.Sep[j].append((dl[j][i-1].x[j] + dl[j][i].x[j])/2)
        # build the decision tree as an array-based binary tree:
        # the children of node i live at indices 2*i+1 and 2*i+2
        # (4*m slots is a heuristic bound; a very deep tree could exceed it)
        self.dt = [None] * 4 * self.m
Droot = list(range(self.m))
self.Dsplit(Droot, 0)
def go_through(self, xt, cur):
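        # descend from node cur until a leaf (_decision != -1) is reached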
if self.dt[cur]._decision != -1:
return self.dt[cur]._decision
else:
if xt[self.dt[cur]._feature] < self.dt[cur]._value:
return self.go_through(xt, 2*cur+1)
else:
return self.go_through(xt, 2*cur+2)
def test(self, xt):
return self.go_through(xt, 0)
def plot_data(self):
        x1 = self.x[self.y == 1, :]  # positive samples
        x0 = self.x[self.y == 0, :]  # negative samples
plt.plot(x1[:,0], x1[:,1], 'b.')
plt.plot(x0[:,0], x0[:,1], 'r.')
def go_through_disp(self, cur):
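        # print the tree in pre-order: current node, then left subtree, then right subtree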
if self.dt[cur]._decision != -1:
print('Decision: ' + str(self.dt[cur]._decision))
else:
print('Feature: ' + str(self.dt[cur]._feature) + \
' Value: ' + str(self.dt[cur]._value))
self.go_through_disp(2*cur+1)
self.go_through_disp(2*cur+2)
def tree_disp(self):
self.go_through_disp(0)
class aPieceofData:
def __init__(self, x, d, y):
self.x = x
self.d = d
self.y = y
class tree_node:
def __init__(self, data, feature, value, decision):
self._data = data # D set
self._feature = feature # feature number: 0,1,2,...
self._value = value # feature split value
self._decision = decision # decision of leave node
if __name__ == '__main__':
dt = DecisionTree()
dt.train()
ans = dt.test([0.2, 0.2])
dt.tree_disp()
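As a quick sanity check (a hypothetical follow-up, not part of the original script; it assumes the script above has just been run, so dt is the trained model), the tree can be evaluated on its own training data. An unpruned tree grown until every leaf is pure or unsplittable should fit the training set perfectly, unless identical points carry different labels:

# hypothetical usage sketch: resubstitution (training-set) accuracy
correct = sum(dt.test(dt.x[i, :]) == dt.y[i] for i in range(dt.m))
print('training accuracy: %.3f' % (correct / dt.m))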
Results
Pre-order traversal of the decision tree produces the following output:
Feature: 1 Value: 0.126
Decision: 0
Feature: 0 Value: 0.3815
Decision: 0
Feature: 1 Value: 0.2045
Feature: 0 Value: 0.5185
Decision: 1
Decision: 0
Decision: 1