Contains the core parts of three decision-tree algorithms (ID3, C4.5, CART).
No good test data was found, but that matters little for working through the algorithm logic.
For pruning, only post-pruning of the CART regression tree is implemented so far.
import numpy as np
from collections import Counter
class DecisionTree():
    def __init__(self, algorithm='ID3'):
        self.algorithm = algorithm  # one of 'ID3', 'C4.5', 'CARTcla' (classification) or 'CARTreg' (regression)
    def cal_entroy(self, dataSet):
        '''
        Compute the empirical entropy of a dataset, given as an np.array.
        :param dataSet: dataset of shape m*n, where m is the number of samples and n the number of features
        :return: empirical entropy of the dataset
        '''
        m = dataSet.shape[0]  # number of samples
        labels = Counter(dataSet[:, -1].reshape(m).tolist())  # classes and their counts
        entroy = 0  # initialize the empirical entropy
        for amount in labels.values():
            prob = amount / m  # probability pi of each class
            entroy -= prob * np.log(prob)  # e = -sum(pi*log(pi)); natural log, so the unit is nats
        return entroy
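    # For example, with the natural log used above: for labels ['yes', 'yes', 'no'],
    # entropy = -(2/3)*ln(2/3) - (1/3)*ln(1/3) ≈ 0.6365 nats.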
    def cal_gini(self, dataSet):
        '''
        Compute the Gini index of a dataset, given as an np.array.
        :param dataSet: dataset of shape m*n, where m is the number of samples and n the number of features
        :return: Gini index of the dataset
        '''
        m = dataSet.shape[0]
        labels = Counter(dataSet[:, -1].reshape(m).tolist())
        gini = 1
        for amount in labels.values():
            prob = amount / m
            gini -= prob**2  # g = 1 - sum(pi**2)
        return gini
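    # For example: for labels ['yes', 'yes', 'no'],
    # gini = 1 - (2/3)**2 - (1/3)**2 = 4/9 ≈ 0.444.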
    def cal_se(self, dataSet):
        '''
        Compute the total squared error of a dataset, given as an np.array.
        np.var computes the mean squared deviation; multiplying it by the sample count gives the total squared error.
        :param dataSet: dataset of shape m*n, where m is the number of samples and n the number of features
        :return: total squared error of the dataset
        '''
        return np.var(dataSet[:, -1]) * dataSet.shape[0] if dataSet.shape[0] > 0 else 0
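    # For example: for targets [1, 2, 3] the mean is 2, np.var gives 2/3,
    # and cal_se returns (2/3) * 3 = 2.0, i.e. sum((y - mean)**2).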
    def split_dataSet(self, dataSet, feature, value):
        '''
        Split the dataset on feature `feature` at value `value`.
        :param dataSet: dataset of shape m*(n+1), where m is the number of samples and n the number of features
        :param feature: index of the feature used as the split point
        :param value: one value of that feature
        :return: for ID3/C4.5, the subset with dataSet[feature] == value, with the feature column removed;
                 for CART, the (left, right) pair of subsets
        '''
        if self.algorithm == 'ID3' or self.algorithm == 'C4.5':
            # Take all samples whose feature equals the given value and drop that feature column.
            mask = dataSet[:, feature] == value
            return np.concatenate((dataSet[mask, :feature], dataSet[mask, feature + 1:]), axis=1)
        else:  # take the samples on each side of the split, for CART
            if self.algorithm == 'CARTcla':  # CART classification tree, discrete training data
                left = dataSet[np.nonzero(dataSet[:, feature] == value)[0], :]
                right = dataSet[np.nonzero(dataSet[:, feature] != value)[0], :]
            else:  # CART regression tree, continuous training data
                left = dataSet[np.nonzero(dataSet[:, feature] <= value)[0], :]
                right = dataSet[np.nonzero(dataSet[:, feature] > value)[0], :]
            return left, right
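    # Conceptually, for ID3/C4.5: splitting [[1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no']]
    # on feature 0 with value 1 keeps the first two rows and drops column 0,
    # giving [[1, 'yes'], [0, 'no']].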
    def cal_entroy_gain(self, baseEnt, dataSet, feature):
        '''
        Compute the information gain, used by ID3.
        :param baseEnt: empirical entropy of the original dataset
        :param dataSet: dataset of shape m*(n+1), where m is the number of samples and n the number of features
        :param feature: index of the feature used as the split point
        :return: information gain obtained by splitting on the given feature
        '''
        newEnt = 0
        values = np.unique(dataSet[:, feature])  # range of values the feature takes
        for value in values:
            splitData = self.split_dataSet(dataSet, feature, value)
            prob = splitData.shape[0] / dataSet.shape[0]
            newEnt += prob * self.cal_entroy(splitData)  # conditional entropy H(D|A) = sum(pi * H(Di)); each subset's entropy must be weighted by its probability
        return baseEnt - newEnt
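    # g(D, A) = H(D) - H(D|A). For example, on the toy dataset in test() below,
    # splitting on 'No Surfacing' gives H(D) ≈ 0.6730 and H(D|A) ≈ 0.3819,
    # so the gain is ≈ 0.2911 nats (≈ 0.420 bits, the familiar textbook value).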
    def cal_entroy_gain_rate(self, baseEnt, dataSet, feature):
        '''
        Compute the information gain ratio, used by C4.5.
        :param baseEnt: empirical entropy of the original dataset
        :param dataSet: dataset of shape m*(n+1), where m is the number of samples and n the number of features
        :param feature: index of the feature used as the split point
        :return: information gain ratio obtained by splitting on the given feature
        '''
        newEnt, splitEnt = 0, 0
        values = np.unique(dataSet[:, feature])
        for value in values:
            splitData = self.split_dataSet(dataSet, feature, value)
            prob = splitData.shape[0] / dataSet.shape[0]
            newEnt += prob * self.cal_entroy(splitData)  # weight each subset's entropy, as in cal_entroy_gain
            splitEnt -= prob * np.log(prob)
        if splitEnt == 0:  # the feature takes a single value, so splitting on it yields no information
            return 0
        return (baseEnt - newEnt) / splitEnt
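    # g_R(D, A) = g(D, A) / H_A(D), where H_A(D) = -sum(|Di|/|D| * ln(|Di|/|D|))
    # is the entropy of the dataset with respect to the feature's own values.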
    def cal_split_gini(self, dataSet, feature):
        '''
        Compute the minimum Gini index obtainable by splitting the dataset on each value of a feature,
        and return that Gini index together with the corresponding value. Used by the CART classification tree.
        :param dataSet: dataset of shape m*(n+1), where m is the number of samples and n the number of features
        :param feature: index of the feature used as the split point
        :return: minimum Gini index and the feature value that achieves it
        '''
        values = np.unique(dataSet[:, feature])
        minGini, minValue = np.inf, 0
        for value in values:
            left, right = self.split_dataSet(dataSet, feature, value)
            if left.shape[0] == 0 or right.shape[0] == 0:  # skip splits that leave one side empty
                continue
            newGini = left.shape[0] / dataSet.shape[0] * self.cal_gini(left) + right.shape[0] / dataSet.shape[0] * self.cal_gini(right)
            if newGini < minGini:
                minGini = newGini
                minValue = value
        return minGini, minValue
    def cal_split_se(self, dataSet, feature):
        '''
        Compute the minimum total squared error obtainable by splitting the dataset on each value of a feature,
        and return that error together with the corresponding value. Used by the CART regression tree.
        :param dataSet: dataset of shape m*(n+1), where m is the number of samples and n the number of features
        :param feature: index of the feature used as the split point
        :return: minimum total squared error and the feature value that achieves it
        '''
        values = np.unique(dataSet[:, feature])
        minSe, minValue = np.inf, 0
        for value in values:
            left, right = self.split_dataSet(dataSet, feature, value)
            if left.shape[0] == 0 or right.shape[0] == 0:  # skip splits that leave one side empty
                continue
            newSe = self.cal_se(left) + self.cal_se(right)
            if newSe < minSe:
                minSe = newSe
                minValue = value
        return minSe, minValue
    def choose_best_feature(self, dataSet):
        '''
        Select the feature that splits the data best, according to the criterion of each algorithm.
        :param dataSet: dataset of shape m*(n+1), where m is the number of samples and n the number of features
        :return: for ID3 and C4.5, the index of the best feature; for the CART classification and
                 regression trees, the index of the best feature and the corresponding feature value
        '''
        n = dataSet.shape[1] - 1
        baseEnt = self.cal_entroy(dataSet)
        bestScore, bestGain = np.inf, -np.inf  # the former is used by CART, the latter by ID3 and C4.5
        bestFeature, bestValue = -1, 0  # index and value of the best feature
        for feature in range(n):
            if self.algorithm == 'ID3':
                newGain = self.cal_entroy_gain(baseEnt, dataSet, feature)
                if newGain > bestGain:
                    bestFeature = feature
                    bestGain = newGain
            elif self.algorithm == 'C4.5':
                newGain = self.cal_entroy_gain_rate(baseEnt, dataSet, feature)
                if newGain > bestGain:
                    bestFeature = feature
                    bestGain = newGain
            elif self.algorithm == 'CARTcla':  # CART classification tree
                newGini, value = self.cal_split_gini(dataSet, feature)
                if newGini < bestScore:
                    bestScore = newGini
                    bestValue = value
                    bestFeature = feature
            else:  # CART regression tree
                newSe, value = self.cal_split_se(dataSet, feature)
                if newSe < bestScore:
                    bestScore = newSe
                    bestValue = value
                    bestFeature = feature
        if self.algorithm == 'ID3' or self.algorithm == 'C4.5':
            return bestFeature
        else:
            return bestFeature, bestValue
    def training(self, dataSet, featureLabel=None):
        '''
        Train the model, i.e. build the decision tree, using a dict as the tree structure.
        ID3 and C4.5 build N-ary trees; CART builds a binary tree.
        :param dataSet: dataset of shape m*(n+1), where m is the number of samples and n the number of features
        :param featureLabel: list mapping feature indices to their meanings; if not given, the column indices of the original data are used instead
        :return: the decision tree as a dict
        '''
        dataSet = np.array(dataSet)
        targets = dataSet[:, -1]
        if np.unique(targets).shape[0] == 1:  # only one class remains in the labels, return that class
            return targets[0]
        if dataSet.shape[1] == 1:  # no features left to split on, return the majority class
            return Counter(targets.tolist()).most_common(1)[0][0]
        if featureLabel is None: featureLabel = [i for i in range(dataSet.shape[1] - 1)]  # if no mapping is given, use the column indices of the original data
        if self.algorithm == 'ID3' or self.algorithm == 'C4.5':
            bestFeature = self.choose_best_feature(dataSet)  # index of the best feature to split on
            bestFeatureLabel = featureLabel[bestFeature]  # its meaning
            featureLabel_copy = featureLabel.copy()  # avoid modifying the caller's list
            featureLabel_copy.pop(bestFeature)  # this list is passed down to the subtrees, so remove the entry here (otherwise the indices would no longer line up with the remaining features)
            mytree = {bestFeatureLabel: {}}  # create the root node
            values = np.unique(dataSet[:, bestFeature])
            for value in values:  # create one subtree per value of the best feature
                sublabel = featureLabel_copy[:]  # feature-meaning list for the subtree
                mytree[bestFeatureLabel][value] = self.training(self.split_dataSet(dataSet, bestFeature, value), sublabel)
        else:
            bestFeature, bestValue = self.choose_best_feature(dataSet)
            if bestFeature == -1:  # no valid split left, return a leaf: the mean for regression, the majority class otherwise
                if self.algorithm == 'CARTreg':
                    return np.mean(dataSet[:, -1].astype(float))
                return Counter(targets.tolist()).most_common(1)[0][0]
            bestFeatureLabel = featureLabel[bestFeature]
            mytree = {}
            mytree['FeatLabel'] = bestFeatureLabel  # the feature chosen at this node
            mytree['FeatValue'] = bestValue  # the value of the chosen feature
            lSet, rSet = self.split_dataSet(dataSet, bestFeature, bestValue)
            mytree['left'] = self.training(lSet, featureLabel)  # build the left subtree
            mytree['right'] = self.training(rSet, featureLabel)  # build the right subtree
        return mytree
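    # Training the ID3 tree on the toy dataset in test() below should produce a
    # nested dict along the lines of
    # {'No Surfacing': {'0': 'no', '1': {'Flippers': {'0': 'no', '1': 'yes'}}}}
    # (the keys are strings because np.array turns the mixed-type rows into strings).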
    def predict(self, tree, testData, featureLabel=None):
        '''
        Predict a single sample with a trained decision tree. To predict a whole dataset,
        split it into single samples, predict each one, and combine the results.
        :param tree: the trained decision tree
        :param testData: sample to predict, a 1-D array of n feature values
        :return: the prediction
        '''
        if not isinstance(tree, dict):  # stopping condition: we reached a leaf node, return its value
            return tree
        if featureLabel is None: featureLabel = [i for i in range(len(testData))]  # assumes testData holds the feature values only, without a label column
        if self.algorithm == 'ID3' or self.algorithm == 'C4.5':
            bestFeatureLabel = list(tree.keys())[0]  # the feature meaning stored at this node
            bestFeature = featureLabel.index(bestFeatureLabel)  # index of that feature
            subTree = tree[bestFeatureLabel]  # subtrees keyed by feature value
            valueOffeat = subTree[testData[bestFeature]]  # follow the branch matching the sample's feature value
            return self.predict(valueOffeat, testData, featureLabel)
        else:
            bestFeatureLabel = tree['FeatLabel']
            bestFeature = featureLabel.index(bestFeatureLabel)
            if self.algorithm == 'CARTcla':  # CART classification tree
                if testData[bestFeature] == tree['FeatValue']:
                    return self.predict(tree['left'], testData, featureLabel)
                else:
                    return self.predict(tree['right'], testData, featureLabel)
            else:  # CART regression tree
                if testData[bestFeature] <= tree['FeatValue']:
                    return self.predict(tree['left'], testData, featureLabel)
                else:
                    return self.predict(tree['right'], testData, featureLabel)
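    # For example (ID3; the feature values must match the tree's string keys):
    # DT = DecisionTree('ID3')
    # tree = DT.training(dataSet, ['No Surfacing', 'Flippers'])
    # DT.predict(tree, np.array(['1', '1']), ['No Surfacing', 'Flippers'])  # -> 'yes'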
    def prune(self, tree, testData):
        '''
        Post-prune the generated tree (CART regression tree) using a test set.
        Assumes the tree was trained with the default featureLabel, so that FeatLabel is a column index.
        :param tree: the trained decision tree
        :param testData: test set of shape m*(n+1), including the label column
        :return: the pruned decision tree
        '''
        def istree(tree):  # check whether this node is a subtree
            return isinstance(tree, dict)
        def getmean(tree):  # collapse a subtree to the mean of all its leaf nodes
            if istree(tree['left']): tree['left'] = getmean(tree['left'])
            if istree(tree['right']): tree['right'] = getmean(tree['right'])
            return (tree['left'] + tree['right']) / 2
        if self.algorithm == 'CARTreg':
            if testData.shape[0] == 0:  # if the test set is empty, collapse the tree and return the mean of its leaves
                return getmean(tree)
            if istree(tree['left']) or istree(tree['right']):
                left, right = self.split_dataSet(testData, tree['FeatLabel'], tree['FeatValue'])
                if istree(tree['left']): tree['left'] = self.prune(tree['left'], left)  # prune the left subtree
                if istree(tree['right']): tree['right'] = self.prune(tree['right'], right)  # prune the right subtree
            if not istree(tree['left']) and not istree(tree['right']):  # both children are leaves
                left, right = self.split_dataSet(testData, tree['FeatLabel'], tree['FeatValue'])
                errorNomerge = np.sum(np.power(left[:, -1] - tree['left'], 2)) + np.sum(np.power(right[:, -1] - tree['right'], 2))
                treeMean = (tree['left'] + tree['right']) / 2
                errorMerge = np.sum(np.power(testData[:, -1] - treeMean, 2))
                if errorMerge <= errorNomerge:  # merge the two leaves only if that does not increase the error on the test data
                    return treeMean
                else: return tree
        return tree
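    # A minimal sketch of the intended use; holdout is hypothetical data in the
    # same m*(n+1) np.array format as the training set:
    # DT2 = DecisionTree('CARTreg')
    # tree2 = DT2.training(trainSet)
    # pruned = DT2.prune(tree2, holdout)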
def test():
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    dataSet2 = np.array([[1, 5.56], [2, 5.70], [3, 5.91], [4, 6.40], [5, 6.80],
                         [6, 7.05], [7, 8.90], [8, 8.70], [9, 9.00], [10, 9.05]])
    feature_label = ['No Surfacing', 'Flippers']
    DT = DecisionTree(algorithm='ID3')
    DT2 = DecisionTree(algorithm='CARTreg')
    print(DT.training(dataSet, feature_label))
    print(DT2.training(dataSet2))
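# Entry point so the smoke test runs when the file is executed directly.
if __name__ == '__main__':
    test()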