The original code is at: github.com/Erikfather/…
Interpretation of the common code used by ID3:
Information entropy
The formula for information entropy is:
Ent(D) = -Σ_k p_k * log2(p_k)
Take the dataset, pull out the class label of every sample, and count each class. Suppose 5 samples fall in class 0 and 7 in class 1; the information entropy is then
Ent = -(5/12) * log2(5/12) - (7/12) * log2(7/12) ≈ 0.980
from math import log

# Compute the information entropy of a dataset
def cal_entropy(dataset):
    numEntries = len(dataset)
    labelCounts = {}
    # build a dictionary that counts every possible class
    for featVec in dataset:
        currentlabel = featVec[-1]
        if currentlabel not in labelCounts.keys():
            labelCounts[currentlabel] = 0
        labelCounts[currentlabel] += 1
    Ent = 0.0
    for key in labelCounts:
        p = float(labelCounts[key]) / numEntries
        Ent = Ent - p * log(p, 2)  # logarithm base 2
    return Ent
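As a quick sanity check (the 12 toy rows below are invented; only the last column, the class label, matters to cal_entropy):

toy = [[0, '0']] * 5 + [[0, '1']] * 7   # 5 samples of class '0', 7 of class '1'
print(cal_entropy(toy))                 # ~0.980, matching the hand calculation above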
Splitting the dataset
Suppose a feature takes the values 0, 1, and 2. splitdataset pulls out every row whose value for that feature equals the requested value (say 0), removes that feature's column, and returns the reduced dataset.
# Split the dataset on feature `axis` equal to `value`
def splitdataset(dataset, axis, value):
    retdataset = []  # the list of rows to return
    for featVec in dataset:  # pick out the rows that match the split value
        if featVec[axis] == value:
            reducedfeatVec = featVec[:axis]  # drop the axis feature...
            reducedfeatVec.extend(featVec[axis + 1:])  # ...and keep the columns after it
            retdataset.append(reducedfeatVec)  # collect the matching, reduced row
    return retdataset
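For example (rows invented for illustration):

rows = [[0, 'x', '1'], [1, 'y', '0'], [0, 'z', '1']]
print(splitdataset(rows, 0, 0))   # [['x', '1'], ['z', '1']] -- feature 0 matched and removed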
The original ID3 flow (the calculation process)
The full original code is in the repository linked at the top.
Picking the best feature
# ID3 algorithm
def ID3_chooseBestFeatureToSplit(dataset):
    # number of features (the last column is the class label)
    numFeatures = len(dataset[0]) - 1
    # base entropy of the whole dataset (used below to compute information gain)
    baseEnt = cal_entropy(dataset)
    # ID3 selects the split attribute by information gain
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):  # iterate over all features
        # collect every value this feature takes (deduplicated below)
        featList = [example[i] for example in dataset]
        uniqueVals = set(featList)  # a set keeps each feature value once: the unique split values
        newEnt = 0.0
        # split on each value and accumulate the weighted entropy of the subsets
        for value in uniqueVals:  # entropy of each possible split
            subdataset = splitdataset(dataset, i, value)
            p = len(subdataset) / float(len(dataset))
            newEnt += p * cal_entropy(subdataset)
        # information gain of splitting on this feature
        infoGain = baseEnt - newEnt
        print(u"ID3: information gain of feature %d is %.3f" % (i, infoGain))
        # keep the best feature seen so far
        if (infoGain > bestInfoGain):
            bestInfoGain = infoGain  # record the best gain
            bestFeature = i
    # return the feature with the highest information gain
    return bestFeature
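A quick run on made-up data (each row is [feature 0, feature 1, class label]); feature 0 separates the two classes perfectly, so it wins:

data = [[0, 0, '0'], [0, 1, '0'], [1, 0, '1'], [1, 1, '1'], [0, 0, '0']]
print(ID3_chooseBestFeatureToSplit(data))   # prints the per-feature gains, returns 0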
Building the tree (control flow)
----------------------------------------------------------
The tree is built recursively: each call creates one node, then recurses to build its subtrees.
----------------------------------------------------------
from collections import Counter

# Build a decision tree with the ID3 algorithm.
# pre_pruning and post_pruning are boolean switches defined elsewhere in the
# original code.
def ID3_createTree(dataset, labels, test_dataset):
    # first collect the class label of every sample
    classList = [example[-1] for example in dataset]
    # stopping conditions for the recursion: every label in the subset is the
    # same, or only the label column is left (all features used up)
    if classList.count(classList[0]) == len(classList):
        # all classes identical: stop splitting
        return classList[0]
    if len(dataset[0]) == 1:
        # all features exhausted: return the most common class
        return majorityCnt(classList)
    # pick the best feature (returns its index)
    bestFeat = ID3_chooseBestFeatureToSplit(dataset)
    # map the index to its human-readable label
    bestFeatLabel = labels[bestFeat]
    print(u"current best feature: " + (bestFeatLabel))
    # initialise the tree structure (a nested dict)
    ID3Tree = {bestFeatLabel: {}}
    # remove the chosen feature's label from labels
    del (labels[bestFeat])
    # list of all values this node's feature takes
    featValues = [example[bestFeat] for example in dataset]
    uniqueVals = set(featValues)
    # pre-pruning, done before growing the subtrees
    if pre_pruning:
        ans = []
        for index in range(len(test_dataset)):
            ans.append(test_dataset[index][-1])
        result_counter = Counter()
        for vec in dataset:
            result_counter[vec[-1]] += 1
        # result_counter.most_common(1) -> e.g. [(1, 9)]: the most frequent label
        leaf_output = result_counter.most_common(1)[0][0]
        # baseline accuracy: predict leaf_output for the entire test set
        root_acc = cal_acc(test_output=[leaf_output] * len(test_dataset), label=ans)
        outputs = []
        ans = []
        # After splitting on each value of the chosen feature, check whether the
        # accuracy goes up. The majority label of the training rows under each
        # branch becomes that branch's class; the test rows falling under the
        # branch are all predicted as that class, and the resulting overall
        # accuracy is compared against root_acc.
        for value in uniqueVals:
            cut_testset = splitdataset(test_dataset, bestFeat, value)
            cut_dataset = splitdataset(dataset, bestFeat, value)
            for vec in cut_testset:
                ans.append(vec[-1])
            result_counter = Counter()
            for vec in cut_dataset:
                result_counter[vec[-1]] += 1
            leaf_output = result_counter.most_common(1)[0][0]
            outputs += [leaf_output] * len(cut_testset)
        cut_acc = cal_acc(test_output=outputs, label=ans)
        # if splitting on this feature improves accuracy, keep the split;
        # otherwise return a leaf node and do not split at all
        if cut_acc <= root_acc:
            return leaf_output
    # the main body: after branching on the chosen feature, recursively build a
    # decision tree on each subset
    for value in uniqueVals:
        subLabels = labels[:]
        ID3Tree[bestFeatLabel][value] = ID3_createTree(
            splitdataset(dataset, bestFeat, value),
            subLabels,
            splitdataset(test_dataset, bestFeat, value))
    # post-pruning, done after the subtrees are built
    if post_pruning:
        # the hard-coded featLabels are the feature names of the loan dataset
        # used in the original repository (age group, has a job, owns a house,
        # credit rating)
        tree_output = classifytest(ID3Tree,
                                   featLabels=['年龄段', '有工作', '有自己的房子', '信贷情况'],
                                   testDataSet=test_dataset)
        ans = []
        for vec in test_dataset:
            ans.append(vec[-1])
        root_acc = cal_acc(tree_output, ans)
        result_counter = Counter()
        for vec in dataset:
            result_counter[vec[-1]] += 1
        leaf_output = result_counter.most_common(1)[0][0]
        cut_acc = cal_acc([leaf_output] * len(test_dataset), ans)
        if cut_acc >= root_acc:
            return leaf_output
    return ID3Tree
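ID3_createTree calls majorityCnt, which this excerpt never defines. A minimal sketch consistent with how it is used here (the implementation in the original repository may differ):

from collections import Counter

# Hypothetical stand-in for the missing helper: the most frequent label wins
def majorityCnt(classList):
    return Counter(classList).most_common(1)[0][0]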
Helper functions
# Run a single sample through the decision tree we built and return the
# predicted label for that sample
def classify(inputTree, featLabels, testVec):
    """
    Input: decision tree, feature labels, one test vector
    Output: the predicted class
    Description: runs the decision tree
    """
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    classLabel = '0'  # default label, returned if no branch matches the sample
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if type(secondDict[key]).__name__ == 'dict':
                # internal node: keep descending
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                # leaf node: its value is the prediction
                classLabel = secondDict[key]
    return classLabel
# The same, over a whole test set
def classifytest(inputTree, featLabels, testDataSet):
    """
    Input: decision tree, feature labels, test dataset
    Output: the predicted classes
    Description: runs the decision tree
    """
    classLabelAll = []
    for testVec in testDataSet:
        classLabelAll.append(classify(inputTree, featLabels, testVec))
    return classLabelAll
# Accuracy of the predicted labels against the true labels of the test data
def cal_acc(test_output, label):
    """
    :param test_output: predictions for the test set
    :param label: the true labels
    :return: the accuracy of the predictions
    """
    assert len(test_output) == len(label)
    count = 0
    for index in range(len(test_output)):
        if test_output[index] == label[index]:
            count += 1
    return count / float(len(test_output))  # float() so Python 2 doesn't truncate to 0
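Putting the pieces together. Everything below is made up for illustration: the four training rows, the two test rows, and the feature names feat_a / feat_b are hypothetical, and both pruning switches are off so the hard-coded featLabels in the post-pruning branch are never consulted:

pre_pruning = False    # the tree builder reads these module-level switches
post_pruning = False

train = [[0, 0, '0'], [0, 1, '1'], [1, 0, '1'], [1, 1, '1']]
test = [[0, 0, '0'], [1, 1, '1']]
labels = ['feat_a', 'feat_b']          # hypothetical feature names

tree = ID3_createTree(train, labels[:], test)   # pass a copy: the builder mutates labels
print(tree)                            # {'feat_a': {0: {'feat_b': ...}, 1: '1'}}
preds = classifytest(tree, ['feat_a', 'feat_b'], test)
print(cal_acc(preds, [row[-1] for row in test]))  # 1.0 on this toy data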
Interpretation of the common code used by C4.5:
C4.5 scores features by gain ratio rather than raw information gain: Gain_ratio(D, a) = Gain(D, a) / IV(a), where the intrinsic value IV(a) = -Σ_v (|D^v| / |D|) * log2(|D^v| / |D|) penalises features with many values.
# C4.5: selects the split attribute by "gain ratio"; otherwise the flow is the
# same as ID3
def C45_chooseBestFeatureToSplit(dataset):
    numFeatures = len(dataset[0]) - 1
    baseEnt = cal_entropy(dataset)
    bestInfoGain_ratio = 0.0
    bestFeature = -1
    for i in range(numFeatures):  # iterate over all features
        featList = [example[i] for example in dataset]
        uniqueVals = set(featList)  # a set keeps each feature value once: the unique split values
        newEnt = 0.0
        IV = 0.0
        for value in uniqueVals:  # entropy of each possible split
            subdataset = splitdataset(dataset, i, value)
            p = len(subdataset) / float(len(dataset))
            newEnt += p * cal_entropy(subdataset)
            IV = IV - p * log(p, 2)  # intrinsic value of the feature
        infoGain = baseEnt - newEnt
        if (IV == 0):  # the feature takes a single value: skip it to avoid dividing by zero
            continue
        # the one extra step over ID3: divide the gain by the intrinsic value
        infoGain_ratio = infoGain / IV  # this feature's gain ratio
        print(u"C4.5: gain ratio of feature %d is %.3f" % (i, infoGain_ratio))
        if (infoGain_ratio > bestInfoGain_ratio):  # keep the largest gain ratio
            bestInfoGain_ratio = infoGain_ratio
            bestFeature = i  # the feature with the largest gain ratio
    return bestFeature
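A quick run on the same made-up rows as in the ID3 example; feature 0 splits the classes perfectly, so its gain ratio is the largest:

data = [[0, 0, '0'], [0, 1, '0'], [1, 0, '1'], [1, 1, '1'], [0, 0, '0']]
print(C45_chooseBestFeatureToSplit(data))   # prints the per-feature gain ratios, returns 0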
Interpretation of the common code used by CART:
CART picks the feature that minimises the Gini index. For a subset D,
Gini(D) = 1 - Σ_k p_k²
and a split of D on feature a is scored as Gini_index(D, a) = Σ_v (|D^v| / |D|) * Gini(D^v).
# CART: selects the best feature by the Gini index
def CART_chooseBestFeatureToSplit(dataset):
    numFeatures = len(dataset[0]) - 1
    bestGini = 999999.0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataset]
        uniqueVals = set(featList)
        gini = 0.0
        for value in uniqueVals:
            subdataset = splitdataset(dataset, i, value)
            p = len(subdataset) / float(len(dataset))
            # fraction of the subset whose class label (last column) is '0';
            # this assumes a binary problem with labels '0' and '1'
            subp = len(splitdataset(subdataset, -1, '0')) / float(len(subdataset))
            gini += p * (1.0 - pow(subp, 2) - pow(1 - subp, 2))
        print(u"CART: Gini index of feature %d is %.3f" % (i, gini))
        if (gini < bestGini):  # keep the smallest Gini index
            bestGini = gini
            bestFeature = i
    return bestFeature
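A quick run on made-up rows. The subp line above counts the samples whose class label equals the string '0', so this sketch assumes a binary problem whose labels are the strings '0' and '1':

data = [['0', 0, '0'], ['0', 1, '0'], ['1', 0, '1'], ['1', 1, '1']]
print(CART_chooseBestFeatureToSplit(data))   # feature 0 gives a pure split (Gini 0), returns 0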