# 以下是我根据信息增益率原理设计的 C4.5 算法。在分类算法中,sklearn 并未实现 C4.5,只能借助其他框架或独立编程实现。
# --*-- coding:utf-8 --*--
import numpy as np
# Compute Shannon entropy
def cal_ent(dataset):
    """Return the Shannon entropy (base 2) of the class labels in *dataset*.

    dataset: M rows; the label is taken from the LAST element of each row,
    so this also yields the "split information" when rows are 1-element
    feature-value lists.
    Returns 0.0 for an empty dataset (the original crashed with
    ZeroDivisionError).
    """
    total = len(dataset)
    if total == 0:
        return 0.0
    # frequency of each label value
    label_count = {}
    for row in dataset:
        label = row[-1]
        label_count[label] = label_count.get(label, 0) + 1
    ent = 0.0
    for count in label_count.values():
        p = count / total
        ent -= p * np.log2(p)
    return ent
# Split the dataset on a specific feature and return the matching subset
def splitdataset(dataset, vec_order, val):
    """Return the rows of *dataset* whose feature at index *vec_order*
    equals *val*, with that feature column removed from each returned row.

    Always returns (new) lists for the rows, as the original extend-based
    version did.
    """
    return [
        list(row[:vec_order]) + list(row[vec_order + 1:])
        for row in dataset
        if row[vec_order] == val
    ]
# Extract all feature columns
def get_feat(dataset):
    """Transpose *dataset* into per-column lists.

    Each column is returned as a list of one-element lists ([[v1], [v2], ...])
    so that cal_ent can be applied directly to a single column.
    Returns [] for an empty dataset (the original raised IndexError).
    """
    if not dataset:
        return []
    return [[[row[i]] for row in dataset] for i in range(len(dataset[0]))]
# Extract a single feature column
def get_feat0(dataset, i):
    """Return column *i* of *dataset* as a flat list of values."""
    return [row[i] for row in dataset]
# Select the best splitting feature (by information gain ratio):
def get_bestfeature(dataset):
    """Return the index of the feature with the highest gain ratio (C4.5).

    gain_ratio(A) = (H(D) - H(D|A)) / IV(A), where IV(A) is the entropy of
    feature A's own value distribution ("split information").

    Fixes vs. the original: a feature whose column holds a single value has
    IV(A) == 0 and caused ZeroDivisionError — such features are now skipped,
    as C4.5 requires; the unused per-column entropy list and the misspelled
    `best_vate` local are gone.
    """
    n = len(dataset)
    base_ent = cal_ent(dataset)  # H(D): entropy of the class labels
    best_ratio = 0.0
    best_feat = 0
    for i in range(len(dataset[0]) - 1):  # last column is the label
        column = get_feat0(dataset, i)
        # H(D|A): entropy of the subsets weighted by their size
        cond_ent = 0.0
        for val in set(column):
            subset = splitdataset(dataset, i, val)
            cond_ent += (len(subset) / n) * cal_ent(subset)
        # IV(A): entropy of the feature's value distribution
        split_info = cal_ent([[v] for v in column])
        if split_info == 0.0:
            continue  # single-valued feature: gain ratio undefined, skip
        ratio = (base_ent - cond_ent) / split_info
        if ratio > best_ratio:
            best_ratio = ratio
            best_feat = i
    return best_feat
def c45(dataset):
    """Recursively build a C4.5 decision tree from *dataset*.

    Returns {'node': ..., 'son_tree': {...}} where:
      - a leaf has 'node' = class label and an empty 'son_tree';
      - an internal node has 'node' = index of the splitting feature and
        'son_tree' maps each feature value to the subtree built from the
        corresponding split.

    Fixes vs. the original: the recursion previously iterated over *columns*
    (get_feat) instead of the best feature's value splits, overwrote
    'son_tree' on every iteration, and recursed forever on non-uniform data;
    the "no features left" base case (majority vote) was also missing.
    """
    mytree = {'node': '', 'son_tree': {}}
    labels = get_feat0(dataset, -1)
    if len(set(labels)) == 1:
        # All samples share one class: return a leaf with that label
        mytree['node'] = labels[0]
    elif len(dataset[0]) == 1:
        # Only the label column remains: leaf with the majority class
        mytree['node'] = max(set(labels), key=labels.count)
    else:
        # Split on the best feature and recurse on each value's subset
        best_feat = get_bestfeature(dataset)
        mytree['node'] = best_feat
        for val in set(get_feat0(dataset, best_feat)):
            subset = splitdataset(dataset, best_feat, val)
            mytree['son_tree'][val] = c45(subset)
    return mytree