Decision tree: continuous-value processing code

The dataset is watermelon dataset 3.0, in which density and sugar content are continuous features. Explanatory notes are written as comments in the code.
The approach follows the post "Decision Tree (3): Continuous Value Processing"; thanks to its author.

import numpy as np
from math import log
from collections import Counter
# Load the dataset (watermelon dataset 3.0)
def get_data():
    # Feature matrix
    feat_matrix = np.array([
        ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.697, 0.460],
        ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', 0.774, 0.376],
        ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.634, 0.264],
        ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', 0.608, 0.318],
        ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.556, 0.215],
        ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', 0.403, 0.237],
        ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', 0.481, 0.149],
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑', 0.437, 0.211],
        ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', 0.666, 0.091],
        ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', 0.243, 0.267],
        ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', 0.245, 0.057],
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', 0.343, 0.099],
        ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', 0.639, 0.161],
        ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', 0.657, 0.198],
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', 0.360, 0.370],
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', 0.593, 0.042],
        ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', 0.719, 0.103]
    ])

    '''Note: all elements of a numpy array share one dtype, so in the matrix above
    the values of the continuous features (density, sugar content) are stored as
    'numpy.str_', which makes every numeric operation on them awkward.
    Workaround: encode the discrete feature values numerically, numbering each
    feature's distinct values with integers >= 1 in order of first appearance.
    For the color feature, for example, '青绿' becomes 1, '乌黑' becomes 2, ...'''
    values_size = feat_matrix.shape[0]
    for discretefeat_idx in range(6):  # the first 6 features are discrete
        dict_value_encode = {}  # maps each raw feature value to its integer code
        encode = 0
        for value in feat_matrix[:,discretefeat_idx]:
            if value not in dict_value_encode.keys():
                encode += 1
                dict_value_encode[value] = encode
        for value_idx in range(values_size):
            feat_matrix[value_idx][discretefeat_idx] = dict_value_encode[feat_matrix[value_idx][discretefeat_idx]]
    feat_matrix = feat_matrix.astype(np.float64)
    # Class labels
    labels = np.array(['是', '是', '是', '是', '是', '是', '是', '是',
                       '否', '否', '否', '否', '否', '否', '否', '否', '否'])
    # Feature names
    feat_names = np.array(['色泽', '根蒂', '敲击', '纹理', '脐部', '触感','密度','含糖率'])
    return feat_matrix,labels,feat_names
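# A quick check of the encoding above (illustrative; the values follow from the data):
#   feat_matrix, labels, feat_names = get_data()
#   feat_matrix[0]     # -> [1. 1. 1. 1. 1. 1. 0.697 0.46] (first-seen values encode to 1)
#   feat_matrix.dtype  # -> float64, so the continuous columns compare numerically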

# Compute the empirical entropy of a label array
def cal_entropy(x):
    x_set = set(x)
    x_size = x.shape[0]
    entropy = 0.0
    for label in x_set:
        p = np.count_nonzero(x == label) / x_size
        entropy -= p * log(p,2)
    return entropy
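# Sanity check (illustrative): the full label set below holds 8 '是' and 9 '否', so
# the entropy is -(8/17)*log2(8/17) - (9/17)*log2(9/17) ≈ 0.998, e.g.:
#   cal_entropy(np.array(['是']*8 + ['否']*9))  # -> 0.9975...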

# Conditional entropy of the labels given a selected (discrete) feature
def cal_conditionalentropy(feat_values,labels):
    values_set = set(feat_values)
    values_size = feat_values.shape[0]
    c_entropy = 0.0
    for value in values_set:
        p = np.count_nonzero(feat_values == value)/values_size
        c_entropy += p * cal_entropy(labels[feat_values == value])
    return c_entropy
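# Illustrative: a feature whose values split the labels into pure groups gives
# conditional entropy 0, since each group's own entropy is 0:
#   cal_conditionalentropy(np.array([1]*8 + [2]*9), np.array(['是']*8 + ['否']*9))  # -> 0.0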

# Minimum conditional entropy of a continuous feature, which yields its maximum
# information gain; the approach follows the blog post cited above.
# Outline:
'''
Suppose the continuous feature takes n values; sort them in ascending order.
Bi-partition: try the n-1 candidate thresholds (each the midpoint of two adjacent
sorted values), compute the conditional entropy of each split, and keep the
smallest, which corresponds to the largest information gain.
Returns two values: the minimum conditional entropy and the chosen threshold.
'''
def cal_min_conditionalentropy(feat_values,labels):
    values_size = feat_values.shape[0]
    # Sort the feature values ascending and reorder the labels to match.
    # (np.argsort keeps samples with duplicate values; the original dict(zip(...))
    # trick silently dropped all but one of any repeated feature value.)
    sorted_idx = np.argsort(feat_values)
    feat_values_sorted = feat_values[sorted_idx]  # feature values after sorting
    labels_sorted = labels[sorted_idx]  # class labels after sorting
    thresholds = [(feat_values_sorted[idx]+feat_values_sorted[idx+1])/2  # n values leave n-1 gaps, giving n-1 candidate thresholds
                  for idx in range(values_size-1)]
    min_c_entropy = float('inf')
    min_c_entropy_threshold = (feat_values_sorted[0] + feat_values_sorted[1])/2  # initialize with the first gap's midpoint
    for threshold in thresholds:
        filter_left = feat_values_sorted <= threshold  # samples at or below the threshold
        labels_left = labels_sorted[filter_left]
        filter_right = feat_values_sorted > threshold  # samples above the threshold
        labels_right = labels_sorted[filter_right]
        c_entropy = labels_left.shape[0]/values_size*cal_entropy(labels_left) +\
                    labels_right.shape[0]/values_size*cal_entropy(labels_right)
        if c_entropy <= min_c_entropy:
            min_c_entropy = c_entropy
            min_c_entropy_threshold = threshold
    return min_c_entropy,min_c_entropy_threshold  # the minimum conditional entropy and its threshold
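# Usage sketch (illustrative; the numbers follow the textbook's worked example for
# watermelon 3.0, where density's best threshold is 0.381 with gain ≈ 0.262):
#   fm, lb, _ = get_data()
#   cal_min_conditionalentropy(fm[:, 6], lb)  # density column -> threshold = (0.360+0.403)/2 = 0.3815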

# Information gain for a selected feature
def cal_info_gain(feat_values,labels):
    # Discrete features were encoded as integers >= 1 in get_data(), while both
    # continuous features happen to take only values < 1 in this dataset, so the
    # first value is enough to tell the two kinds apart.
    if feat_values[0].item()>=1:
        return cal_entropy(labels) - cal_conditionalentropy(feat_values,labels),'discrete'
    # Continuous feature: use the best bi-partition threshold
    else:
        min_c_entropy, min_c_entropy_threshold = cal_min_conditionalentropy(feat_values,labels)
        return cal_entropy(labels) - min_c_entropy,min_c_entropy_threshold


# Information gain ratio for a selected feature
# (cal_info_gain returns a (gain, sign) tuple, so unpack it first; the 0.01
# terms smooth the ratio and guard against a zero denominator)
def cal_info_gain_ratio(feat_values,labels):
    info_gain, _ = cal_info_gain(feat_values,labels)
    return (info_gain + 0.01)/(cal_entropy(feat_values)+0.01)


# When the second stopping condition of tree building is met,
# return the class with the most instances
def get_max_label(labels):
    return Counter(labels).most_common(1)[0][0]
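#   e.g. get_max_label(np.array(['否', '是', '否']))  # -> '否', the majority class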

# Pick the feature with the largest information gain (or gain ratio)
def get_best_feat(feat_matrix,labels):
    feat_num = feat_matrix.shape[1]
    best_feat_idx = -1
    max_info_gain = 0.0
    ret_sign = 'discrete'  # default to a discrete split
    for feat_idx in range(feat_num):
        feat_values = feat_matrix[:,feat_idx]
        info_gain,sign = cal_info_gain(feat_values,labels)
        if info_gain >= max_info_gain:
            max_info_gain = info_gain
            best_feat_idx = feat_idx
            ret_sign = sign
    return best_feat_idx,ret_sign

# Split the data into subsets on the chosen feature
def get_subset(feat_matrix,labels,best_feat,sign):
    feat_values = feat_matrix[:,best_feat]
    if sign == 'discrete':
        values_set = set(feat_values)
        feat_matrix = np.delete(feat_matrix,best_feat,1)  # drop the used discrete feature's column
        feat_matrixset = {}
        labelsset = {}
        for value in values_set:
            feat_matrixset[value] = feat_matrix[feat_values==value]
            labelsset[value] = labels[feat_values==value]
    # continuous feature: sign carries the split threshold
    else:
        threshold = sign
        feat_matrixset = {}
        labelsset = {}
        # left of the threshold
        filter_left = feat_values <= threshold
        feat_matrixset['<={}'.format(threshold)] = feat_matrix[filter_left]
        labelsset['<={}'.format(threshold)] = labels[filter_left]
        # right of the threshold
        filter_right = feat_values > threshold
        feat_matrixset['>{}'.format(threshold)] = feat_matrix[filter_right]
        labelsset['>{}'.format(threshold)] = labels[filter_right]
    return feat_matrixset,labelsset
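# Illustrative: after a continuous split the returned dicts are keyed by interval,
# e.g. '<=0.3815' and '>0.3815'; after a discrete split the keys are the encoded
# feature values (1.0, 2.0, ...) and the used column is gone from each subset.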

"""
introduction:
    生成一棵决策树
parameter:
    feat_matrix:特征矩阵
    labels:类别标签
    feat_names:特征名称
    method:选择方法(信息增益、信息增益比)
"""
def create_decision_tree(feat_matrix,labels,feat_names,method):
    # Two stopping conditions: 1. only one class label remains
    # 2. no candidate features are left to split on
    if len(set(labels)) == 1:
        return labels[0]  # a numpy.str_ ('是' or '否')
    if feat_matrix.shape[1] == 0:  # shape[1] counts the remaining feature columns
        return get_max_label(labels)
    # Pick the feature with the largest information gain; sign marks whether the
    # feature is discrete or continuous (if continuous, sign is the threshold)
    best_feat,sign = get_best_feat(feat_matrix,labels)
    best_feat_name = feat_names[best_feat]
    # initialize the (sub)tree
    decision_tree = {best_feat_name: {}}
    # A discrete feature is deleted once used; a continuous one is kept, since it
    # can be split again with a different threshold deeper in the tree
    if sign == 'discrete':
        feat_names = np.delete(feat_names,best_feat)
    # Get the subsets (as dicts keyed by the chosen feature's values/intervals)
    feat_matrixset, labelsset = get_subset(feat_matrix,labels,best_feat,sign)
    # recursively build the subtrees
    for value in feat_matrixset.keys():
        decision_tree[best_feat_name][value] = create_decision_tree(feat_matrixset[value],labelsset[value],feat_names,method)
    return decision_tree
if __name__ == "__main__":
    feat_matrix,labels,feat_names = get_data()
    decision_tree = create_decision_tree(feat_matrix,labels,feat_names,method='ID3')
    print(decision_tree)
    # createPlot(decision_tree)  # plotting helper from the original post, not included here
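For reference, on watermelon dataset 3.0 the information gain at the root is largest for 纹理 (texture, roughly 0.381 in the textbook's worked example), so the printed tree should start with a 纹理 split, with the density and sugar-content thresholds appearing deeper down.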

The final visualization result: (figure: plot of the generated decision tree, omitted here)

Origin: blog.csdn.net/jokerxsy/article/details/105920168