Zhou Zhihua's "Machine Learning" (Watermelon Book), Beginner's Python Study Notes (4): Chapter 4 Decision Trees, Python Code with Pre-Pruning

Based on Watermelon Dataset 2.0 (extraction code: esa8), we build a decision tree using information gain as the attribute-selection criterion.
The steps are as follows:
First, define the value sets of the discrete attributes and the class labels, then read in the data.

import math
import pandas as pd

# Value set of each discrete attribute
D_keys = {
    '色泽': ['青绿', '乌黑', '浅白'],
    '根蒂': ['蜷缩', '硬挺', '稍蜷'],
    '敲声': ['清脆', '沉闷', '浊响'],
    '纹理': ['稍糊', '模糊', '清晰'],
    '脐部': ['凹陷', '稍凹', '平坦'],
    '触感': ['软粘', '硬滑'],
}
# Class labels of the target column '好瓜' (good melon: yes / no)
keys = ['是', '否']
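
The original post never shows loadData, which the main routine at the end calls. A minimal sketch, assuming the dataset file is tab-separated text with a header row (adjust sep and encoding to your copy):

def loadData(filename):
    # Assumption: tab-separated file whose header row holds the attribute
    # names ('色泽', ..., '好瓜'); the original post does not show this function
    return pd.read_csv(filename, sep='\t', encoding='utf-8')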

Using the hold-out method, split the data into two disjoint sets, training and test, at a 7:3 ratio. To keep the proportion of good and bad melons the same in both sets, use stratified sampling: draw the samples separately from the good-melon subset and the bad-melon subset.

# Hold-out split into training and test sets (7:3), stratified by class
def traintest(dataSet):
    good = dataSet[dataSet['好瓜'] == '是']
    bad = dataSet[dataSet['好瓜'] == '否']
    # Sample 70% of each class for training, so both sets keep the class ratio
    # (DataFrame.append was removed in pandas 2.0, so use pd.concat instead)
    train = pd.concat([good.sample(frac=0.7), bad.sample(frac=0.7)])
    # The remaining rows form the test set
    test = dataSet.drop(train.index)
    return train, test
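
Note that sample(frac=0.7) draws a fresh random split on every run, so the final accuracy varies between runs. For a reproducible split, pandas' random_state seed can be threaded through the two class-wise draws inside traintest:

# Reproducible variant of the two draws inside traintest
train = pd.concat([good.sample(frac=0.7, random_state=42),
                   bad.sample(frac=0.7, random_state=42)])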

Next, write the helper functions used while generating the tree:

# A leaf node is labeled with the majority class among the samples in D
def choose_largest_example(D):
    count = D['好瓜'].value_counts()
    return '是' if count.get('是', 0) > count.get('否', 0) else '否'

# Measure the tree's accuracy on the test set
def test_Tree(Tree, data_test):
    accuracy = 0
    for index, row in data_test.iterrows():
        result = dfs_Tree(Tree, row)
        if result == row['好瓜']:
            accuracy += 1

    return accuracy / data_test.shape[0]
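
A bare class label is itself a valid single-leaf tree here, since dfs_Tree below returns it unchanged for every row; the pre-pruning step exploits this to score the "do not split" option. For example, once data_test exists:

# Score the single-leaf tree that always predicts '是': the result is
# simply the fraction of positive samples in data_test
print(test_Tree('是', data_test))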

# Check whether all samples in D take the same value on every attribute in A
def same_value(D, A):
    for key in A:
        if key in D_keys and len(D[key].value_counts()) > 1:
            return False

    return True


# Compute the information entropy of a dataset; the 'power' column holds
# per-sample weights (all 1.0 unless missing values have been down-weighted)
def calc_Ent(dataSet):
    numEntries = dataSet['power'].sum()
    Count = dataSet.groupby('好瓜')['power'].sum()
    Ent = 0.0

    for key in keys:
        # A class that does not appear contributes 0 to the entropy
        if key in Count:
            prob = Count[key] / numEntries
            Ent -= prob * math.log(prob, 2)

    return Ent
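
This implements Eq. (4.1) of the book, Ent(D) = -Σ_k p_k log2(p_k), over sample weights rather than raw counts. A quick sanity check on a toy set with the same class ratio as Watermelon Dataset 2.0 (8 positive, 9 negative samples):

# Entropy of an 8-positive / 9-negative set; the book computes 0.998
demo = pd.DataFrame({'好瓜': ['是'] * 8 + ['否'] * 9, 'power': 1.0})
print(round(calc_Ent(demo), 3))  # 0.998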


# Information gain of splitting D on the discrete attribute `key`.
# D_no_nan holds the rows whose value on `key` is not missing; the gain
# computed on D_no_nan is then scaled by its weight share of D
def calc_Gain_D(D, D_no_nan, key, Ent_D):
    Ent = 0.0
    D_size = D['power'].sum()
    D_no_nan_size = D_no_nan['power'].sum()
    for value in D_keys[key]:
        Dv = D.loc[D[key] == value]
        Dv_size = Dv['power'].sum()
        Ent += Dv_size / D_no_nan_size * calc_Ent(Dv)

    # rho = D_no_nan_size / D_size penalizes attributes with many missing values
    return D_no_nan_size / D_size * (Ent_D - Ent)
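
This is the missing-value form of information gain from Section 4.4 of the book: Gain(D, a) = ρ × ( Ent(D̃) - Σ_v (|D̃^v| / |D̃|) · Ent(D̃^v) ), where D̃ is the subset of D with no missing value on attribute a and ρ is its weight share of D. When nothing is missing, ρ = 1 and this reduces to the ordinary gain of Eq. (4.2).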


# Generate candidate split points T for a continuous attribute:
# sort the distinct values, then average every group of n consecutive values
def candidate_T(D, key, n):
    L = sorted(set(D[key].dropna()))  # set() is unordered, so sort first
    T = []
    a, Sum = 0, 0
    for value in L:
        Sum += value
        a += 1
        if a == n:
            T.append(Sum / n)
            a, Sum = 0, 0

    if a > 0:
        T.append(Sum / a)

    return T
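
A quick check with hypothetical density values ('密度' is the continuous attribute of Watermelon Dataset 3.0; Dataset 2.0 itself is all-discrete, so this branch exists for generality):

demo = pd.DataFrame({'密度': [0.2, 0.4, 0.6, 0.8]})
print(candidate_T(demo, '密度', 2))  # [0.3, 0.7], up to float rounding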


# Information gain of the binary split of D at threshold t on continuous `key`
def calc_Gain_t(D, D_no_nan, key, t, Ent_D):
    Ent = 0.0
    D_size = D['power'].sum()
    D_no_nan_size = D_no_nan['power'].sum()

    # Weighted entropy of the two halves D[key] <= t and D[key] > t
    for Dv in (D.loc[D[key] <= t], D.loc[D[key] > t]):
        Dv_size = Dv['power'].sum()
        Ent += Dv_size / D_no_nan_size * calc_Ent(Dv)

    return D_no_nan_size / D_size * (Ent_D - Ent)


# Try every candidate split point t and keep the one with the largest gain
def calc_Gain_C(D, D_no_nan, key, Ent_D):
    n = 2
    T = candidate_T(D, key, n)
    max_Gain, max_partition = -1, -1
    for t in T:
        Gain = calc_Gain_t(D, D_no_nan, key, t, Ent_D)
        if max_Gain < Gain:
            max_Gain = Gain
            max_partition = t

    return max_Gain, max_partition


# Choose the best split attribute from A; for a continuous attribute,
# also return the chosen split point
def choose_best_attribute(D, A):
    max_Gain, max_partition, partition, best_attr = -1, -1, -1, ''
    for key in A:
        # Gains are computed on the rows where `key` is not missing, then
        # scaled by their weight share, so sparse attributes are penalized
        D_no_nan = D.loc[pd.notna(D[key])]
        Ent_D = calc_Ent(D_no_nan)
        if key in D_keys:
            # Discrete split attribute
            Gain = calc_Gain_D(D, D_no_nan, key, Ent_D)
        else:
            # Continuous split attribute
            Gain, partition = calc_Gain_C(D, D_no_nan, key, Ent_D)

        if max_Gain < Gain:
            best_attr = key
            max_Gain = Gain
            max_partition = partition

    return best_attr, max_partition

Now generate the tree with pre-pruning: each time a subtree is grown, its accuracy on the test set is compared with that of simply predicting the node's majority class, and the subtree is kept only if it scores at least as well.

# TreeGenerate builds the decision tree recursively; recursion returns when:
# 1. all samples at the current node belong to one class
# 2. the attribute set is empty, or all samples agree on every attribute in A
# 3. the current sample set is empty, so no split is possible
def TreeGenerate(D, A):
    Count = D['好瓜'].value_counts()
    if len(Count) == 1:
        return D['好瓜'].values[0]

    if len(A) == 0 or same_value(D, A):
        return choose_largest_example(D)

    node = {}
    best_attr, partition = choose_best_attribute(D, A)
    D_size = D.shape[0]
    # Rows with a missing value on best_attr are sent down every branch, with
    # their weight scaled by that branch's share of the samples (book, Sec. 4.4)
    missing = D.loc[pd.isna(D[best_attr])]
    # Pre-pruning baseline: test accuracy if this node were a majority-class
    # leaf (data_test is the global test set defined in __main__)
    leaf_accuracy = test_Tree(choose_largest_example(D), data_test)

    # Case 1: the best split attribute is discrete
    if best_attr in D_keys:
        for value in D_keys[best_attr]:
            Dv = D.loc[D[best_attr] == value].copy()
            Dv_size = Dv.shape[0]
            if Dv_size == 0:
                node[value] = choose_largest_example(D)
            else:
                if not missing.empty:
                    m = missing.copy()
                    m['power'] *= Dv_size / D_size
                    Dv = pd.concat([Dv, m])
                new_A = [key for key in A if key != best_attr]
                node[value] = TreeGenerate(Dv, new_A)
                # Pre-pruning: keep the subtree only if it beats the leaf
                if leaf_accuracy > test_Tree(node[value], data_test):
                    node[value] = choose_largest_example(D)

    # Case 2: the best split attribute is continuous: binary split at partition
    else:
        for branch_key, Dv in (('<= ' + str(partition), D.loc[D[best_attr] <= partition].copy()),
                               ('> ' + str(partition), D.loc[D[best_attr] > partition].copy())):
            Dv_size = Dv.shape[0]
            if Dv_size == 0:
                node[branch_key] = choose_largest_example(D)
            else:
                if not missing.empty:
                    m = missing.copy()
                    m['power'] *= Dv_size / D_size
                    Dv = pd.concat([Dv, m])
                # A continuous attribute may be reused deeper in the tree,
                # so A is passed on unchanged
                node[branch_key] = TreeGenerate(Dv, A)
                if leaf_accuracy > test_Tree(node[branch_key], data_test):
                    node[branch_key] = choose_largest_example(D)

    return {best_attr: node}
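
The returned tree is a nested dict that maps an attribute name to its branches, each branch ending either in a further dict or in a class label. A hypothetical (not computed) example of the shape:

# Illustration only: a tree that splits on texture, then root shape
Tree = {'纹理': {'稍糊': '否',
                 '模糊': '否',
                 '清晰': {'根蒂': {'蜷缩': '是', '硬挺': '否', '稍蜷': '是'}}}}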


# Descend one level: pick the child subtree that matches this row's value.
# For a continuous attribute, recover the threshold from the branch key,
# which looks like '<= 0.381' or '> 0.381'
def get_next_Tree(Tree, key, value):
    if key not in D_keys:
        partition = float(list(Tree[key].keys())[0].split(' ')[1])
        if value <= partition:
            value = '<= ' + str(partition)
        else:
            value = '> ' + str(partition)

    return Tree[key][value]
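
For example, on a made-up continuous split (attribute and threshold are hypothetical):

subtree = {'密度': {'<= 0.381': '否', '> 0.381': '是'}}
print(get_next_Tree(subtree, '密度', 0.5))  # '是'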

Use the tree to classify a sample:

# Depth-first traversal of the tree to predict the class of one row
def dfs_Tree(Tree, row):
    if isinstance(Tree, dict):
        key = list(Tree.keys())[0]
        value = row[key]
        if pd.isnull(value):
            # Missing value: follow every branch and take a majority vote
            result = {k: 0 for k in keys}
            for next_key in Tree[key]:
                next_Tree = Tree[key][next_key]
                temp = dfs_Tree(next_Tree, row)
                result[temp] += 1

            return '是' if result['是'] > result['否'] else '否'

        else:
            next_Tree = get_next_Tree(Tree, key, value)
            return dfs_Tree(next_Tree, row)
    else:
        # A leaf: Tree is the class label itself
        return Tree
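
A quick usage check on the same hypothetical subtree, with the sample given as a pandas Series:

row = pd.Series({'密度': 0.3})
print(dfs_Tree({'密度': {'<= 0.381': '否', '> 0.381': '是'}}, row))  # '否'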

Main routine:

if __name__ == '__main__':
    # Load the data (adjust the path to your local copy of the dataset)
    filename = '/Users/haoranjiang/Documents/machine learning/111111111/tree.txt'
    dataSet = loadData(filename)

    # Drop the sample-ID column
    dataSet.drop(columns=['编号'], inplace=True)
    # 'power' holds per-sample weights, used by the missing-value handling
    dataSet['power'] = 1.0

    data_train, data_test = traintest(dataSet)

    # Candidate attributes: every column except the label and the weight column
    A = [column for column in data_train.columns if column not in ('好瓜', 'power')]
    Tree = TreeGenerate(data_train, A)

    # Evaluate on the test set; the number varies run to run because
    # traintest() draws a random split
    print('Accuracy:', test_Tree(Tree, data_test) * 100, '%')

    print(Tree)

Reposted from blog.csdn.net/River_J777/article/details/105917613