Implementing a Decision Tree in Python

This post implements a decision tree from scratch in Python. All feature values are assumed to be discrete, and the splitting feature can be selected with ID3, C4.5, or CART.

#coding=utf-8
'''
All feature values are discrete.
Build a decision tree with ID3, C4.5, or CART feature selection.
'''
import numpy as np
import csv
from math import log
import json


def find_val(values):
    # return the distinct values of the list `values`, in order of first appearance
    val = []
    for v in values:
        if v not in val:
            val.append(v)

    return val
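# e.g. find_val(['a', 'b', 'a']) -> ['a', 'b']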


def information_entropy(Feature, Label):
    # Feature is a list, Label is a list
    # returns the conditional entropy sum_v |D_v|/|D| * Ent(D_v), where
    # Ent(D_v) = -sum_k p_k * log2(p_k) over the label distribution of branch D_v
    f_val = find_val(Feature)
    l_val = find_val(Label)

    # p[j,k] counts the samples with feature value f_val[j] and label l_val[k]
    p = np.zeros([len(f_val), len(l_val)], np.float32)
    for i in range(len(Feature)):
        for j in range(len(f_val)):
            if Feature[i] == f_val[j]:
                for k in range(len(l_val)):
                    if Label[i] == l_val[k]:
                        p[j, k] += 1
    p_sum = np.sum(p, 1).reshape([-1, 1])  # branch sizes |D_v|
    p = p/p_sum  # row-normalize to conditional label probabilities

    E = 0.0
    D = len(Label)
    for i in range(p.shape[0]):
        for j in range(p.shape[1]):
            if p[i, j] > 0:
                E += -p[i, j]*log(p[i, j], 2)*p_sum[i, 0]/D

    return E
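# A quick sanity check on hypothetical toy data: a feature that separates the
# labels perfectly has zero conditional entropy, while a constant feature
# leaves the full label entropy (1 bit for a 50/50 split):
#   information_entropy([1, 1, 2, 2], ['yes', 'yes', 'no', 'no']) -> 0.0
#   information_entropy([1, 1, 1, 1], ['yes', 'yes', 'no', 'no']) -> 1.0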


def gain_ratio(E_gain, Feature, Label):
    # E_gain is the information gain of the Feature;
    # gain ratio = E_gain / IV, where IV = -sum_v (|D_v|/|D|) * log2(|D_v|/|D|)
    f_val = find_val(Feature)
    f_count = []
    for val in f_val:
        f_count.append(Feature.count(val))
    D = len(Label)
    IV = 0.0  # intrinsic value of the feature
    for val in f_count:
        IV += -(val/D)*log(val/D, 2)

    return E_gain/IV  # assumes the feature is not constant, i.e. IV > 0
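# Sanity check (hypothetical toy data): a binary feature split 50/50 has
# intrinsic value IV = 1 bit, so the gain ratio equals the raw gain:
#   gain_ratio(1.0, [1, 1, 2, 2], ['yes', 'yes', 'no', 'no']) -> 1.0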


def gini_index(Feature, Label):
    # Feature is a list, Label is a list
    # returns the Gini index sum_v |D_v|/|D| * Gini(D_v),
    # where Gini(D_v) = 1 - sum_k p_k^2
    f_val = find_val(Feature)
    l_val = find_val(Label)
    p = np.zeros([len(f_val), len(l_val)], np.float32)
    for i in range(len(Feature)):
        for j in range(len(f_val)):
            if Feature[i] == f_val[j]:
                for k in range(len(l_val)):
                    if Label[i] == l_val[k]:
                        p[j, k] += 1

    p_sum = np.sum(p, 1).reshape([-1, 1])
    p = p/p_sum

    D = len(Label)
    Gini = 0.0
    for i in range(p.shape[0]):
        Gini += p_sum[i, 0]/D
        for j in range(p.shape[1]):
            if p[i, j] > 0:
                Gini -= p[i, j]*p[i, j]*p_sum[i, 0]/D

    return Gini
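# Sanity check (hypothetical toy data): a pure split yields Gini 0, while a
# constant feature over a 50/50 label split yields 1 - (0.25 + 0.25) = 0.5:
#   gini_index([1, 1, 2, 2], ['yes', 'yes', 'no', 'no']) -> 0.0
#   gini_index([1, 1, 1, 1], ['yes', 'yes', 'no', 'no']) -> 0.5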


def ID3_select_feature(Feature_all, Label):
    # Feature_all is a dictionary, Label is a list
    no_feature = [1]*len(Label)
    E_label = information_entropy(no_feature, Label)  # entropy before splitting on any feature

    E_gain = {}
    for val in Feature_all.keys():
        E_gain[val] = E_label - information_entropy(Feature_all[val], Label)  # information gain

    # ID3 picks the feature with the largest information gain
    return max(E_gain, key=E_gain.get)


def C45_select_feature(Feature_all, Label):
    # Feature_all is a dictionary, Label is a list
    no_feature = [1]*len(Label)
    E_label = information_entropy(no_feature, Label)  # entropy before splitting on any feature

    # information gain
    E_gain = {}
    for val in Feature_all.keys():
        E_gain[val] = E_label - information_entropy(Feature_all[val], Label)
    E_gain_value = list(E_gain.values())
    E_gain_mean = sum(E_gain_value)/len(E_gain_value)

    # C4.5 heuristic: among the features whose information gain is at least the
    # mean gain, pick the one with the highest gain ratio
    E_ratio = {}
    for val in E_gain.keys():
        if E_gain[val] >= E_gain_mean:
            E_ratio[val] = gain_ratio(E_gain[val], Feature_all[val], Label)

    return max(E_ratio, key=E_ratio.get)


def CART_select_feature(Feature_all, Label):
    # CART picks the feature with the smallest Gini index
    Gini_index = {}
    for val in Feature_all.keys():
        Gini_index[val] = gini_index(Feature_all[val], Label)

    return min(Gini_index, key=Gini_index.get)
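# Sanity check (hypothetical toy data): between a feature that splits the
# labels cleanly and one that does not, CART selects the clean split:
#   CART_select_feature({0: [1, 1, 2, 2], 1: [1, 2, 1, 2]},
#                       ['yes', 'yes', 'no', 'no']) -> 0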


def find_data(data, feature_val, feature_key, label, feature):
    # data is a 2-dim list, feature_val is one value of the chosen feature,
    # feature_key is the chosen feature's column index, and feature is the
    # dictionary of remaining features; returns the subset of samples whose
    # chosen feature equals feature_val
    new_data = []
    new_label = []
    new_feature = {}
    for val in feature.keys():
        new_feature[val] = []
    for i in range(len(data)):
        if data[i][feature_key] == feature_val:
            data[i][feature_key] = 0  # blank out the used column so identical-sample checks ignore it
            new_data.append(data[i])
            new_label.append(label[i])
            for val in new_feature.keys():
                new_feature[val].append(feature[val][i])
    return new_data, new_label, new_feature
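# Note: find_data overwrites the used column with 0 in the original rows of
# `data`, so data_train (below) is consumed while the tree is built and a
# separate data_test copy is kept for prediction.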


def build_tree(features, label, data):
    # features is a dictionary, label is a list, data is a 2-dim list
    # case 1: all samples share a single label -> return a leaf with that label
    l_now = find_val(label)
    if len(l_now) == 1:
        node = label[0]
        return node

    # case 2: no features left, or all remaining samples are identical
    # -> return a leaf with the majority label
    data_val = find_val(data)
    if len(features) == 0 or len(data_val) == 1:
        l_count = []
        for val in l_now:
            l_count.append(label.count(val))
        node = l_now[l_count.index(max(l_count))]  # majority label
        return node

    # case 3: split on the selected feature and recurse on each branch
    feature_key = CART_select_feature(features, label)  # swap in ID3_select_feature or C45_select_feature to change the criterion
    node = {feature_key: {}}
    f_val = find_val(features[feature_key])
    for val in f_val:
        new_features = {}
        for key in features.keys():  # shallow-copy the feature dict, then drop the used feature
            new_features[key] = features[key]
        del new_features[feature_key]
        new_data, new_label, new_features = find_data(data, val, feature_key, label, new_features)
        node[feature_key][val] = build_tree(new_features, new_label, new_data)

    return node
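# The returned tree is a nested dict; a hypothetical shape for illustration:
#   {3: {'clear': 'good', 'blurry': 'bad', 'dull': {0: {...}}}}
# Each outer key is a feature's column index, each inner key is one of that
# feature's values, and each value is either a leaf label or a subtree.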


def use_decision_tree(data, tree, label):
    # data is one sample (a list), tree is the nested dict from build_tree,
    # and label is the list of training labels (used to recognize leaf values)
    index = list(tree.keys())[0]        # feature index tested at this node
    subtree = tree[index][data[index]]  # follow the branch matching the sample's value
    if subtree in label:
        return subtree                  # reached a leaf label
    return use_decision_tree(data, subtree, label)
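# Note: if a sample carries a feature value that never appeared on this branch
# during training, the dict lookup raises a KeyError; handling unseen values
# (e.g. falling back to a majority label) would be a natural extension.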




csvpath = 'watermelon.csv'  # training data
f = csv.reader(open(csvpath))

fea_num = 6  # number of discrete features
features = {}  # features is a dict whose keys are the feature column indices
for i in range(fea_num):
    features[i] = []
label = []
data_train = []  # mutated while the tree is built (see the note after find_data)
data_test = []
for row in f:
    data_train.append([row[0], row[1], row[2], row[3], row[4], row[5]])  # ,row[6],row[7] (unused columns)
    data_test.append([row[0], row[1], row[2], row[3], row[4], row[5]])   # ,row[6],row[7]
    for i in range(fea_num):
        features[i].append(row[i])
    label.append(row[8])  # the class label sits in column 8

decision_tree = build_tree(features, label, data_train)

# re-classify the unmutated copy of the training samples with the learned tree
output = []
for i in range(len(data_test)):
    output.append(use_decision_tree(data_test[i], decision_tree, label))
print(output)
# uncomment to pretty-print the learned tree as JSON
#json_tree = json.dumps(decision_tree,indent=1,ensure_ascii=False)
#print(json_tree)


Reposted from blog.csdn.net/qw_sunny/article/details/78915397