Implementing a Decision Tree in Python
#coding=utf-8
'''
All feature values are assumed to be discrete.
Build a decision tree with ID3, C4.5, or CART (the criterion is selectable in build_tree).
'''
import numpy as np
import csv
from math import log
import json
def find_val(values):
    # return the distinct values of a list, preserving first-seen order
    val = []
    for v in values:
        if v not in val:
            val.append(v)
    return val
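# Quick sanity check (illustrative values):
assert find_val(['a', 'b', 'a', 'c']) == ['a', 'b', 'c']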
def information_entropy(Feature, Label):
    # conditional entropy of Label given Feature (both are lists of equal length)
    f_val = find_val(Feature)
    l_val = find_val(Label)
    # p[j,k] counts samples whose feature value is f_val[j] and label is l_val[k]
    p = np.zeros([len(f_val), len(l_val)], np.float32)
    for i in range(len(Feature)):
        j = f_val.index(Feature[i])
        k = l_val.index(Label[i])
        p[j, k] += 1
    p_sum = np.sum(p, 1).reshape([-1, 1])  # samples per feature value
    p = p/p_sum                            # label distribution within each branch
    E = 0.0
    D = len(Label)
    for i in range(p.shape[0]):
        for j in range(p.shape[1]):
            if p[i, j] > 0:
                # branch entropy weighted by the branch's share of the samples
                E += -p[i, j]*log(p[i, j], 2)*float(p_sum[i, 0])/D
    return E
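# Worked example (illustrative): with a constant feature the conditional entropy
# equals the plain label entropy; a perfectly separating feature drives it to 0.
assert abs(information_entropy([1, 1, 1, 1], ['y', 'y', 'n', 'n']) - 1.0) < 1e-6
assert abs(information_entropy([1, 1, 2, 2], ['y', 'y', 'n', 'n']) - 0.0) < 1e-6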
def gain_ratio(E_gain, Feature, Label):
    # E_gain is the information gain of Feature
    f_val = find_val(Feature)
    f_count = []
    for val in f_val:
        f_count.append(Feature.count(val))
    D = len(Label)
    IV = 0.0  # intrinsic value of the feature
    for val in f_count:
        IV += -(val/D)*log(val/D, 2)
    if IV == 0:  # guard: a constant feature has IV 0 and offers no useful split
        return 0.0
    return E_gain/IV
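# Example (illustrative): a balanced binary feature has IV = 1, so the gain
# ratio equals the raw information gain.
assert abs(gain_ratio(1.0, [1, 1, 2, 2], ['y', 'n', 'y', 'n']) - 1.0) < 1e-6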
def gini_index(Feature, Label):
    # Gini index of the split induced by Feature (both are lists of equal length)
    f_val = find_val(Feature)
    l_val = find_val(Label)
    # p[j,k] counts samples whose feature value is f_val[j] and label is l_val[k]
    p = np.zeros([len(f_val), len(l_val)], np.float32)
    for i in range(len(Feature)):
        j = f_val.index(Feature[i])
        k = l_val.index(Label[i])
        p[j, k] += 1
    p_sum = np.sum(p, 1).reshape([-1, 1])
    p = p/p_sum
    D = len(Label)
    Gini = 0.0
    for i in range(p.shape[0]):
        # weighted Gini impurity of each branch: weight * (1 - sum_k p_k^2)
        w = float(p_sum[i, 0])/D
        Gini += w
        for j in range(p.shape[1]):
            if p[i, j] > 0:
                Gini -= p[i, j]*p[i, j]*w
    return float(Gini)
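# Example (illustrative): a perfect split has Gini 0; a useless constant
# feature keeps the full impurity of the labels (here 0.5).
assert abs(gini_index([1, 1, 2, 2], ['y', 'y', 'n', 'n']) - 0.0) < 1e-6
assert abs(gini_index([1, 1, 1, 1], ['y', 'y', 'n', 'n']) - 0.5) < 1e-6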
def ID3_select_feature(Feature_all, Label):
    # Feature_all is a dictionary of feature columns, Label is a list
    no_feature = [1]*len(Label)
    E_label = information_entropy(no_feature, Label)  # entropy before any split
    E_gain = {}
    for key in Feature_all.keys():
        E_gain[key] = E_label - information_entropy(Feature_all[key], Label)  # information gain
    # ID3 picks the feature with the largest information gain
    return max(E_gain, key=E_gain.get)
def C45_select_feature(Feature_all, Label):
    # Feature_all is a dictionary of feature columns, Label is a list
    no_feature = [1]*len(Label)
    E_label = information_entropy(no_feature, Label)  # entropy before any split
    # information gain of every feature
    E_gain = {}
    for key in Feature_all.keys():
        E_gain[key] = E_label - information_entropy(Feature_all[key], Label)
    E_gain_value = list(E_gain.values())
    E_gain_mean = sum(E_gain_value)/len(E_gain_value)
    # gain ratio, computed only for features whose information gain is at least
    # average (C4.5's guard against gain ratio's preference for features with few values)
    E_ratio = {}
    for key in E_gain.keys():
        if E_gain[key] >= E_gain_mean:
            E_ratio[key] = gain_ratio(E_gain[key], Feature_all[key], Label)
    return max(E_ratio, key=E_ratio.get)
def CART_select_feature(Feature_all, Label):
    # CART picks the feature with the smallest Gini index
    Gini_index = {}
    for key in Feature_all.keys():
        Gini_index[key] = gini_index(Feature_all[key], Label)
    return min(Gini_index, key=Gini_index.get)
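# All three criteria agree on a toy set where one feature separates the
# labels perfectly (illustrative values):
toy_features = {'texture': ['clear', 'clear', 'blurry', 'blurry'],
                'touch': ['hard', 'soft', 'hard', 'soft']}
toy_label = ['good', 'good', 'bad', 'bad']
assert ID3_select_feature(toy_features, toy_label) == 'texture'
assert C45_select_feature(toy_features, toy_label) == 'texture'
assert CART_select_feature(toy_features, toy_label) == 'texture'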
def find_data(data, feature_val, feature_key, label, feature):
    # select the rows whose column feature_key equals feature_val;
    # data is a 2-dim list, feature is the dict of remaining feature columns
    new_data = []
    new_label = []
    new_feature = {}
    for key in feature.keys():
        new_feature[key] = []
    for i in range(len(data)):
        if data[i][feature_key] == feature_val:
            # mark the used column with 0 (in place!) so that rows which agree
            # on all remaining features compare equal in build_tree's case 2
            data[i][feature_key] = 0
            new_data.append(data[i])
            new_label.append(label[i])
            for key in new_feature.keys():
                new_feature[key].append(feature[key][i])
    return new_data, new_label, new_feature
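# Illustrative subset: keep the rows whose column 0 equals 'a'; note how the
# used column is zeroed out in the surviving rows.
d = [['a', 'x'], ['b', 'y'], ['a', 'y']]
sub_d, sub_l, sub_f = find_data(d, 'a', 0, ['good', 'bad', 'good'], {1: ['x', 'y', 'y']})
assert sub_d == [[0, 'x'], [0, 'y']] and sub_l == ['good', 'good'] and sub_f == {1: ['x', 'y']}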
def build_tree(features, label, data):
    # features is a dictionary of feature columns, label is a list, data is a 2-dim list
    # case 1: all samples share one label -> leaf
    l_now = find_val(label)
    if len(l_now) == 1:
        return label[0]
    # case 2: no features left, or all remaining rows identical -> majority-vote leaf
    data_val = find_val(data)
    if len(features) == 0 or len(data_val) == 1:
        l_count = []
        for val in l_now:
            l_count.append(label.count(val))
        return l_now[l_count.index(max(l_count))]
    # case 3: split on the best feature and recurse on each of its values
    feature_key = CART_select_feature(features, label)  # swap in ID3_/C45_select_feature here
    node = {feature_key: {}}
    f_val = find_val(features[feature_key])
    for val in f_val:
        # copy the feature dict and drop the column we just split on
        new_features = {}
        for key in features.keys():
            new_features[key] = features[key]
        del new_features[feature_key]
        new_data, new_label, new_features = find_data(data, val, feature_key, label, new_features)
        node[feature_key][val] = build_tree(new_features, new_label, new_data)
    return node
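# The result is a nested dict: the outer key of every internal node is the
# chosen feature index, and each branch maps a feature value to a subtree or
# a leaf label, e.g. (shape only, values illustrative):
#   {0: {'clear': 'good', 'blurry': {1: {'hard': 'good', 'soft': 'bad'}}}}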
def use_decision_tree(data, tree, label):
    # walk the nested dict until the branch value is a known label (a leaf)
    index = list(tree.keys())[0]        # feature index tested at this node
    subtree = tree[index][data[index]]  # branch matching the sample's value
    if subtree in label:
        return subtree
    return use_decision_tree(data, subtree, label)
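# Classifying one sample against a hand-built tree (illustrative values):
toy_tree = {0: {'clear': 'good', 'blurry': {1: {'hard': 'good', 'soft': 'bad'}}}}
assert use_decision_tree(['blurry', 'soft'], toy_tree, ['good', 'bad']) == 'bad'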
csvpath = 'watermelon.csv'  # dataset: 6 discrete features in columns 0-5, label in column 8
f = csv.reader(open(csvpath))
fea_num = 6  # number of discrete features used
features = {}  # dict of feature columns, keyed by column index
for i in range(fea_num):
    features[i] = []
label = []
data_train = []  # build_tree marks used columns in place, so data_train gets modified
data_test = []
for row in f:
    data_train.append([row[0], row[1], row[2], row[3], row[4], row[5]])  # ,row[6],row[7]])
    data_test.append([row[0], row[1], row[2], row[3], row[4], row[5]])  # ,row[6],row[7]])
    for i in range(fea_num):
        features[i].append(row[i])
    label.append(row[8])
decision_tree = build_tree(features, label, data_train)
output = []  # predictions for the (training) rows
for i in range(len(data_test)):
    output.append(use_decision_tree(data_test[i], decision_tree, label))
print(output)
#json_tree = json.dumps(decision_tree,indent=1,ensure_ascii=False)
#print(json_tree)
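# Since the tree was grown on these same rows, the predictions should
# reproduce the training labels (barring conflicting duplicate rows):
# assert output == label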