# 基于 Gini 系数的决策树代码 (decision tree based on the Gini index)

import operator
from collections import Counter
from math import pow

import gini
import tree
def cal_gini_index(data):
    """Return the Gini index of a dataset.

    Each sample in *data* is a sequence whose last element is the class
    label.  Gini = 1 - sum((count_i / total)^2) over the class labels.

    :param data: list of samples; each sample's last element is its label
    :return: float Gini index in [0, 1); 0 for an empty dataset
    """
    total = len(data)
    if total == 0:
        return 0
    # Count how often each class label occurs (last column of each sample).
    counts = Counter(sample[-1] for sample in data)
    return 1 - sum(c * c for c in counts.values()) / float(total * total)

def label_unique_cnt(data):
    """Count occurrences of each class label in *data*.

    The class label is the last element of each sample.

    :param data: list of samples
    :return: dict mapping label -> occurrence count
    """
    # Counter does the manual "if key not in dict" bookkeeping for us;
    # convert back to a plain dict to keep the original return type.
    return dict(Counter(sample[-1] for sample in data))

def createDataSet1():
    """Build the sample watermelon dataset.

    Each row is six feature values (color, root, sound, texture, navel,
    touch) followed by the class label ('是' = good melon, '否' = bad).

    :return: (dataSet, labels) where labels names the six feature columns
    """
    # NOTE(review): the mojibake quotes of the original paste were restored
    # to plain ASCII quotes; the Chinese data values are unchanged.
    dataSet = [['青色', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '是'],
               ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '是'],
               ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '是'],
               ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '是'],
               ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '是'],
               ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '是'],
               ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', '否'],
               ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', '否'],
               ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', '否'],
               ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', '否'],
               ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', '否'],
               ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', '否'],
               ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '否'],
               ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', '否'],
               ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', '否'],
               ]
    labels = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感']
    return dataSet, labels

def getBestFeature(data):
    """Return the index of the feature with the largest Gini gain.

    For every feature column, computes the weighted Gini index of the
    subsets produced by splitting on that feature, and picks the split
    with the largest reduction relative to the current dataset's Gini.

    :param data: non-empty list of samples (last element is the label)
    :return: int feature index; 0 when no split improves the Gini index
    """
    feature_count = len(data[0]) - 1
    best_gain = 0
    # Bug fix: the original left bestFeatureIndex unbound (UnboundLocalError)
    # when no split produced a positive gain; default to feature 0.
    best_index = 0
    current_gini = cal_gini_index(data)
    for index in range(feature_count):
        weighted_gini = 0
        for value in {sample[index] for sample in data}:
            subset = split_tree(data, index, value)
            weighted_gini += len(subset) * cal_gini_index(subset) / len(data)
        gain = current_gini - weighted_gini
        if gain > best_gain:
            best_gain = gain
            best_index = index
    return best_index

def majorcnt(classlist):
    """Return the most common class label in *classlist* (majority vote).

    Ties are broken deterministically by first occurrence in the list
    (the original iterated a set, so its tie-break order was arbitrary).

    :param classlist: non-empty list of class labels
    :return: the label with the highest count
    """
    return Counter(classlist).most_common(1)[0][0]

def build_tree(data, label):
    """Recursively build a CART-style decision tree using the Gini index.

    :param data: non-empty list of samples; last element of each is the label
    :param label: list of feature names aligned with the feature columns
    :return: either a class label (leaf) or a nested dict of the form
             {feature_name: {feature_value: subtree_or_label, ...}}
    """
    feature_labels = label[:]  # copy so the caller's list is not mutated
    classlist = [sample[-1] for sample in data]
    # Leaf case 1: all remaining samples share a single class.
    if classlist.count(classlist[0]) == len(classlist):
        return classlist[0]
    # Leaf case 2: no features left to split on -> majority vote.
    if len(data[0]) == 1:
        return majorcnt(classlist)
    best = getBestFeature(data)
    best_label = feature_labels[best]
    mytree = {best_label: {}}
    # The chosen feature column is removed by split_tree, so drop its name too.
    del feature_labels[best]
    for value in {sample[best] for sample in data}:
        subtree = build_tree(split_tree(data, best, value), feature_labels[:])
        mytree[best_label][value] = subtree
    return mytree

def split_tree(data, axis, value):
    """Return the subset of *data* where feature *axis* equals *value*,
    with that feature column removed from each selected sample.

    :param data: list of samples (lists)
    :param axis: feature column index to split on
    :param value: feature value to select
    :return: new list of samples, each one column shorter
    """
    return [sample[:axis] + sample[axis + 1:]
            for sample in data
            if sample[axis] == value]

def makeClassDecision(sample, tree, labels):
    """Classify *sample* by walking the decision tree.

    :param sample: list of feature values, ordered like *labels*
    :param tree: nested dict {feature_name: {feature_value: subtree_or_label}}
    :param labels: list of feature names, mapping name -> column index
    :return: the class label reached at a leaf
    :raises KeyError: if the sample has a feature value the tree never saw
    """
    # A node dict has exactly one key: the feature name to test next.
    feature_label = next(iter(tree))
    feature_index = labels.index(feature_label)
    branch = tree[feature_label][sample[feature_index]]
    # Generalized leaf test: the original compared against the hard-coded
    # labels '是'/'否'; any non-dict value is a leaf, so this works for
    # arbitrary class labels.  Debug prints removed.
    if isinstance(branch, dict):
        return makeClassDecision(sample, branch, labels)
    return branch

if __name__ == '__main__':
    # Bug fix: the original guard was garbled to `if name==’main‘:` by the
    # blog copy-paste; restored to the standard dunder entry-point check.
    dataSet, labels = createDataSet1()  # build the sample watermelon dataset
    mytree = build_tree(dataSet, labels)
    print(mytree)  # show the learned tree structure
    sample = ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘']
    t = makeClassDecision(sample, mytree, labels)
    print(t)

# Reposted from: blog.csdn.net/weixin_42720875/article/details/81630579