Machine Learning Algorithms --- Common Algorithms for Classification Problems (3)

Decision Trees

Advantages: low computational complexity; the output is easy to interpret; insensitive to missing intermediate values; can handle irrelevant feature data.

Disadvantages: may overfit the training data.

Applicable data types: numeric and nominal.

1. Information Gain

The guiding principle when splitting a data set is to make the disordered data more ordered.

The change in information before and after splitting a data set is called the information gain. Compute the information gain obtained by splitting the data set on each feature; the feature that yields the highest information gain is the best one to split on.

The information of x_i is l(x_i) = -log2 p(x_i), where p(x_i) is the probability of choosing that class.

Information gain (the reduction in entropy, i.e. the reduction in the data's disorder): the entropy is H = -∑_{i=1}^{n} p(x_i) log2 p(x_i), where n is the number of classes. The larger the entropy, the more disordered the information.
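As a quick sanity check of the formula, here is a minimal sketch (assuming Python 3) that computes H for the toy data set used in the program below, which contains two 'yes' labels and three 'no' labels out of five samples:

from math import log

# Class probabilities for 2 'yes' and 3 'no' out of 5 samples
probs = [2/5, 3/5]
# H = -sum_i p(x_i) * log2 p(x_i)
H = -sum(p * log(p, 2) for p in probs)
print(H)  # ~0.971 bits: the labels are fairly disordered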

2. Implementation

Decision tree:
import operator
from math import log

def calcShannonEnt(dataSet):  # compute the Shannon entropy of a data set
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]  # the class label is the last column
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt

def createDataSet():
    dataSet = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels

def splitDataSet(dataSet, axis, value):  # split the data set on the given feature
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]  # drop the feature used for splitting
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet

def chooseBestFeatureToSplit(dataSet):  # choose the best feature to split on
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy  # entropy reduction from this split
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature

def majorityCnt(classList):  # return the class label that occurs most often
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    # iteritems() is Python 2 only; items() works in Python 3
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

def createTree(dataSet, labels):  # build the tree recursively
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):  # stop when all classes are identical
        return classList[0]
    if len(dataSet[0]) == 1:  # all features used up: return the majority class
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy so recursion does not mutate the caller's labels
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree

def classify(inputTree, featLabels, testVec):  # classify a sample with the decision tree
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if isinstance(secondDict[key], dict):  # internal node: keep descending
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:  # leaf node: the label itself
                classLabel = secondDict[key]
    return classLabel

dataSet, labels = createDataSet()
tree = createTree(dataSet, labels)
print(tree)
a, labels1 = createDataSet()  # createTree consumed labels, so rebuild them
classLabel = classify(tree, labels1, [1, 1])
print(classLabel)
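For reference, running this script should print the learned tree, {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}, followed by the prediction 'yes' for the test vector [1, 1]. The root is 'no surfacing' because splitting on it gives the higher information gain on this toy data (roughly 0.420 bits, versus roughly 0.171 for 'flippers').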

Reposted from www.cnblogs.com/zhenpengwang/p/10796630.html