Decision Tree -- ID3

Reference URL: https://www.cnblogs.com/further-further-further/p/9429257.html

ID3 algorithm

Optimal Decision Tree

# -*- coding: utf-8 -*-

"""
Created on Thu Aug 2 17:09:34 2018
decision tree ID3 implementation
@author: weixw
"""
from math import log
import operator

Raw data

def createDataSet():
    dataSet = [[1, 1, 1, 1, 'yes'],
               [1, 1, 0, 0, 'yes'],
               [1, 0, 1, 1, 'no'],
               [0, 1, 0, 1, 'yes'],
               [0, 1, 1, 0, 'yes'],
               [1, 1, 1, 1, 'yes'],
               [1, 1, 0, 0, 'no'],
               [1, 0, 1, 1, 'no'],
               [0, 1, 0, 1, 'no'],
               [0, 1, 1, 0, 'no']]
    labels = ['no surfacing', 'flippers', 'people', 'day']
    return dataSet, labels
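
A quick sanity check of the raw data (a minimal sketch, not part of the original listing): each row carries four feature values plus a class label in the last position, and labels holds the four feature names.

dataSet, labels = createDataSet()
print(len(dataSet))       # 10 samples
print(len(dataSet[0]))    # 4 feature values + 1 class label = 5
print(labels)             # ['no surfacing', 'flippers', 'people', 'day']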

Majority voting

Return the class label that occurs most often in the list (used when no features are left to split on).

def majorityCnt(classList):
    classCounts = {}
    for value in classList:
        if value not in classCounts.keys():
            classCounts[value] = 0
        classCounts[value] += 1
    # sort by count in descending order; iteritems() is Python 2 only, use items() in Python 3
    sortedClassCount = sorted(classCounts.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
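
A minimal usage sketch (the sample list is made up for illustration):

# The most frequent label wins the vote.
print(majorityCnt(['yes', 'no', 'yes']))   # 'yes'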

Data Partitioning

dataSet: original data set

axis: index of the column to split on

value: value of the specified column

def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featDataVal in dataSet:
        if featDataVal[axis] == value:
            # keep only matching rows and drop the specified column:
            # the two lines below splice together everything before and after axis
            reducedFeatVal = featDataVal[:axis]
            reducedFeatVal.extend(featDataVal[axis + 1:])
            retDataSet.append(reducedFeatVal)
    return retDataSet
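
A small sketch of the splitting behaviour (the rows are made-up values, only for illustration):

# Rows whose column 0 equals 1 are kept, with that column removed.
rows = [[1, 'a', 'yes'], [0, 'b', 'no'], [1, 'c', 'no']]
print(splitDataSet(rows, 0, 1))   # [['a', 'yes'], ['c', 'no']]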

Shannon entropy calculation

def calcShannonEnt(dataSet):
    # total number of items in the data set
    numEntries = len(dataSet)
    # initialize the label count dictionary
    labelCounts = {}
    for featDataVal in dataSet:
        # the last element of each row is the class label
        currentLabel = featDataVal[-1]
        # if this label has not been seen yet, initialize its count
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    # initialize the entropy
    shannonEnt = 0.0
    # iterate over the labels, compute each probability and accumulate the entropy
    for key in labelCounts.keys():
        prob = labelCounts[key] / float(numEntries)
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
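
The quantity computed here is H(D) = -sum_i p_i * log2(p_i) over the class labels. As a quick check, the raw data above has 5 'yes' and 5 'no' labels, so the entropy should come out as exactly 1 bit:

# p('yes') = p('no') = 0.5, so H = -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1.0
dataSet, labels = createDataSet()
print(calcShannonEnt(dataSet))   # 1.0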

Select the best feature column index

def chooseBestFeatureToSplit(dataSet):
    # number of features; the last column of dataSet is the class label, not a feature
    numFeatures = len(dataSet[0]) - 1
    # Shannon entropy of the full data set
    baseEntropy = calcShannonEnt(dataSet)
    # initialize the information gain and the best feature column index
    bestInfoGain = 0.0
    bestFeatureIndex = -1
    for i in range(numFeatures):
        # values of the i-th column
        featList = [example[i] for example in dataSet]
        # deduplicate the column values
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            # weight of this subset
            prob = len(subDataSet) / float(len(dataSet))
            # accumulate the conditional entropy
            newEntropy += prob * calcShannonEnt(subDataSet)
        # information gain
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeatureIndex = i
    return bestFeatureIndex
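
ID3 scores each feature by its information gain, Gain(D, i) = H(D) - sum_v (|D_v| / |D|) * H(D_v), and keeps the largest. A minimal usage sketch on the raw data above; for that data the split on 'flippers' should give the only non-zero gain, so column 1 is expected:

dataSet, labels = createDataSet()
best = chooseBestFeatureToSplit(dataSet)
print(best, labels[best])   # index and name of the highest-gain feature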

Create a decision tree

def createTree(dataSet, labels):
    # class labels: the last column of dataSet (not the same thing as the feature names in labels)
    classList = [example[-1] for example in dataSet]
    # termination condition 1:
    # all class labels are identical, return the first one
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # termination condition 2: only the label column is left, fall back to majority voting
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    # index of the best feature column
    bestFeatureIndex = chooseBestFeatureToSplit(dataSet)
    # feature name corresponding to that index
    bestFeatureLabel = labels[bestFeatureIndex]
    # create the root node
    myTree = {bestFeatureLabel: {}}
    # remove the chosen feature name so the remaining labels stay aligned with the reduced data set
    del (labels[bestFeatureIndex])
    # values of the best feature column
    bestFeature = [example[bestFeatureIndex] for example in dataSet]
    uniquesVals = set(bestFeature)
    for value in uniquesVals:
        # copy of the remaining feature names for the subtree
        subLabels = labels[:]
        # recursion
        myTree[bestFeatureLabel][value] = createTree(splitDataSet(dataSet, bestFeatureIndex, value), subLabels)
    return myTree
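
A minimal build sketch. Note that createTree deletes entries from the label list it is given, so pass a copy if you still need the original feature names (the test code further below does the same with labels.copy()):

dataSet, labels = createDataSet()
tree = createTree(dataSet, labels[:])
print(tree)   # nested dict: {feature name: {feature value: subtree or class label, ...}}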

Obtain classification results

inputTree: decision tree dictionary

featLabels: feature name list

testVec: test vector, e.g. [1, 1] => 'yes' (the feature values select one branch after another, from the root down to a leaf node)

def classify(inputTree, featLabels, testVec):
    # get the root node name; convert the dict keys into a list
    firstSide = list(inputTree.keys())
    # root node name (string)
    firstStr = firstSide[0]
    # child nodes of the root
    secondDict = inputTree[firstStr]
    # index of the root node name in the feature label list
    featIndex = featLabels.index(firstStr)
    # value of the test vector for that feature
    key = testVec[featIndex]
    # subtree (or leaf) selected by that value
    valueOfFeat = secondDict[key]
    # internal node or leaf: recurse into internal nodes, a leaf is the classification result
    # if type(valueOfFeat).__name__ == 'dict' is equivalent to if isinstance(valueOfFeat, dict)
    if isinstance(valueOfFeat, dict):
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else:
        classLabel = valueOfFeat
    return classLabel
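
A minimal trace on the hard-coded two-feature tree returned by retrieveTree(0) further below: with test vector [1, 1], 'no surfacing' = 1 selects the 'flippers' subtree, and 'flippers' = 1 lands on the leaf 'yes'.

tree = {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
print(classify(tree, ['no surfacing', 'flippers'], [1, 1]))   # 'yes'
print(classify(tree, ['no surfacing', 'flippers'], [1, 0]))   # 'no'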

Store the decision tree classifier on disk; the filename is usually given a txt extension.

def storeTree(inputTree, filename):
    import pickle
    fw = open(filename, 'wb+')
    pickle.dump(inputTree, fw)
    fw.close()

Load the pickled object back from disk; filename here is the txt file written by the function above.

def grabTree(filename):
    import pickle
    fr = open(filename, 'rb')
    return pickle.load(fr)
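
A round-trip sketch, assuming the current working directory is writable (the file name is just an example):

# Pickle the tree to disk and read it straight back.
tree = {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
storeTree(tree, 'myTree.txt')
print(grabTree('myTree.txt') == tree)   # True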

Draw the decision tree

'''
Created on Oct 14, 2010

@author: Peter Harrington
'''
import matplotlib.pyplot as plt

decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")

Get the number of leaf nodes of the tree

def getNumLeafs(myTree):
    numLeafs = 0
    # convert the dict keys into a list
    firstSides = list(myTree.keys())
    firstStr = firstSides[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        # test whether the node is a dict: a dict is an internal node, anything else (a str) is a leaf
        if type(secondDict[key]).__name__ == 'dict':
            numLeafs += getNumLeafs(secondDict[key])
        else:
            numLeafs += 1
    return numLeafs

Gets the number of layers of the tree

def getTreeDepth(myTree):
    maxDepth = 0
    # convert the dict keys into a list
    firstSides = list(myTree.keys())
    firstStr = firstSides[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        # test to see if the node is a dictionary; if not, it is a leaf node
        if type(secondDict[key]).__name__ == 'dict':
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth
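
As a quick check, the hard-coded tree returned by retrieveTree(0) below has three leaves and two decision levels:

tree = {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
print(getNumLeafs(tree))    # 3
print(getTreeDepth(tree))   # 2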

def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',
                            xytext=centerPt, textcoords='axes fraction',
                            va="center", ha="center", bbox=nodeType, arrowprops=arrow_args)

def plotMidText(cntrPt, parentPt, txtString):
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString, va="center", ha="center", rotation=30)

def plotTree(myTree, parentPt, nodeTxt):  # the first key tells you what feature was split on
    numLeafs = getNumLeafs(myTree)  # this determines the x width of this tree
    depth = getTreeDepth(myTree)
    firstSides = list(myTree.keys())
    firstStr = firstSides[0]  # the text label for this node should be this
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW, plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    for key in secondDict.keys():
        # test to see if the node is a dictionary; if not, it is a leaf node
        if type(secondDict[key]).__name__ == 'dict':
            plotTree(secondDict[key], cntrPt, str(key))  # recursion
        else:  # it's a leaf node, print the leaf node
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD

If you do get a dictionary, you know it's a tree, and the first element will be another dict.

Draw the decision tree

def createPlot(inTree):
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)  # no ticks
    # createPlot.ax1 = plt.subplot(111, frameon=False)  # ticks for demo purposes
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()

Draw root and leaf nodes of the tree (root shape: rectangular, leaf node: oval)

def createPlot():
    # note: this no-argument demo version shares its name with createPlot(inTree) above,
    # so only keep one of the two definitions active at a time
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    createPlot.ax1 = plt.subplot(111, frameon=False)  # ticks for demo purposes
    plotNode('a decision node', (0.5, 0.1), (0.1, 0.5), decisionNode)
    plotNode('a leaf node', (0.8, 0.1), (0.3, 0.8), leafNode)
    plt.show()

def retrieveTree(i):
    listOfTrees = [{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}},
                   {'no surfacing': {0: 'no', 1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}}}}
                   ]
    return listOfTrees[i]

thisTree = retrieveTree(0)

createPlot(thisTree)

createPlot()

myTree = retrieveTree(0)

numLeafs = getNumLeafs(myTree)

treeDepth = getTreeDepth(myTree)

print(u"number of leaf nodes: %d" % numLeafs)

print(u"tree depth: %d" % treeDepth)

Test code

# -*- coding: utf-8 -*-

"""
Created on Fri Aug 3 19:52:10 2018

@author: weixw
"""
import Demo_1.myTrees as mt
import Demo_1.treePlotter as tp

test

dataSet, labels = mt.createDataSet()

The copy function allocates new memory and copies all the values of the list into it.

labels1 = labels.copy()

createTree changes the values of labels1 in place, so labels1 cannot be reused for the classification test; the original labels list is used there instead.

myTree = mt.createTree(dataSet,labels1)

Save the tree to local disk

mt.storeTree(myTree,'myTree.txt')

Load the tree from local disk

myTree = mt.grabTree('myTree.txt')
print(u"decision tree structure: %s" % myTree)

Draw the decision tree

print(u"Draw the decision tree:")
tp.createPlot(myTree)
numLeafs = tp.getNumLeafs(myTree)
treeDepth = tp.getTreeDepth(myTree)
print(u"number of leaf nodes: %d" % numLeafs)
print(u"tree depth: %d" % treeDepth)

Test classification on a few simple samples

labelResult = mt.classify(myTree, labels, [1, 1, 1, 0])
print(u"[1,1,1,0] test result: %s" % labelResult)
labelResult = mt.classify(myTree, labels, [1, 0, 0, 0])
print(u"[1,0,0,0] test result: %s" % labelResult)

Origin: https://www.cnblogs.com/131415-520/p/11789727.html