Decision Tree CART regression trees - Algorithm

Decision Tree Model

  1. Select the best feature and feature value with which to split the data set in two
  2. Create a decision tree based on the results obtained above
  3. Prune the tree using test data (a branch that receives no test data is collapsed)
  4. Predict the output for new input

Model Tree

 

import numpy as np


def loadDataSet(fileName):
    """Parse a tab-delimited text file of floats into a NumPy matrix.

    Each non-blank line becomes one row; every tab-separated field is
    converted with ``float``.  The rest of this file treats the last
    column as the target value.

    Args:
        fileName: Path of the text file to read.

    Returns:
        ``np.matrix`` with one row per non-blank line of the file.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
    """
    dataMat = []
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank line (e.g. a trailing newline at end of file)
                # holds no sample; float('') would raise ValueError.
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    # np.asmatrix is used because the np.mat alias no longer exists in
    # NumPy 2.0; both build the same np.matrix.
    return np.asmatrix(dataMat)


# dataSet is a matrix, feature is the feature (column) index, value is the
# split threshold.
def binSplitDataSet(dataSet, feature, value):
    """Split the rows of ``dataSet`` in two on a single feature threshold.

    Args:
        dataSet: 2-D NumPy matrix (or array) with one sample per row.
        feature: Index of the column to compare.
        value: Threshold the column is compared against.

    Returns:
        Tuple ``(mat0, mat1)`` of ``np.matrix`` objects: ``mat0`` holds the
        rows whose ``feature`` column is ``> value``, ``mat1`` the rows
        where it is ``<= value``.
    """
    column = dataSet[:, feature]
    # np.nonzero(...)[0] yields the row indices where the comparison holds.
    aboveRows = np.nonzero(column > value)[0]
    belowRows = np.nonzero(column <= value)[0]
    # np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
    return np.asmatrix(dataSet[aboveRows, :]), np.asmatrix(dataSet[belowRows, :])


def regLeaf(dataSet):
    """Return the value stored in a regression-tree leaf.

    The leaf value is the mean of the last column of ``dataSet``.
    """
    targets = dataSet[:, -1]
    return np.mean(targets)


def regErr(dataSet):
    """Return the total squared error of the last column around its mean.

    Computed as the variance of the last column multiplied by the number
    of rows in ``dataSet``.
    """
    sampleCount = np.shape(dataSet)[0]
    targetVariance = np.var(dataSet[:, -1])
    return targetVariance * sampleCount


# ops[0]: minimum error reduction; below this the node is not split.
# ops[1]: minimum number of samples per side; smaller splits are skipped.
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """Find the feature/value pair that gives the lowest total split error.

    Args:
        dataSet: NumPy matrix whose last column is the target; the column
            slices used here rely on matrix (always 2-D) semantics.
        leafType: Callable that builds a leaf value from a data set.
        errType: Callable that returns the error of a data set.
        ops: ``(tolS, tolN)`` -- the minimum error reduction required to
            split and the minimum number of rows allowed on each side.

    Returns:
        ``(featureIndex, splitValue)`` for the best split, or
        ``(None, leafType(dataSet))`` when the node should become a leaf:
        all targets are identical, or the best achievable error reduction
        is below ``tolS`` (this also covers the case where no candidate
        split satisfies ``tolN``, since the best error then stays at
        infinity).
    """
    minGain, minSamples = ops[0], ops[1]

    # Exit condition 1: every target value is the same.
    targetValues = dataSet[:, -1].T.tolist()[0]
    if len(set(targetValues)) == 1:
        return None, leafType(dataSet)

    _, colCount = np.shape(dataSet)
    # The best split is the one with the lowest summed error of both halves.
    baseErr = errType(dataSet)
    lowestErr, chosenFeat, chosenVal = np.inf, 0, 0
    for featIndex in range(colCount - 1):
        candidates = set(dataSet[:, featIndex].T.tolist()[0])
        for candidate in candidates:
            upper, lower = binSplitDataSet(dataSet, featIndex, candidate)
            bigEnough = (np.shape(upper)[0] >= minSamples
                         and np.shape(lower)[0] >= minSamples)
            if bigEnough:
                splitErr = errType(upper) + errType(lower)
                if splitErr < lowestErr:
                    lowestErr, chosenFeat, chosenVal = splitErr, featIndex, candidate

    # Exit condition 2: splitting does not reduce the error enough.
    if (baseErr - lowestErr) < minGain:
        return None, leafType(dataSet)
    return chosenFeat, chosenVal


def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """Recursively build a binary tree from ``dataSet``.

    Args:
        dataSet: NumPy matrix (assumed, so rows can be filtered by index)
            whose last column is the target.
        leafType: Callable that builds a leaf value from a data set.
        errType: Callable that returns the error of a data set.
        ops: Stopping parameters forwarded unchanged to ``chooseBestSplit``.

    Returns:
        A leaf value when ``chooseBestSplit`` hits a stop condition,
        otherwise a dict with keys ``'spInd'`` (split feature index),
        ``'spVal'`` (split value), ``'left'`` and ``'right'`` (subtrees
        built from the first and second result of ``binSplitDataSet``).
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    # Identity test: feature index 0 is a valid split and must not be
    # confused with the "no split" marker None.
    if feat is None:
        return val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    return {
        'spInd': feat,
        'spVal': val,
        'left': createTree(lSet, leafType, errType, ops),
        'right': createTree(rSet, leafType, errType, ops),
    }


def isTree(obj):
    """Return True when ``obj`` is an internal tree node (a dict).

    Anything that is not a dict (for example a numeric leaf value) is
    treated as a leaf.  ``isinstance`` is used so that dict subclasses are
    recognised and unrelated classes that merely happen to be named
    ``dict`` are not.
    """
    return isinstance(obj, dict)


def getMean(tree):
    """Collapse ``tree`` into a single value.

    Any child that is itself a tree is first replaced, in place, by its
    own collapsed value; the result is the average of the two children.
    Note that ``tree`` is modified by this call.
    """
    for side in ('right', 'left'):
        child = tree[side]
        if isTree(child):
            tree[side] = getMean(child)
    return (tree['left'] + tree['right']) / 2.0


def prune(tree, testData):
    """Post-prune ``tree`` against ``testData``.

    Args:
        tree: Tree node dict with keys ``'spInd'``, ``'spVal'``, ``'left'``
            and ``'right'``.  It is modified in place.
        testData: NumPy matrix of test rows whose last column is the target.

    Returns:
        The (possibly modified) tree, or a single leaf value when the node
        was collapsed or its two leaves were merged.
    """
    # No test rows reach this node: collapse the whole subtree.
    if np.shape(testData)[0] == 0:
        return getMean(tree)

    # Route the test rows the same way the tree routes them.
    testLeft, testRight = binSplitDataSet(testData, tree['spInd'], tree['spVal'])

    # Prune each child that is still a subtree.
    if isTree(tree['left']):
        tree['left'] = prune(tree['left'], testLeft)
    if isTree(tree['right']):
        tree['right'] = prune(tree['right'], testRight)

    # Merging is only considered once both children are leaves.
    if isTree(tree['left']) or isTree(tree['right']):
        return tree

    keepError = sum(np.power(testLeft[:, -1] - tree['left'], 2)) + \
                sum(np.power(testRight[:, -1] - tree['right'], 2))
    mergedLeaf = (tree['left'] + tree['right']) / 2.0
    mergedError = sum(np.power(testData[:, -1] - mergedLeaf, 2))
    if mergedError < keepError:
        print("merging")
        return mergedLeaf
    return tree



# Model-tree code -- untested
def linearSolve(dataSet):
    """Fit an ordinary least-squares linear model to ``dataSet``.

    Args:
        dataSet: 2-D data whose last column is the target and whose other
            columns are the features.

    Returns:
        Tuple ``(ws, X, Y)``: the weight column vector, the design matrix
        (a leading column of ones followed by the features) and the
        target column.

    Raises:
        NameError: If ``X.T * X`` has a determinant of exactly 0 and so
            cannot be inverted.
    """
    # Coerce to a matrix so plain ndarrays behave like matrices below;
    # np.asmatrix replaces the np.mat alias that NumPy 2.0 removed.
    dataSet = np.asmatrix(dataSet)
    m, n = np.shape(dataSet)
    # Design matrix: column 0 stays 1 (intercept), the rest are features.
    X = np.asmatrix(np.ones((m, n)))
    X[:, 1:n] = dataSet[:, 0:n-1]
    Y = dataSet[:, -1]
    xTx = X.T * X
    if np.linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse,\n\
        try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)
    return ws, X, Y


def regTreeEval(model, inDat):
    """Evaluate a regression-tree leaf.

    The leaf value ``model`` is the prediction; ``inDat`` is accepted only
    so the signature matches ``modelTreeEval`` and is not used.
    """
    prediction = float(model)
    return prediction


def modelTreeEval(model, inDat):
    """Evaluate a model-tree leaf (a linear model) on one sample.

    Args:
        model: Weight column vector with the intercept first, as returned
            in ``ws`` by ``linearSolve``.
        inDat: A single sample as a 1 x n row of feature values.

    Returns:
        The prediction ``[1, inDat] * model`` as a Python float.
    """
    n = np.shape(inDat)[1]
    # Prepend a constant 1 for the intercept term.  np.asmatrix replaces
    # the np.mat alias that NumPy 2.0 removed.
    X = np.asmatrix(np.ones((1, n + 1)))
    X[:, 1:n + 1] = inDat
    # The product is a 1x1 matrix; take its element explicitly, because
    # calling float() directly on a non-0-d array is deprecated in NumPy.
    return float((X * model)[0, 0])


def treeForeCast(tree, inData, modelEval=regTreeEval):
    """Predict the output for one sample by walking ``tree``.

    Args:
        tree: A tree node dict (keys ``'spInd'``, ``'spVal'``, ``'left'``,
            ``'right'``) or a leaf.
        inData: One sample's feature values, e.g. a 1 x n matrix or a
            flat sequence.
        modelEval: Callable ``(leaf, inData)`` that turns a leaf into a
            prediction.

    Returns:
        Whatever ``modelEval`` returns for the leaf the sample ends up in.
    """
    if not isTree(tree):
        return modelEval(tree, inData)
    # Flatten before indexing: on a 1 x n matrix, inData[k] selects ROW k
    # rather than feature k, which fails for every multi-feature sample.
    featureValue = np.asarray(inData).ravel()[tree['spInd']]
    # Same routing as the split: values above the threshold go left.
    branch = tree['left'] if featureValue > tree['spVal'] else tree['right']
    # The leaf case is handled by the check at the top of the recursion.
    return treeForeCast(branch, inData, modelEval)


def createForeCast(tree, testData, modelEval=regTreeEval):
    """Predict an output for every row of ``testData``.

    Args:
        tree: Tree (or leaf) passed unchanged to ``treeForeCast``.
        testData: Indexable collection of samples, one per row.
        modelEval: Leaf evaluator forwarded to ``treeForeCast``.

    Returns:
        An ``m x 1`` ``np.matrix`` of predictions, where ``m`` is
        ``len(testData)``.
    """
    m = len(testData)
    # np.asmatrix replaces the np.mat alias that NumPy 2.0 removed.
    yHat = np.asmatrix(np.zeros((m, 1)))
    for i in range(m):
        yHat[i, 0] = treeForeCast(tree, np.asmatrix(testData[i]), modelEval)
    return yHat


if __name__ == '__main__':
    # Tree-building demo: grow and print a tree for each sample file.
    for sampleFile in ('ex00.txt', 'ex0.txt'):
        print(createTree(loadDataSet(sampleFile)))

    # Pruning demo: grow a tree on ex2.txt, then prune it with ex2test.txt.
    trainMat = loadDataSet('ex2.txt')
    testMat = loadDataSet('ex2test.txt')
    grownTree = createTree(trainMat)
    prunedTree = prune(grownTree, testMat)
    # prune() works in place, so grownTree is printed after modification.
    print(grownTree)
    print(prunedTree)

 

Published 46 original articles · won praise 0 · Views 1042

Guess you like

Origin blog.csdn.net/weixin_37680513/article/details/103088103