Python ID3算法

 
 
  1. 一个简单的数据集data(前4列表示一种属性(其中每一列不同数字表示同一属性的不同取值),后面一列表示它的类别)比如:西瓜数据集
  2.  
       
    '0,2,0,0,0; '
    '0,2,0,1,0; '
    '1,2,0,0,1;'
    '2,1,0,0,1;'
    '2,0,1,0,1;'
    '2,0,1,1,0;'
    '1,0,1,1,1;'
    '0,1,0,0,0;'
    '0,0,1,0,1;'
    '2,1,1,0,1;'
    '0,1,1,1,1;'
    '1,1,0,1,1;'
    '1,2,1,0,1;'
    '2,1,0,1,0'
  3. 现在我们根据信息增益(information gain)来建立一个决策树。
  4. 程序如下:
  5.         第一部分:
    import  numpy as np
    import  math
    # Training set as a numpy matrix: columns 0-3 are discrete feature
    # values (each integer encodes one level of that attribute), and the
    # last column is the binary class label (0/1).
    data=np.mat('0,2,0,0,0; '
                '0,2,0,1,0; '
                '1,2,0,0,1;'
                '2,1,0,0,1;'
                '2,0,1,0,1;'
                '2,0,1,1,0;'
                '1,0,1,1,1;'
                '0,1,0,0,0;'
                '0,0,1,0,1;'
                '2,1,1,0,1;'
                '0,1,1,1,1;'
                '1,1,0,1,1;'
                '1,2,1,0,1;'
                '2,1,0,1,0')
    def creattree(data):
        """Recursively build (and print) an ID3 decision tree.

        data: matrix whose first n-1 columns are discrete feature values and
        whose last column is the class label.  Each recursion prints the
        subset it was given; leaves print a 'final data:' banner.
        Returns None (the tree is reported via print, not returned).
        """
        m,n=np.shape(data)
        print(' original data:')
        print(data)
        classlist=data[:,n-1]
        # Count how many samples share the class of the first sample.
        # Bug fix: the original wrote 'classone=+1', which assigns +1 every
        # iteration instead of incrementing, so the single-class test below
        # could never fire for m > 1.
        classone=0
        for i in range(m):
            if classlist[i,:]==classlist[0,:]:
                classone+=1
        # Leaf case 1: every sample belongs to one class.
        if classone==m:
            print('final data:')
            print(data)
            return
        # Leaf case 2: all features used up (only the label column remains).
        if n==1:
            print('final data:')
            print(data)
            return
        # Split on the feature with the highest information gain and recurse
        # on each subset (one per distinct value of that feature).
        bestFeat=chooseBestFeature(data)
        print('划分属性:',bestFeat)
        featValues=np.unique(np.array(data[:,bestFeat]))
        num0fFeatValue=np.size(featValues)
        for i in range(num0fFeatValue):
            print('----当前属性%d下类别%d划分----:' % (bestFeat, i))
            creattree(splitData(data,bestFeat,featValues[i]))
    def chooseBestFeature(data):
        """Return the column index of the feature with maximal information gain.

        data: matrix with features in columns 0..n-2 and the class label in
        the last column.  Gain = H(data) - sum over feature values of
        (subset weight) * H(subset).
        """
        m,n=np.shape(data)
        # Number of feature columns; the last column is the class label.
        numofFeature=n-1
        # Entropy of the whole set, before any split.
        baseEntropy=calEntropy(data)
        bestinfoGain=0   # best gain seen so far
        bestFeature=0    # index of the feature achieving it
        for j in range(numofFeature):
            featureTemp=np.unique(np.array(data[:,j]))
            numF=np.size(featureTemp)
            newEntropy=0
            for i in range(numF):
                subSet=splitData(data,j,featureTemp[i])
                m_1,n_1=np.shape(subSet)
                prob=m_1/m
                newEntropy=newEntropy+prob*calEntropy(subSet)
            # Bug fix: in the original these four lines were indented outside
            # the 'for j' loop, so only the LAST feature's gain was ever
            # compared and bestFeature was almost always wrong.
            infoGain=baseEntropy-newEntropy
            if infoGain>bestinfoGain:
                bestinfoGain=infoGain
                bestFeature=j
        return bestFeature
    #这个地方还可以改写
    def splitData(data,axit,value):
        """Return the rows of data where column `axit` equals `value`,
        with that feature column removed.

        data: matrix; axit: feature column index; value: feature value to
        keep.  Non-matching rows are dropped; the returned subset has one
        fewer column than data.
        """
        m,n=np.shape(data)
        # Bug fix: the original deleted column 0 unconditionally
        # (np.delete(subSet, 0, axis=1)), which corrupts feature indices for
        # any split on axit != 0; the USED feature column must be removed.
        subSet=np.delete(data,axit,axis=1)
        # k tracks rows already deleted so indices into subSet stay aligned
        # with indices into the original data.
        k=0
        for i in range(m):
            if data[i,axit]!=value:
                subSet=np.delete(subSet,i-k,axis=0)
                k=k+1
        return subSet
    
    def calEntropy(data):
        """Shannon entropy (base 2) of the class labels in the last column.

        data: matrix; the last column holds each row's class label.
        Returns a non-negative float: -sum(p * log2(p)) over the distinct
        label values, where p is each value's relative frequency.
        """
        m,n=np.shape(data)
        # Class label column.
        label=data[:,n-1]
        # Distinct label values actually present.
        label_deal=np.unique(np.array(label))
        numLable=np.size(label_deal)
        entropy=0.0
        for i in range(numLable):
            # Count rows carrying this label.  Bug fix: the original wrote
            # 'prob[i,1]=+1' (assigns 1 instead of incrementing) and used a
            # hard-coded 2x2 prob array that breaks for more than 2 classes.
            count=0
            for j in range(m):
                if label[j,:]==label_deal[i]:
                    count+=1
            p=count/m   # p > 0 because label_deal[i] occurs in the data
            entropy=entropy-p*math.log2(p)
        return entropy
    def main():
        """Entry point: grow and print the ID3 tree for the module dataset."""
        creattree(data)

    if __name__=='__main__':
        main()

猜你喜欢

转载自blog.csdn.net/qq_23859701/article/details/78886765