机器学习实战 第4章

贝叶斯决策理论:选择具有最高概率的决策。

条件概率:
p(gray|bucketB)表示:在已知石头出自B桶的条件下,取出灰色石头的概率。
p(gray|bucketB) = p(gray and bucketB) / p(bucketB)

贝叶斯准则:
p(c|x) = p(x|c) * p(c) / p(x),其中c表示类别,x表示特征向量(本例中为文档的词向量)。


import numpy as np
def loadDataSet():
    """Return the toy message-board corpus together with its class labels.

    Returns:
        A ``(postingList, classVec)`` pair. ``postingList`` is a list of six
        tokenised posts (each a list of words) and ``classVec`` is the list
        of integer labels for those posts, in the same order.
    """
    # Each sample keeps its label next to its tokens so the two cannot drift
    # out of alignment. Label 1 presumably marks abusive posts (trainNB0
    # names the class-1 prior ``pAbusive``).
    labelledPosts = [
        (0, ["my", "dog", "has", "flea", "problems", "help", "please"]),
        (1, ["maybe", "not", "take", "him", "to", "dog", "park", "stupid"]),
        (0, ["my", "dalmation", "is", "so", "cute", "I", "love", "him"]),
        (1, ["stop", "posting", "stupid", "worthless", "garbage"]),
        (0, ["mr", "licks", "ate", "my", "steak", "how", "to", "stop", "him"]),
        (1, ["quit", "buying", "worthless", "dog", "food", "stupid"]),
    ]
    postingList = [tokens for _, tokens in labelledPosts]
    classVec = [label for label, _ in labelledPosts]
    return postingList, classVec

def createVocabList(dataSet):
    """Collect every distinct token that occurs anywhere in ``dataSet``.

    Args:
        dataSet: iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list containing each distinct token exactly once. The order comes
        from set iteration, so it is arbitrary and must not be relied upon.
    """
    # A single variadic union merges all documents and drops duplicates.
    uniqueTokens = set().union(*dataSet)
    return list(uniqueTokens)


def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a 0/1 presence vector over ``vocabList``.

    Args:
        vocabList: list of vocabulary tokens; position ``i`` of the result
            corresponds to ``vocabList[i]``. Tokens must be hashable.
        inputSet: iterable of tokens making up the document.

    Returns:
        A list of ``len(vocabList)`` integers. An entry is 1 when the token
        at that vocabulary position occurs in ``inputSet``, otherwise 0. If
        the vocabulary holds a token more than once, only its first position
        is set.

    Side effects:
        Prints one message for every token of ``inputSet`` that is missing
        from the vocabulary.
    """
    # Build the token -> first-position table once, so each lookup is O(1)
    # instead of scanning the vocabulary twice per word (``in`` + ``index``).
    firstIndex = {}
    for position, term in enumerate(vocabList):
        firstIndex.setdefault(term, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = firstIndex.get(word)
        if position is None:
            # Call form of print: valid on Python 3 and prints the same text
            # on Python 2, where the original statement form was used.
            print("the word: %s is not in my vocabulary" % word)
        else:
            returnVec[position] = 1
    return returnVec

def trainNB0(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters from labelled word vectors.

    Args:
        trainMatrix: non-empty sequence of equal-length numeric word vectors,
            one per training document.
        trainCategory: sequence of labels aligned with ``trainMatrix``. A
            label equal to 1 puts the document in class 1; any other value
            puts it in class 0.

    Returns:
        A ``(p0Vec, p1Vec, pAbusive)`` tuple. ``p0Vec`` and ``p1Vec`` are
        NumPy arrays giving, per vocabulary position, the summed word vector
        of the class divided by the total of all entries seen in that class.
        ``pAbusive`` is ``sum(trainCategory)`` divided by the number of
        documents, i.e. the class-1 prior when labels are 0/1.

    Note:
        No smoothing is applied. A class whose vectors sum to zero has a zero
        denominator, so its probability vector comes out as NaN.
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docCount)

    # Per-class accumulators keyed by class id: summed word vectors and the
    # running total of all vector entries.
    wordTotals = {0: np.zeros(vocabSize), 1: np.zeros(vocabSize)}
    tokenTotals = {0: 0.0, 1: 0.0}

    for row in range(docCount):
        # Everything that is not exactly class 1 is treated as class 0.
        label = 1 if trainCategory[row] == 1 else 0
        wordTotals[label] += trainMatrix[row]
        tokenTotals[label] += np.sum(trainMatrix[row])

    p1Vec = wordTotals[1] / tokenTotals[1]
    p0Vec = wordTotals[0] / tokenTotals[0]
    return p0Vec, p1Vec, pAbusive
    

# Demo: train the classifier on the toy corpus and show what it learned.
listOfPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOfPosts)
# One presence vector per post, all encoded over the same vocabulary.
trainMatrix = [setOfWords2Vec(myVocabList, doc) for doc in listOfPosts]
p0v, p1v, pAb = trainNB0(trainMatrix, listClasses)
# Call form of print: the statement form is a SyntaxError on Python 3, while
# print(x) with a single argument prints the same thing on Python 2.
print(pAb)
print(p0v)

猜你喜欢

转载自blog.csdn.net/sumaoqing123/article/details/79519755