贝叶斯决策理论:选择具有最高概率的决策。
条件概率:
p(gray|bucketB)表示:在已知石头出自B桶的条件下,取出灰色石头的概率。
p(gray|bucketB) = p(gray and bucketB) / p(bucketB)
贝叶斯准则:
p(c|x) = p(x|c) * p(c) / p(x)
朴素贝叶斯假设(条件独立性假设):
各个特征在给定类别下相互条件独立,因此
p(x|c) = p(x1|c) * p(x2|c) * ... * p(xn|c)
该假设大大简化了 p(x|c) 的估计:只需分别统计每个特征在各类别下出现的频率即可。
import numpy as np
def loadDataSet():
    """Return a toy corpus of tokenized posts and their class labels.

    Returns:
        (postingList, classVec): postingList is a list of token lists,
        classVec holds one label per post (1 = abusive, 0 = not abusive).
    """
    posts = [
        "my dog has flea problems help please",
        "maybe not take him to dog park stupid",
        "my dalmation is so cute I love him",
        "stop posting stupid worthless garbage",
        "mr licks ate my steak how to stop him",
        "quit buying worthless dog food stupid",
    ]
    # Tokenize on whitespace; labels align positionally with the posts.
    tokenized = [sentence.split() for sentence in posts]
    labels = [0, 1, 0, 1, 0, 1]
    return tokenized, labels
def createVocabList(dataSet):
    """Return the list of unique words appearing across all documents.

    Args:
        dataSet: iterable of token lists, one list per document.

    Returns:
        list of unique words; ordering is unspecified (set iteration order),
        matching the original behavior.
    """
    # set().union(*docs) folds every document into one vocabulary in a
    # single C-level call instead of a manual `|=` loop; handles an empty
    # dataSet by returning an empty list.
    return list(set().union(*dataSet))
def setOfWords2Vec(vocabList, inputSet):
returnVec = [0] * len(vocabList)
for word in inputSet:
if word in vocabList:
returnVec[vocabList.index(word)] = 1
else:
print "the word: %s is not in my vocabulary" % word
return returnVec
def trainNB0(trainMatrix, trainCategory):
    """Estimate naive-Bayes word frequencies for each class and the prior.

    Args:
        trainMatrix: 2-D array-like of word-count/presence vectors, one row
            per document.
        trainCategory: array-like of 0/1 labels, one per document
            (1 = abusive class).

    Returns:
        (p0Vec, p1Vec, pAbusive): per-word frequency vectors for class 0
        and class 1, and the fraction of class-1 documents.

    NOTE(review): raw frequencies are returned (no smoothing, no logs) —
    a word unseen in a class gets probability 0, exactly as the original.
    """
    docs = np.asarray(trainMatrix, dtype=float)
    labels = np.asarray(trainCategory)
    # Prior: fraction of documents labeled abusive.
    pAbusive = labels.sum() / float(docs.shape[0])
    # Boolean-mask the rows of each class, then normalize the per-word
    # totals by the class's overall token total — identical arithmetic to
    # accumulating row-by-row in a loop.
    classOne = docs[labels == 1]
    classZero = docs[labels == 0]
    p1Vec = classOne.sum(axis=0) / classOne.sum()
    p0Vec = classZero.sum(axis=0) / classZero.sum()
    return p0Vec, p1Vec, pAbusive
# --- Demo: train the naive-Bayes model on the toy corpus above. ---
listOfPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOfPosts)
# One binary presence vector per post, all over the same vocabulary order.
trainMatrix = []
for doc in listOfPosts:
    trainMatrix.append(setOfWords2Vec(myVocabList, doc))
p0v, p1v, pAb = trainNB0(trainMatrix, listClasses)
# print() call form: the Python 2 `print x` statement is a SyntaxError
# under Python 3.
print(pAb)
print(p0v)