# Applying Naive Bayes to NLP (text classification)
import numpy as np
def loadDataSet():
    # Toy training set: tokenized posts plus labels (1 = abusive/insulting, 0 = normal)
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec
def createVocabList(dataSet):
    vocabSet = set()
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)
def bagofWord2VecMN(vocablist, inputSet):
    # Bag-of-words vector: count occurrences of each vocabulary word in inputSet
    returnVec = [0] * len(vocablist)
    for word in inputSet:
        if word in vocablist:
            returnVec[vocablist.index(word)] += 1
    return returnVec
def trainNb0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    # Class prior P(Y=1)
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Laplace smoothing: start word counts at 1 and denominators at 2
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 0:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
        else:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
    # Log conditional probabilities log P(w|Y) to avoid floating-point underflow
    p0Vect = np.log(p0Num / p0Denom)
    p1Vect = np.log(p1Num / p1Denom)
    return p0Vect, p1Vect, pAbusive
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # Compare log posteriors: sum of word log-likelihoods plus the log prior
    p1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = np.sum(vec2Classify * p0Vec) + np.log(1 - pClass1)
    return 1 if p1 > p0 else 0
def tsNB():
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(bagofWord2VecMN(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNb0(np.array(trainMat), np.array(listClasses))
    testEntry = ["stupid", "not", "not", "dfdd", "not", "not", "dfdd"]
    thisDoc = np.array(bagofWord2VecMN(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))
if __name__ == '__main__':
    tsNB()
'''
Approach:
1. Build bag-of-words vectors: one slot per unique word in the training vocabulary
   (these are count vectors built from the vocabulary, not Word2Vec embeddings).
2. Convert each training record into such a vector; together they form the training matrix.
3. For Naive Bayes classification, first estimate the class frequencies, i.e. P(Y=Ci);
   p0V and p1V hold the per-word (log) frequencies within class 0 and class 1,
   with Laplace smoothing applied. A cross-check sketch follows below.
'''
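# A minimal cross-check sketch, assuming scikit-learn is available; CountVectorizer,
# MultinomialNB and their parameters below are sklearn's API, not part of the script above.
# MultinomialNB(alpha=1.0) uses add-one (Laplace) smoothing, similar in spirit to the
# ones/2.0 initialization in trainNb0 (sklearn adds the vocabulary size to the denominator).
def sklearnCrossCheck():
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.naive_bayes import MultinomialNB

    posts, labels = loadDataSet()
    docs = [' '.join(words) for words in posts]          # join tokens back into strings
    vectorizer = CountVectorizer(token_pattern=r'\S+')   # tokenize on whitespace only
    X = vectorizer.fit_transform(docs)                   # bag-of-words count matrix
    clf = MultinomialNB(alpha=1.0).fit(X, labels)
    test = vectorizer.transform(['stupid garbage dog'])
    print('sklearn predicts:', clf.predict(test))        # should print [1] on this toy data

# Call sklearnCrossCheck() manually (with scikit-learn installed) to compare against tsNB().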