朴素贝叶斯模型+文本分类+垃圾邮件分类源代码

注意：朴素贝叶斯假设各个特征之间相互条件独立（在文档分类中，即假设一个词汇的出现与其他词汇是否出现无关）。然而同一主题的词汇往往会一起出现，彼此之间存在关联，所以这个假设过于简单；尽管如此，实践表明朴素贝叶斯的分类效果仍然很好。

from numpy import *
#词表到向量的转换函数
def loadDataSet():
	"""Return a tiny hand-labelled corpus for exercising the classifier.

	Returns:
		A ``(postingList, classVec)`` pair. ``postingList`` is a list of six
		tokenised posts (each a list of words) and ``classVec`` is the
		parallel list of integer labels, one per post. Fresh list objects
		are built on every call.
	"""
	# Each entry keeps a post next to its label so the two cannot drift apart.
	labelledPosts=[
		(0,['my','dog','has','flea','problems','help','please']),
		(1,['maybe','not','take','him','to','dog','park','stupid']),
		(0,['my','dalmation','is','so','cute','I','love','him']),
		(1,['stop','posting','stupid','worthless','garbage']),
		(0,['mr','licks','ate','my','steak','how','to','stop','him']),
		(1,['quit','buying','worthless','dog','food','stupid']),
	]
	postingList=[words for _,words in labelledPosts]
	classVec=[label for label,_ in labelledPosts]
	return postingList,classVec

def createVocabList(dataSet):
	"""Collect every distinct token that occurs in any document.

	Args:
		dataSet: iterable of documents, each an iterable of hashable tokens.

	Returns:
		A list containing each distinct token exactly once. The order is
		whatever iterating the underlying set yields, i.e. arbitrary.
	"""
	vocabulary=set()
	for tokens in dataSet:
		# Grow the running union with this document's tokens.
		vocabulary.update(tokens)
	return list(vocabulary)
	
def setOfWords2Vec(vocabList,inputSet):
	"""Encode a document as a 0/1 presence vector over the vocabulary.

	Args:
		vocabList: list of known words; position ``i`` of the result
			corresponds to ``vocabList[i]``.
		inputSet: iterable of words making up the document.

	Returns:
		A list of ``len(vocabList)`` integers: 1 where the vocabulary word
		occurs in ``inputSet`` at least once, 0 elsewhere.

	Side effects:
		Prints one line for every word of ``inputSet`` that is missing from
		``vocabList``.
	"""
	returnVec=[0]*len(vocabList)
	for word in inputSet:
		try:
			# One scan both tests membership and finds the slot.
			position=vocabList.index(word)
		except ValueError:
			# Parenthesised single-argument print emits the same text under
			# the Python 2 print statement and the Python 3 print function.
			print("the word: %s is not in my Vocabulary!" % (word,))
		else:
			returnVec[position]=1
	return returnVec
	
#朴素贝叶斯分类器训练函数
def trainNB0(trainMatrix,trainCategory):
	"""Estimate per-class log word likelihoods and the class-1 prior.

	Args:
		trainMatrix: sequence of equal-length numeric word vectors, one per
			training document (must contain at least one row).
		trainCategory: sequence of labels parallel to ``trainMatrix``. Rows
			whose label equals 1 go to class 1; every other row is counted
			as class 0.

	Returns:
		``(p0Vect, p1Vect, pAbusive)`` where each ``pXVect`` is the
		element-wise log of ``(1 + summed word vectors of the class) /
		(2.0 + total of all entries in those vectors)`` and ``pAbusive`` is
		``sum(trainCategory) / number of documents``.
	"""
	docCount=len(trainMatrix)
	vocabSize=len(trainMatrix[0])
	# Counts start at 1 and totals at 2.0, so no estimate is ever exactly
	# zero and the log below stays finite.
	wordCounts={0:ones(vocabSize),1:ones(vocabSize)}
	totals={0:2.0,1:2.0}
	for row in range(docCount):
		label=1 if trainCategory[row]==1 else 0
		wordCounts[label]+=trainMatrix[row]
		totals[label]+=sum(trainMatrix[row])
	pAbusive=sum(trainCategory)/float(docCount)
	p1Vect=log(wordCounts[1]/totals[1])
	p0Vect=log(wordCounts[0]/totals[0])
	return p0Vect,p1Vect,pAbusive
	
#朴素贝叶斯分类函数
def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
	"""Pick the class with the larger log posterior score.

	Args:
		vec2Classify: numeric word vector of the document to classify.
		p0Vec: per-word log likelihoods for class 0.
		p1Vec: per-word log likelihoods for class 1.
		pClass1: prior probability of class 1; ``1.0 - pClass1`` is used as
			the prior of class 0.

	Returns:
		1 when the class-1 score is strictly greater than the class-0
		score, otherwise 0 (so ties go to class 0).
	"""
	# Score = sum of the log likelihoods weighted by the word vector, plus
	# the log prior of the class.
	scoreClass1=sum(vec2Classify*p1Vec)+log(pClass1)
	scoreClass0=sum(vec2Classify*p0Vec)+log(1.0-pClass1)
	return 1 if scoreClass1>scoreClass0 else 0

def testingNB():
	"""Train on the toy corpus and print the predicted class of two posts.

	Builds the vocabulary and the set-of-words training matrix from
	``loadDataSet()``, fits the model with ``trainNB0`` and prints the
	result of ``classifyNB`` for two hard-coded sample posts.

	Returns:
		None; the results are only printed.
	"""
	listOPosts,listClasses=loadDataSet()
	myVocabList=createVocabList(listOPosts)
	trainMat=[setOfWords2Vec(myVocabList,postinDoc) for postinDoc in listOPosts]
	p0V,p1V,pAb=trainNB0(array(trainMat),array(listClasses))
	for testEntry in (['love','my','dalmation'],['stupid','garbage']):
		thisDoc=array(setOfWords2Vec(myVocabList,testEntry))
		# A single pre-formatted argument gives the same output line under
		# the Python 2 print statement and the Python 3 print function.
		print("%s classified as:  %s" % (testEntry,classifyNB(thisDoc,p0V,p1V,pAb)))
	
def bagOfWords2VecMN(vocabList,inputSet):
	"""Encode a document as a vector of word counts over the vocabulary.

	Args:
		vocabList: list of known words; position ``i`` of the result
			corresponds to ``vocabList[i]``.
		inputSet: iterable of words making up the document (repeats count).

	Returns:
		A list of ``len(vocabList)`` integers holding how many times each
		vocabulary word occurs in ``inputSet``. Words that are not in the
		vocabulary are silently skipped.
	"""
	counts=[0]*len(vocabList)
	for token in inputSet:
		try:
			slot=vocabList.index(token)
		except ValueError:
			# Not a vocabulary word: nothing to count.
			continue
		counts[slot]+=1
	return counts
	
def textParse(bigString):    #input is big string, #output is word list
	"""Split raw text into lower-cased tokens longer than two characters.

	Args:
		bigString: the text to tokenise.

	Returns:
		A list of the lower-cased pieces of ``bigString`` found between runs
		of non-word characters, keeping only pieces with more than two
		characters.
	"""
	import re
	# Use \W+ rather than \W*: a pattern that can match the empty string
	# makes re.split cut between every character on Python 3.7+, which
	# would leave only one-character tokens and an empty result.
	listOfTokens = re.split(r'\W+', bigString)
	return [tok.lower() for tok in listOfTokens if len(tok) > 2]
	
def spamTest():
	"""Run one random hold-out evaluation of the classifier on the e-mail corpus.

	Reads ``email/spam/1.txt`` .. ``email/spam/25.txt`` (label 1) and
	``email/ham/1.txt`` .. ``email/ham/25.txt`` (label 0), relative to the
	current working directory. Ten randomly chosen documents are held out
	as the test set, the model is trained on the rest using bag-of-words
	vectors, and every misclassified test document plus the overall error
	rate are printed.

	Returns:
		None; the results are only printed.

	Raises:
		IOError/OSError: if one of the e-mail files cannot be opened.
	"""
	docList=[]; classList=[]
	for i in range(1,26):
		# Spam first, then ham, for each file number.
		for pathPattern,label in (('email/spam/%d.txt',1),('email/ham/%d.txt',0)):
			# The with-block closes the file as soon as it has been read.
			with open(pathPattern % i) as emailFile:
				docList.append(textParse(emailFile.read()))
			classList.append(label)
	vocabList = createVocabList(docList)#create vocabulary
	# A real list is required: a Python 3 range object cannot drop items.
	trainingSet = list(range(len(docList))); testSet=[]           #create test set
	for i in range(10):
		# `random` is the module exposed by the file's `from numpy import *`.
		randIndex = int(random.uniform(0,len(trainingSet)))
		testSet.append(trainingSet.pop(randIndex))
	trainMat=[]; trainClasses = []
	for docIndex in trainingSet:#train the classifier (get probs) trainNB0
		trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
		trainClasses.append(classList[docIndex])
	p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))
	errorCount = 0
	for docIndex in testSet:        #classify the remaining items
		wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
		if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
			errorCount += 1
			# Single pre-formatted argument: same output on Python 2 and 3.
			print("classification error %s" % (docList[docIndex],))
	print('the error rate is:  %s' % (float(errorCount)/len(testSet),))

一名路过的小码农啊

发布了192 篇原创文章 · 获赞 27 · 访问量 10万+

私信关注

朴素贝叶斯模型+文本分类+垃圾邮件分类源代码

猜你喜欢