第4章:Naive Bayes
Naive Bayes:监督学习。通过计算概率进行分类。
优点:数据量小的情况下仍然有效,可以处理多类别问题。
缺点:对于输入数据的准备方式比较敏感。
适用于标称型数据。
公式:P(c_i|w) = P(w|c_i)P(c_i) / P(w)
如果把 w 展开为独立特征,假设每个特征相互独立,就成了 P(w|c_i) = P(w_0|c_i)P(w_1|c_i)...P(w_N|c_i),用来计算概率。
P1:计算文本分类概率
def loadDataSet():
    """Return a toy dataset: six posts and their labels (1 = abusive, 0 = not)."""
    posts = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    labels = [0, 1, 0, 1, 0, 1]
    return posts, labels
def createVocabList(dataSet):
    """Collect every distinct word across all documents into one list."""
    vocab = set()
    for document in dataSet:
        vocab.update(document)  # union with this document's words
    return list(vocab)
def setOfWords2Vec(vocabList, inputList):
    """Return a 0/1 vector marking which vocabulary words occur in inputList."""
    returnVec = [0] * len(vocabList)
    for word in inputList:
        try:
            # set-of-words model: presence only, not counts
            returnVec[vocabList.index(word)] = 1
        except ValueError:
            print("the word: %s is not in my vocabulary" % word)
    return returnVec
拿数据集测试,得到结果如下:
测试:
def trainNB0(trainMatrix, trainCategory):
    """Train a Naive Bayes model on binary/count word vectors.

    Args:
        trainMatrix: 2-D numpy array, one row per document (word vectors).
        trainCategory: 1-D array of 0/1 class labels, one per row.

    Returns:
        (p0Vect, p1Vect, pAbusive): log conditional word probabilities for
        class 0 and class 1, and the prior probability of class 1.
    """
    numTrainDocs = len(trainMatrix)     # number of documents
    numWords = len(trainMatrix[0])      # vocabulary size
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Laplace smoothing: start counts at 1 and denominators at 2.0 so that
    # a word unseen in a class never yields probability 0 (which would
    # annihilate the whole product in classifyNB).
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        # accumulate per-word counts and the total word count per class
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # log-space probabilities avoid float underflow from many tiny factors,
    # and match classifyNB, which sums logs and adds log(pClass1)
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
输入p0V, p1V, pAb = nbayes.trainNB0(trainMat, listClasses)得到词汇表中每个词在每个分类中的概率,文档总分类概率
在使用bayes进行分类时要计算多个概率的乘积,如果其中一个概率为0,那么最后乘积也是0。为了降低这种影响,可以把所有词的出现数初始化为1,分母初始化为2。修改上面的代码:
p0Num = ones(numWords)
p1Num = ones(numWords)
p0Denom = 2.0
p1Denom = 2.0
还可能因为太多很小的数字相乘,导致下溢出,可以通过采用自然对数解决。采用自然对数进行处理不会有任何损失,下图是f(x)和ln(f(x))的曲线:在相同区域内同时增加或减少,并且在相同点取到极值。虽然取值不同,但不影响最终结果。
修改代码:
p1Vect = log(p1Num/p1Denom)
p0Vect = log(p0Num/p0Denom)
一个分类器:
def classifyNB(vec2Classify, p0Vect, p1Vect, pClass1):
    """Pick the class with the higher log-posterior score.

    vec2Classify is a word vector; p0Vect/p1Vect are per-class log word
    probabilities; pClass1 is the prior probability of class 1.
    """
    # dot the word vector with each class's log probabilities, add log prior
    logPosterior1 = sum(vec2Classify * p1Vect) + log(pClass1)
    logPosterior0 = sum(vec2Classify * p0Vect) + log(1.0 - pClass1)
    return 1 if logPosterior1 > logPosterior0 else 0
挨你喂,看测试代码……
def testingNB():
    """Train on the toy dataset and classify two sample sentences."""
    posts, labels = loadDataSet()
    vocab = createVocabList(posts)
    trainMat = [setOfWords2Vec(vocab, doc) for doc in posts]
    p0V, p1V, pAb = trainNB0(array(trainMat), array(labels))
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage', 'dalmation']):
        thisDoc = array(setOfWords2Vec(vocab, testEntry))
        print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
得到结果
['love', 'my', 'dalmation'] classified as: 0
['stupid', 'garbage', 'dalmation'] classified as: 1
将每个词的出现与否作为一个特征,这称为set-of-words model。如果一个词在文档出现不止一次,可能意味包含更多信息,称为bag-of-words model。
在词袋中,每个单词可以出现多次,而在词集中,每个词只能出现一次。
def bagOfWordsVecMN(vocabList, inputList):
    """Bag-of-words model: count how many times each vocabulary word occurs."""
    counts = [0] * len(vocabList)
    for word in inputList:
        if word not in vocabList:
            continue  # silently skip out-of-vocabulary words
        counts[vocabList.index(word)] += 1
    return counts
P2:词语分割,判断文本邮件
mySentence = 'This book is the best book on Python I have ever laid eyes upon.'
mySentence.split()
上面代码分割的结果中,最后一个元素会带有标点:
['This', 'book', 'is', 'the', 'best', 'book', 'on', 'Python', 'I', 'have', 'ever', 'laid', 'eyes', 'upon.']
使用正则进行分割:(\w:用于匹配字母,数字或下划线字符; \W:用于匹配所有与\w不匹配的字符; )
import re
wordList = re.split('\W+', mySentence)
print(wordList)
再使用去掉大小写,去掉空元素:
[w.lower() for w in wordList if len(w) > 0]
接下来读取两个文件夹的文件进行测试:
def textParse(bigString):
    """Tokenize a raw string: split on non-word characters, lowercase,
    and drop short tokens (len <= 2) such as 'a', 'is', 'to'.
    """
    import re
    # raw string: '\W+' without r-prefix is an invalid escape sequence
    # and raises a SyntaxWarning on modern Python
    wordList = re.split(r'\W+', bigString)
    return [w.lower() for w in wordList if len(w) > 2]
def spamTest():
    """Hold-out cross validation of the spam classifier on 25 spam + 25 ham mails."""
    docList = []    # one token list per document
    classList = []  # parallel list of labels (1 = spam, 0 = ham)
    fullText = []   # flat list of every token seen
    for i in range(1, 26):
        for path, label in (('email/spam/%d.txt' % i, 1), ('email/ham/%d.txt' % i, 0)):
            tokens = textParse(open(path, 'rb').read().decode('GBK', 'ignore'))
            docList.append(tokens)
            fullText.extend(tokens)
            classList.append(label)
    vocabList = createVocabList(docList)
    # hold-out cross validation: move 10 random indices out of the 50 into the test set
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))
    # build word vectors for the remaining 40 training documents and train
    trainMat = [setOfWords2Vec(vocabList, docList[idx]) for idx in trainingSet]
    trainClasses = [classList[idx] for idx in trainingSet]
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    # measure the error rate on the held-out documents
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print("error rate: ", float(errorCount)/len(testSet))
(怎么感觉代码的计算思想和bayes对不上号……T T大年初一的丧气唉~)
1. 从已经被判别好分类的邮件,把邮件里的词抽出来,与词汇集对比是否出现。
2. 选出训练集,计算这些训练集中各词汇的分类概率。
3. 把训练集的分类概率,与测试集的进行比较,把判断出的分类,和真实分类对比计算错误率。
P3:广告倾向
RSS feed里面比较两种信息的分类
def calcMostFreq(vocabList, fullText, topN=30):
    """Return the topN (word, count) pairs, most frequent first.

    Args:
        vocabList: words to count.
        fullText: flat list of tokens to count occurrences in.
        topN: how many of the most frequent pairs to return (default 30,
              matching the original hard-coded behavior).
    """
    import operator
    freqDict = {}
    for w in vocabList:
        freqDict[w] = fullText.count(w)
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:topN]
def localWords(feed1, feed0):
    """Train/test Naive Bayes on two RSS feeds (class 1 = feed1, class 0 = feed0).

    Returns (vocabList, p0V, p1V) so callers can inspect the most
    indicative words per class. Also prints the hold-out error rate.
    """
    docList = []
    classList = []
    fullText = []
    # BUG FIX: the original closed min()'s paren too early —
    # `min(len(feed1['entries'])), len(feed0['entries'])` — which bound
    # minLen to a tuple and crashed range(minLen). Compare BOTH lengths.
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    # drop the 30 most frequent words — likely stop words / noise
    top30words = calcMostFreq(vocabList, fullText)
    for pairW in top30words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    # hold-out cross validation: 20 random documents become the test set
    trainingSet = list(range(2 * minLen))
    testSet = []
    for i in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWordsVecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWordsVecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print("error rate: ", float(errorCount)/len(testSet))
    return vocabList, p0V, p1V
把两个分类中出现率最高的找出来:
def getTopWords(feed0, feed1):
    """Print the most indicative words (log-probability > -6.0) for each class,
    most probable first: class-0 words, then class-1 words.
    """
    vocabList, p0V, p1V = localWords(feed0, feed1)
    top0 = []
    top1 = []
    for i in range(len(p0V)):
        # keep (word, log-prob) pairs above the threshold for each class
        if p0V[i] > -6.0:
            top0.append((vocabList[i], p0V[i]))
        if p1V[i] > -6.0:
            # BUG FIX: original stored p0V[i] here, ranking class-1 words
            # by the wrong class's probabilities
            top1.append((vocabList[i], p1V[i]))
    sortedFeed0 = sorted(top0, key=lambda pair: pair[1], reverse=True)
    for item in sortedFeed0:
        print(item[0])
    # BUG FIX: original sorted top0 again, so class-1 words were never printed
    sortedFeed1 = sorted(top1, key=lambda pair: pair[1], reverse=True)
    for item in sortedFeed1:
        print(item[0])
对于分类而言,使用概率有时候比用规则更有效。bayes利用已知值来估计未知概率。
实现bayes实现需要考虑很多实际因素,下溢出也是其中的一个问题,可以通过对数来解决。
词袋模型比词集模型在处理文档分类问题上更好。
使用停用词(筛选出一些冗余词汇)也能做一些改进。
文末吐槽:
这点积+对数概率到底和bayes什么关系………………