# -*- coding: utf-8 -*-
# MLiA笔记_贝叶斯
from numpy import *


# 4.1 词表到向量的转换函数

# loadDataSet()创建了一些实验样本。该函数返回的第一个变量是进行词条切分后的文档集合。这些留言文本被切分成一系列的词条集合，标点符号从文本中去掉。
# loadDataSet()函数返回的第二个变量是一个类别标签的集合，有侮辱性和非侮辱性两类，这些文本的类别由人工标注。
def loadDataSet():
    """Return the small hand-labelled message-board corpus used by the demos.

    Returns:
        A ``(postingList, classVec)`` pair. ``postingList`` is a list of six
        posts, each already split into a list of word tokens with the
        punctuation removed. ``classVec`` holds one label per post:
        1 for abusive text, 0 for normal speech.
    """
    # The tokens 'dalmation' and 'worthless' are spelled the same way
    # everywhere so that one word does not end up as two vocabulary entries.
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 = abusive, 0 = normal
    return postingList, classVec


# createVocabList(dataSet)函数会创建一个包含在所有文档中出现的不重复词的列表，为此使用了python的set数据类型。
# 将词条列表输给set构造函数，set就会返回一个不重复词表
def createVocabList(dataSet):
    """Build the vocabulary: every distinct word found in any document.

    Args:
        dataSet: iterable of documents, each an iterable of words.

    Returns:
        A list holding each distinct word exactly once. The order comes
        from set iteration and is therefore arbitrary.
    """
    # A set keeps a single copy of every word no matter how often it occurs.
    uniqueWords = {word for document in dataSet for word in document}
    return list(uniqueWords)

# 获得词表后，便可以使用setOfWords2Vec()函数，该函数的输入参数为词汇表及某个文档，输出的是文档向量（向量的每一元素为1或0，分别表示词汇表中的单词在输入文档中是否出现）
def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a set-of-words (presence/absence) vector.

    Args:
        vocabList: list of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: iterable of the words in the document.

    Returns:
        A list as long as ``vocabList`` holding 1 where the vocabulary word
        occurs in the document and 0 elsewhere. Words missing from the
        vocabulary are reported on stdout and otherwise ignored.
    """
    # Build the word -> position lookup once instead of scanning vocabList
    # twice (``in`` and ``.index``) for every word. ``setdefault`` keeps the
    # first position of a repeated word, exactly like ``list.index``.
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        wordIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = wordIndex.get(word)
        if position is None:
            # Single-argument call form prints the same on Python 2 and 3.
            print("the word: %s is not in my vocabulary!" % word)
        else:
            returnVec[position] = 1
    return returnVec


# 函数trainNB0()朴素贝叶斯分类器训练函数，函数中的输入参数为文档矩阵trainMatrix，以及由每篇文档类别标签所构成的向量trainCategory。
def trainNB0(trainMatrix, trainCategory):
    """Train a two-class naive Bayes model on word vectors.

    Args:
        trainMatrix: 2-D array-like with one row per document and one
            column per vocabulary word (presence flags or counts).
        trainCategory: one label per row, 1 for abusive and 0 for normal.

    Returns:
        A ``(p0Vect, p1Vect, pAbusive)`` tuple. ``p0Vect`` and ``p1Vect``
        are arrays with the natural log of the smoothed per-word
        conditional probabilities for class 0 and class 1. ``pAbusive`` is
        the fraction of training documents labelled 1, so the class-0
        prior is ``1 - pAbusive``.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)

    # Both accumulators hold one entry per vocabulary word. Counts start at
    # 1 and the denominators at 2.0 so that a word never seen in a class
    # does not get probability zero (and an empty class does not divide by
    # zero).
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0

    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]          # per-word counts, class 1
            p1Denom += sum(trainMatrix[i])   # total words seen in class 1
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])

    # Return logarithms: classifyNB adds these scores to log priors, and
    # summing logs avoids the underflow of multiplying many small numbers.
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive


# 4.3 朴素贝叶斯分类函数
# 函数的四个输入：要分类的向量vec2Classify，以及使用函数trainNB0()计算得到的三个概率
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the more likely of the two classes for one word vector.

    Args:
        vec2Classify: array with the word vector of the document.
        p0Vec: array of per-word scores for class 0.
        p1Vec: array of per-word scores for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 when the class-1 score is strictly greater than the class-0
        score, otherwise 0 (ties go to class 0).
    """
    # The element-wise product weights each per-word score by the matching
    # entry of the document vector; the log prior is then added on top.
    scoreAbusive = sum(vec2Classify * p1Vec) + log(pClass1)
    scoreNormal = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return int(scoreAbusive > scoreNormal)

# 便利函数，该函数封装所有操作
def testingNB():
    """Train on the toy corpus and print the class of two sample posts.

    Convenience wrapper: loads the sample posts, builds the vocabulary,
    vectorises every post, trains the classifier once on the complete
    matrix and prints the predicted class of two hard-coded test entries.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, postinDoc)
                for postinDoc in listOPosts]

    # Train only after every post has been vectorised, so the matrix has
    # one row per label.
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))

    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
        # Single-argument call form prints the same on Python 2 and 3.
        print("%s classified as: %s"
              % (testEntry, classifyNB(thisDoc, p0V, p1V, pAb)))

# 4.4 朴素贝叶斯词袋模型
# 如果一个词在文档中出现不止一次，这可能意味着包含该词是否出现在文档中所不能表达的某种信息，这种方法被称为词袋模型。
# 在词袋中，每个单词可以出现多次，而在词集中只能出现一次。为适应词袋模型，需对函数setOfWords2Vec()稍加修改为bagofWords2VecMN()
def bagofWords2VecMN(vocabList, inputSet):
    """Convert a document into a bag-of-words (occurrence count) vector.

    Args:
        vocabList: list of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: iterable of the words in the document.

    Returns:
        A list as long as ``vocabList`` in which each entry counts how many
        times that vocabulary word occurs in the document. Words missing
        from the vocabulary are skipped silently.
    """
    counts = [0] * len(vocabList)
    for token in inputSet:
        try:
            slot = vocabList.index(token)
        except ValueError:
            # Not a vocabulary word: nothing to count.
            continue
        # Accumulate rather than set to 1, so repeated words are counted.
        counts[slot] += 1
    return counts


# 4.5 文件解析及完整的垃圾邮件测试函数

def textParse(bigString):
    """Split a block of text into lower-case word tokens.

    Args:
        bigString: the text to tokenise.

    Returns:
        A list of lower-cased tokens obtained by splitting on runs of
        non-word characters. Tokens of two characters or fewer are dropped.
    """
    import re
    # Split on one or more non-word characters. The former pattern r'\W*'
    # can match the empty string, which on Python 3.7+ splits between every
    # character, so no token would survive the length filter below.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

# spamTest()函数对贝叶斯垃圾邮件分类器进行自动化处理。导入文件夹spam与ham下的文本文件，并将它们解析为词列表
def spamTest():
    """Run one random hold-out test of the spam classifier and print the error rate.

    Reads files 1.txt to 25.txt from both the spam and the ham folder,
    tokenises them, holds out 10 randomly chosen e-mails as the test set,
    trains on the remaining ones and prints the fraction of held-out
    e-mails that were misclassified.
    """
    docList = []
    classList = []
    # Raw strings keep the backslashes of the Windows path literal.
    # Label 1 marks spam, label 0 marks ham.
    emailSources = ((r'E:\PDF\MLIA\Ch04\email/spam/%d.txt', 1),
                    (r'E:\PDF\MLIA\Ch04\email/ham/%d.txt', 0))
    for i in range(1, 26):
        for pathPattern, label in emailSources:
            # ``with`` closes each file as soon as it has been read.
            with open(pathPattern % i) as emailFile:
                docList.append(textParse(emailFile.read()))
            # Exactly one label per document keeps classList aligned with
            # docList.
            classList.append(label)

    vocabList = createVocabList(docList)

    # Indices of all documents. A real list is needed because entries are
    # removed below.
    trainingSet = list(range(len(docList)))
    testSet = []
    # Move 10 randomly chosen documents from the training set to the test
    # set.
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    # Vectorise the remaining documents and train on them.
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    # Classify every held-out e-mail and count the mistakes.
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    # Single-argument call form prints the same on Python 2 and 3.
    print('the error rate is:  %s' % (float(errorCount) / len(testSet)))



# 4.6 RSS源分类器及高频词去除函数

# 计算出现频率
# 遍历词汇表中的每个词并统计它在文本中出现的次数
def calcMostFreq(vocabList, fullText, topN=30):
    """Return the vocabulary words that occur most often in the text.

    Args:
        vocabList: iterable of vocabulary words to rank.
        fullText: list of all word tokens of the corpus.
        topN: how many of the most frequent words to return (default 30).

    Returns:
        A list of at most ``topN`` ``(word, count)`` tuples sorted by
        descending count.
    """
    from collections import Counter
    import operator

    # Count every token once up front instead of rescanning the whole text
    # for each vocabulary word.
    occurrences = Counter(fullText)
    freqDict = {token: occurrences[token] for token in vocabList}
    # ``items()`` exists on both Python 2 and 3; ``iteritems()`` does not.
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1),
                        reverse=True)
    return sortedFreq[:topN]


# localWords()使用两个RSS源作为参数，RSS源要在函数外导入，这样做的原因是RSS源会随时间而改变。如果想通过改变代码来比较程序执行的差异，就应该使用相同的输入。
# 重新加载RSS源就会得到新的数据，但很难确定是代码原因还是输入原因导致输出结果的改变。
def localWords(feed1, feed0):
    """Train and test a bag-of-words classifier on two parsed RSS feeds.

    The feeds are passed in already parsed rather than fetched here, so
    repeated runs can be made against the same input.

    Args:
        feed1: parsed feed whose entries are labelled class 1. It must
            support ``feed['entries'][i]['summary']``.
        feed0: parsed feed whose entries are labelled class 0, same layout.

    Returns:
        A ``(vocabList, p0V, p1V)`` tuple: the vocabulary with the most
        frequent words removed, and the two per-word vectors returned by
        ``trainNB0`` for class 0 and class 1.

    Raises:
        ValueError: if either feed has no entries.
    """
    docList = []
    classList = []
    fullText = []
    # Use the same number of entries from both feeds.
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    if minLen == 0:
        raise ValueError('both feeds must contain at least one entry')

    # Read the entry summaries, alternating feed1 (label 1) and feed0
    # (label 0).
    for i in range(minLen):
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)

    vocabList = createVocabList(docList)
    # Drop the most frequent words returned by calcMostFreq from the
    # vocabulary.
    for word, _count in calcMostFreq(vocabList, fullText):
        if word in vocabList:
            vocabList.remove(word)

    # A real list is needed because entries are removed below.
    trainingSet = list(range(2 * minLen))
    testSet = []
    # Hold out 20 random documents, but always keep at least one document
    # for training when the feeds are short.
    numTest = min(20, len(trainingSet) - 1)
    for _ in range(numTest):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagofWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = bagofWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    # Single-argument call form prints the same on Python 2 and 3.
    print('the error rate is : %s' % (float(errorCount) / len(testSet)))
    return vocabList, p0V, p1V


# 4.7 最具表征性的词汇显示函数
# getTopWords()使用两个RSS源作为输入，然后训练并测试朴素贝叶斯分类器，返回使用的概率值。
def getTopWords(ny, sf, threshold=-0.6):
    """Print the most characteristic words of two RSS feeds.

    Trains and tests the classifier through ``localWords(ny, sf)`` and then
    prints, for each feed, every vocabulary word whose per-word score is
    above ``threshold``, highest score first.

    Args:
        ny: parsed feed passed to ``localWords`` as the class-1 feed.
        sf: parsed feed passed to ``localWords`` as the class-0 feed.
        threshold: minimum per-word score a word needs to be listed.
            NOTE(review): the default keeps the original -0.6; the MLiA
            listing appears to use -6.0 -- confirm which is intended.
    """
    vocabList, p0V, p1V = localWords(ny, sf)

    # Collect (word, score) pairs above the threshold for each class.
    # p0V belongs to the sf feed and p1V to the ny feed.
    topNY = []
    topSF = []
    for word, scoreSF, scoreNY in zip(vocabList, p0V, p1V):
        if scoreSF > threshold:
            topSF.append((word, scoreSF))
        if scoreNY > threshold:
            topNY.append((word, scoreNY))

    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print("SF**SF**SF")
    for item in sortedSF:
        print(item[0])

    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print("NY**NY**NY")
    for item in sortedNY:
        print(item[0])
# 猜你喜欢