机器学习实战 朴素贝叶斯算法

# -- coding: utf-8 --

from bayes import*
import matplotlib.pyplot as plt

# Build the vocabulary from the toy corpus and display it.
ListOposts, listClasses = loadDataSet()
myVocablist = creatVocabList(ListOposts)
print(myVocablist)

# Encode every post as a set-of-words vector, then train on those vectors.
trainMat = [setOfWords2Vec(myVocablist, post) for post in ListOposts]
print(trainMat)
p0v, p1v, pAb = trainNB0(trainMat, listClasses)
print(p1v)
print(p0v)

# Classify two hand-written posts that were not part of the training data.
testEntry = [['love', 'my', 'dalmation'],
             ['stupid', 'garbage']]

for entry in testEntry:
    t = setOfWords2Vec(myVocablist, entry)
    print(t)
    print(classifyNB(array(t), array(p0v), array(p1v), pAb))

# Finish with the hold-out evaluation on the e-mail files.
spamTest()



# -- coding: utf-8 --
#bayes

from numpy import*
from math import log
import operator
import matplotlib.pyplot as plt

def loadDataSet():
    """Return a small hand-labelled corpus of tokenised posts.

    Returns:
        A ``(postingList, classVec)`` pair. ``postingList`` holds six posts,
        each a list of word tokens; ``classVec`` is the parallel list of
        labels, where 1 marks an abusive post and 0 a normal one.
    """
    labelledPosts = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    postingList = [tokens for _, tokens in labelledPosts]
    classVec = [label for label, _ in labelledPosts]
    return postingList, classVec

def creatVocabList(dataSet):
    """Build the vocabulary: every distinct word found in any document.

    Args:
        dataSet: iterable of documents, each an iterable of hashable words.

    Returns:
        A list containing each distinct word once. The order is whatever
        iterating the underlying set yields.
    """
    uniqueWords = set()
    for doc in dataSet:
        # Union with a fresh set per document rather than updating in place.
        uniqueWords = uniqueWords.union(set(doc))
    return list(uniqueWords)

def setOfWords2Vec(vocabList, inputSet):
    """Encode a document with the set-of-words model (presence flags).

    Args:
        vocabList: list of vocabulary words; positions define the vector.
        inputSet: iterable of words making up the document.

    Returns:
        A list as long as ``vocabList`` holding 1 at the position of every
        vocabulary word that occurs in ``inputSet`` and 0 elsewhere. Words
        missing from the vocabulary are reported on stdout and skipped.
    """
    presence = [0 for _ in range(len(vocabList))]
    for token in inputSet:
        if token not in vocabList:
            print('the word : %s is not in my Vocabulary!' % token)
            continue
        presence[vocabList.index(token)] = 1
    return presence


def bagOfWords2VecMN(vocabList, inputSet):
    """Encode a document with the bag-of-words model (occurrence counts).

    Args:
        vocabList: list of vocabulary words; positions define the vector.
        inputSet: iterable of words making up the document.

    Returns:
        A list as long as ``vocabList`` where each entry is the number of
        times that vocabulary word occurs in ``inputSet``. Words missing
        from the vocabulary are silently ignored.
    """
    counts = [0 for _ in range(len(vocabList))]
    for token in inputSet:
        if token not in vocabList:
            continue
        counts[vocabList.index(token)] += 1
    return counts


def trainNB0(trainMatrix, trainCategory):
    """Train the two-class naive Bayes model.

    Args:
        trainMatrix: sequence of equal-length numeric word vectors, one per
            training document.
        trainCategory: parallel sequence of labels. A label equal to 1 is
            counted towards class 1; any other value goes to class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: two lists of per-word log
        probabilities (class 0, then class 1) and the prior of class 1,
        computed as the label sum divided by the number of documents.
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docCount)

    # Word counts start at one and denominators at two so that no word ends
    # up with probability zero, which would wipe out the whole product.
    wordCounts = {0: ones(vocabSize), 1: ones(vocabSize)}
    wordTotals = {0: 2.0, 1: 2.0}

    for docIdx, row in enumerate(trainMatrix):
        label = 1 if trainCategory[docIdx] == 1 else 0
        wordCounts[label] += row
        wordTotals[label] += sum(row)

    # Keep log probabilities: a long product of small numbers would
    # underflow to zero, a sum of logs does not.
    p1Vect = [log(count / wordTotals[1]) for count in wordCounts[1]]
    p0Vect = [log(count / wordTotals[0]) for count in wordCounts[0]]
    return p0Vect, p1Vect, pAbusive


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify one word vector with the trained naive Bayes model.

    Args:
        vec2Classify: word vector of the document; it is multiplied
            element-wise with the log-probability vectors.
        p0Vec: per-word log probabilities for class 0.
        p1Vec: per-word log probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 when the class-1 log score is strictly greater than the class-0
        log score, otherwise 0 (ties go to class 0).
    """
    score1 = sum(vec2Classify * p1Vec) + log(pClass1)
    score0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if score1 > score0 else 0

def textParse(bigString):
    """Split raw text into lowercase word tokens longer than two characters.

    Args:
        bigString: the text to tokenise.

    Returns:
        A list of lowercased tokens, in order of appearance, keeping only
        those with more than two characters.
    """
    import re
    # Split on runs of NON-word characters. The previous pattern r'\w*'
    # matched the words themselves, so the words were thrown away and only
    # the separators between them were returned.
    rawTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in rawTokens if len(tok) > 2]

def spamTest():
    """Evaluate the classifier on the e-mail files with a random hold-out.

    Reads files 1..25 from the spam folder and from the ham folder (50
    documents, spam labelled 1 and ham labelled 0), draws 10 of them at
    random as the test set, trains on the remaining 40 with set-of-words
    vectors, and prints the fraction of test documents misclassified.

    Returns:
        None. The error rate is only printed.

    Raises:
        OSError: if one of the e-mail files cannot be opened.
    """
    # (label, path pattern) for the two folders; spam is read first so the
    # documents alternate spam, ham, spam, ham, ...
    sources = (
        (1, r'E:\file\python\test\test\bayes_data\email\spam/%d.txt'),
        (0, r'E:\file\python\test\test\bayes_data\email\ham/%d.txt'),
    )
    docList = []
    classList = []
    for i in range(1, 26):
        for label, pathPattern in sources:
            # The context manager closes each file as soon as it is read.
            with open(pathPattern % i) as emailFile:
                docList.append(textParse(emailFile.read()))
            classList.append(label)
    vocabList = creatVocabList(docList)

    # A real list is required: entries are removed below, and on Python 3 a
    # bare range object does not support item deletion.
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        # `random` is numpy's random module, brought in by the star import.
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMat = [setOfWords2Vec(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    erroCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            erroCount += 1
    print('the error rate is ', float(erroCount)/len(testSet))

猜你喜欢

转载自blog.csdn.net/fm904813255/article/details/80369254