示例:使用朴素贝叶斯过滤垃圾邮件

准备数据:切分文本

>>> mySent='This book is best book on Python or M.L. I have ever laid eyes upon.'
>>> mySent.split()
['This', 'book', 'is', 'best', 'book', 'on', 'Python', 'or', 'M.L.', 'I', 'have', 'ever', 'laid', 'eyes', 'upon.']
>>> import re#使用正则表示式来切分句子
>>> regEx = re.compile('\\W*')
>>> listOfTokens = regEx.split(mySent)

Warning (from warnings module):
  File "E:\Python\bayes.py", line 1
    from numpy import *
FutureWarning: split() requires a non-empty pattern match.
>>> listOfTokens
['This', 'book', 'is', 'best', 'book', 'on', 'Python', 'or', 'M', 'L', 'I', 'have', 'ever', 'laid', 'eyes', 'upon', '']
>>> [tok for tok in listOfTokens if len(tok) > 0]#计算每个字符串的长度,只返回长度大于0的字符串
#句子中的第一个单词是大写的,如果目的是查找句子,这个特点会很有用,但这里的文本只看成词袋,所以希望所有词的形式都是统一的
['This', 'book', 'is', 'best', 'book', 'on', 'Python', 'or', 'M', 'L', 'I', 'have', 'ever', 'laid', 'eyes', 'upon']
>>> [tok.lower() for tok in listOfTokens if len(tok) > 0]#将字符串全部转换为小写(.lower())或者大写(.upper())
['this', 'book', 'is', 'best', 'book', 'on', 'python', 'or', 'm', 'l', 'i', 'have', 'ever', 'laid', 'eyes', 'upon']
>>> emailText = open('email/ham/6.txt').read()
>>> listOfTokens=regEx.split(emailText)
>>> listOfTokens
['Hello', 'Since', 'you', 'are', 'an', 'owner', 'of', 'at', 'least', 'one', 'Google', 'Groups', 'group', 'that', 'uses', 'the', 'customized', 'welcome', 'message', 'pages', 'or', 'files', 'we', 'are', 'writing', 'to', 'inform', 'you', 'that', 'we', 'will', 'no', 'longer', 'be', 'supporting', 'these', 'features', 'starting', 'February', '2011', 'We', 'made', 'this', 'decision', 'so', 'that', 'we', 'can', 'focus', 'on', 'improving', 'the', 'core', 'functionalities', 'of', 'Google', 'Groups', 'mailing', 'lists', 'and', 'forum', 'discussions', 'Instead', 'of', 'these', 'features', 'we', 'encourage', 'you', 'to', 'use', 'products', 'that', 'are', 'designed', 'specifically', 'for', 'file', 'storage', 'and', 'page', 'creation', 'such', 'as', 'Google', 'Docs', 'and', 'Google', 'Sites', 'For', 'example', 'you', 'can', 'easily', 'create', 'your', 'pages', 'on', 'Google', 'Sites', 'and', 'share', 'the', 'site', 'http', 'www', 'google', 'com', 'support', 'sites', 'bin', 'answer', 'py', 'hl', 'en', 'answer', '174623', 'with', 'the', 'members', 'of', 'your', 'group', 'You', 'can', 'also', 'store', 'your', 'files', 'on', 'the', 'site', 'by', 'attaching', 'files', 'to', 'pages', 'http', 'www', 'google', 'com', 'support', 'sites', 'bin', 'answer', 'py', 'hl', 'en', 'answer', '90563', 'on', 'the', 'site', 'If', 'you抮e', 'just', 'looking', 'for', 'a', 'place', 'to', 'upload', 'your', 'files', 'so', 'that', 'your', 'group', 'members', 'can', 'download', 'them', 'we', 'suggest', 'you', 'try', 'Google', 'Docs', 'You', 'can', 'upload', 'files', 'http', 'docs', 'google', 'com', 'support', 'bin', 'answer', 'py', 'hl', 'en', 'answer', '50092', 'and', 'share', 'access', 'with', 'either', 'a', 'group', 'http', 'docs', 'google', 'com', 'support', 'bin', 'answer', 'py', 'hl', 'en', 'answer', '66343', 'or', 'an', 'individual', 'http', 'docs', 'google', 'com', 'support', 'bin', 'answer', 'py', 'hl', 'en', 'answer', '86152', 'assigning', 'either', 'edit', 'or', 'download', 'only', 'access', 'to', 
'the', 'files', 'you', 'have', 'received', 'this', 'mandatory', 'email', 'service', 'announcement', 'to', 'update', 'you', 'about', 'important', 'changes', 'to', 'Google', 'Groups', '']

测试算法:使用朴素贝叶斯进行交叉验证
这种随机选择数据的一部分作为训练集,而剩余部分作为测试集的过程称为留存交叉验证。
random.uniform(x, y) 方法将随机生成一个实数,它在 [x,y] 范围内。
第一个错误

Traceback (most recent call last):
  File "<pyshell#59>", line 1, in <module>
    bayes.spamTest()
  File "E:\Python\bayes.py", line 85, in spamTest
    wordList = textParse(open('email/ham/%d.txt' % i).read())
UnicodeDecodeError: 'gbk' codec can't decode byte 0xae in position 199: illegal multibyte sequence

很眼熟的错误,回头翻了一下之前的记录,应该还是打开文件时的编码出错了
网上的第一种解法

wordList = textParse(open('email/spam/%d.txt' % i, "rb").read().decode('GBK','ignore') )
wordList = textParse(open('email/ham/%d.txt' % i,  "rb").read().decode('GBK','ignore') )

不得行啊

Traceback (most recent call last):
  File "<pyshell#64>", line 1, in <module>
    bayes.spamTest()
  File "E:\Python\bayes.py", line 81, in spamTest
    wordList = textParse(open('email/spam/%d.txt' % i).read().decode('GBK','ignore'))#依次打开25个记事本,并进行文本分切
AttributeError: 'str' object has no attribute 'decode'

第二种,可以

wordList = textParse(open('email/spam/%d.txt' % i,'r',encoding = 'UTF-8',errors='ignore').read())
wordList = textParse(open('email/ham/%d.txt' % i,'r',encoding = 'UTF-8',errors='ignore').read())

第二个错误

Traceback (most recent call last):
  File "<pyshell#68>", line 1, in <module>
    bayes.spamTest()
  File "E:\Python\bayes.py", line 94, in spamTest
    del(trainingSet[randIndex])#删去trainingSet中已被随机挑选出的元素
TypeError: 'range' object doesn't support item deletion
trainingSet = range(50)改为:trainingSet = list(range(50))
def textParse(bigString):
    """Split raw email text into a list of lowercase word tokens.

    Splits on runs of one or more non-word characters and keeps only
    tokens longer than two characters, which filters out single letters
    (e.g. the 'M', 'L' left over from 'M.L.') and short URL fragments.

    Args:
        bigString: the full text of one email as a single string.

    Returns:
        list[str]: lowercase tokens with len > 2.
    """
    import re
    # r'\W+' instead of r'\W*': a pattern that can match the empty string
    # is what produced the "split() requires a non-empty pattern match"
    # FutureWarning seen when running this code (error on newer Python).
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

def spamTest():
    """Train a naive Bayes spam classifier and evaluate it by hold-out cross validation.

    Loads 25 spam and 25 ham emails from email/spam/ and email/ham/,
    randomly holds out 10 of the 50 documents as a test set, trains on
    the remaining 40 with trainNB0, classifies the held-out documents
    with classifyNB, and reports the error rate.

    Returns:
        float: the test-set error rate (also printed), so callers can
        average it over repeated runs instead of reading stdout.
    """
    docList = []; classList = []; fullText = []
    for i in range(1, 26):  # each folder holds files 1.txt .. 25.txt
        # 'with' closes each file; errors='ignore' skips the stray non-UTF-8
        # bytes that otherwise raise UnicodeDecodeError on some emails.
        with open('email/spam/%d.txt' % i, 'r', encoding='UTF-8', errors='ignore') as f:
            wordList = textParse(f.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)  # 1 = spam
        with open('email/ham/%d.txt' % i, 'r', encoding='UTF-8', errors='ignore') as f:
            wordList = textParse(f.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)  # 0 = ham
    vocabList = createVocabList(docList)  # vocabulary of unique words over all 50 docs
    # list() is required: a bare range object does not support item deletion.
    trainingSet = list(range(50)); testSet = []
    for i in range(10):  # hold out 10 random documents for testing
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])  # remove so it cannot be picked again
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:  # build word vectors + labels for the 40 training docs
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify each held-out document
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    errorRate = float(errorCount) / len(testSet)
    print('the error rate is: ', errorRate)
    return errorRate
>>> import bayes
>>> from imp import reload
>>> reload(bayes)
<module 'bayes' from 'E:\\Python\\bayes.py'>
>>> bayes.spamTest()

Warning (from warnings module):
  File "E:\Python\lib\re.py", line 212
    return _compile(pattern, flags).split(string, maxsplit)
FutureWarning: split() requires a non-empty pattern match.
the error rate is:  0.0
>>> bayes.spamTest()
the error rate is:  0.0
>>> bayes.spamTest()
the error rate is:  0.0

猜你喜欢

转载自blog.csdn.net/qq_42799920/article/details/81321699