Using Naive Bayes in Python to Classify Review Scores

Naive Bayes can be used to classify documents, spam filtering being the classic example. The case below instead classifies review scores: after training, the model judges how many points, between 0 and 5, a sentence deserves.
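The idea, in a nutshell: treat the words of a review as (assumed) independent features and pick the score class c with the highest posterior probability. Working in log space avoids numerical underflow:

$$\hat{c} = \arg\max_{c}\Big(\log P(c) + \sum_{i}\log P(w_i \mid c)\Big)$$

Here $P(c)$ is the prior probability of score $c$ and $P(w_i \mid c)$ is the probability of word $w_i$ appearing in reviews with that score.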
The sample data is first collected with a crawler; it comes from reviews of open online courses.
# coding=utf-8

import urllib2                   # Python 2 only; moved to urllib.request/urllib.error in Python 3
from sgmllib import SGMLParser   # Python 2 only; sgmllib was removed in Python 3
import jieba


class CommentParser(SGMLParser):
    """Collects [comment text, score] pairs from every review table on a page."""

    def __init__(self):
        SGMLParser.__init__(self)
        self.__start_table = False  # inside a <table class="table table-hover">
        self.__start_p = False      # inside a <p> within that table
        self.__value_p = ''         # accumulated comment text
        self.__value_div = ''       # score read from a div's data-score attribute
        self.__p_state = 0          # which <p> we are in; the comment sits in the third
        self.data = []              # list of [comment, score] pairs

    def start_table(self, attr):
        for k, v in attr:
            if k == 'class' and v == 'table table-hover':
                self.__start_table = True

    def end_table(self):
        if self.__start_table:
            self.data.append([self.__value_p, self.__value_div])
            self.__value_p = ''
            self.__value_div = ''
            self.__p_state = 0
            self.__start_table = False

    def start_div(self, attr):
        if self.__start_table:
            for k, v in attr:
                if k == 'data-score':
                    self.__value_div = v

    def end_div(self):
        pass

    def start_p(self, attrs):
        if self.__start_table:
            self.__p_state += 1
            self.__start_p = True

    def end_p(self):
        if self.__start_table:
            self.__start_p = False

    def handle_data(self, data):
        if self.__start_table and self.__start_p and self.__p_state == 3:
            self.__value_p += data


def get_page(url):
    page = urllib2.urlopen(url).read()
    parser = CommentParser()
    parser.feed(page)
    return parser.data


def download():
    # crawl review pages 1-8 and append one '[score]comment' line per review
    url = 'http://coursegraph.com/reviews/'
    for i in range(1, 9):
        value = get_page(url + str(i))
        with open('result.txt', 'a+') as f:
            for row in value:
                f.write('[' + row[1] + ']' + row[0].strip().replace('\n', '').replace('\r', '') + '\n')


def jieba_chn():
    # rewrite each '[score]comment' line as 'score,term1,term2,...'
    all_value = open('result.txt', 'r+').readlines()
    with open('result1.txt', 'w+') as f:
        for row in all_value:
            value = row[1:4]  # the score between the brackets, assuming the '[4.5]' format
            jb = jieba.cut_for_search(row[5:])
            for word in jb:
                if len(word) > 1:  # drop single characters and punctuation
                    value += ',' + word
            f.write(value.encode('utf-8') + '\n')

# download the raw data (run once)
# download()
# segment the downloaded reviews into terms
jieba_chn()
A very simple web crawler; the documents are then split into terms with **jieba** word segmentation, dropping punctuation and single characters along the way. The concrete output can be inspected by downloading the case files.
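As a rough illustration of what the segmentation step produces (the review text below is made up, and the exact terms depend on the jieba version and dictionary):

# coding=utf-8
import jieba

text = u'这门课讲得很棒,学习了很多东西'  # hypothetical review text
terms = [t for t in jieba.cut_for_search(text) if len(t) > 1]  # drop single chars and punctuation
print ','.join(terms)  # something like: 很棒,学习,很多,东西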
Now for the classification with Naive Bayes.
First, read the document data (the classifier below relies on NumPy, so the imports come first):
# coding=utf-8
from numpy import *  # ones, array, log and random are used below


def load_data_set():
    dataSet = []
    labels = []
    with open('result1.txt', 'r+') as f:
        for row in f.readlines():
            t = row.strip().replace('\n', '').split(',')
            labels.append(round(float(t[0]), 1))  # the score, e.g. 4.5
            dataSet.append(t[1:])                 # the list of segmented terms
    return dataSet, labels
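Each line of result1.txt is expected to look like 4.5,学习,很棒,不错 (a made-up example): the score first, then the comma-separated terms. A quick sanity check of the loader:

dataSet, labels = load_data_set()
print len(dataSet), len(labels)  # number of documents and labels should match
print labels[0], dataSet[0][:5]  # e.g. 4.5 and the first few terms of the first review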
Build the vocabulary list, keeping alongside it the label of the document each word came from:
def create_vocab_list(dataSet, labels):
    vocabSet = []
    labelSet = []
    for index, document in enumerate(dataSet):
        # the unique terms of this document, each paired with the document's score
        vocabSet.extend(list(set(document)))
        labelSet.extend([labels[index] for i in range(len(set(document)))])
    return vocabSet, labelSet

Map a document onto the vocabulary; every occurrence of a vocabulary word adds the document's label (its score) at that position, rather than a plain count of 1:

def set_of_words2_vec(vocabList, label, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        for index, r in enumerate(vocabList):
            if r == word:
                returnVec[index] += label  # weight each occurrence by the score
    return returnVec
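A tiny worked example (hypothetical three-word vocabulary) of what the weighting does: with a label of 4.5, every matching word contributes 4.5 to its slot instead of 1:

vocab = ['不错', '很棒', '一般']  # hypothetical vocabulary
print set_of_words2_vec(vocab, 4.5, ['很棒', '不错'])  # [4.5, 4.5, 0]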
Train on the data and work out, for every score class, the probability distribution over terms:
def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    labelSet = list(set(trainCategory))
    # prior probability of each score class
    pAbusive = {}
    for r in labelSet:
        pAbusive[str(r)] = len([row for row in trainCategory if row == r]) \
                           / float(numTrainDocs)
    # Laplace-style smoothing: word counts start at 1, denominators at 2
    pNumber = {}
    pDenom = {}
    for row in labelSet:
        pNumber[str(row)] = ones(numWords)
        pDenom[str(row)] = 2.0
    for i in range(numTrainDocs):
        # divide by the label to undo the score weighting from set_of_words2_vec
        # (assumes no zero scores, which would divide by zero here)
        pNumber[str(trainCategory[i])] += [row / trainCategory[i] for row in trainMatrix[i]]
        pDenom[str(trainCategory[i])] += sum(trainMatrix[i]) / trainCategory[i]

    ret = {}
    for i in range(len(labelSet)):
        # take the log so that classifyNB can sum word terms and add the log prior
        ret[str(labelSet[i])] = log(pNumber[str(labelSet[i])] / pDenom[str(labelSet[i])])

    return ret, pAbusive
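With this smoothing the per-term estimate is

$$P(w_i \mid c) = \frac{1 + \mathrm{count}(w_i, c)}{2 + \sum_j \mathrm{count}(w_j, c)}$$

and trainNB0 returns one log-probability vector per score class plus the dict of class priors, both keyed by the score as a string (e.g. pV['4.5'], assuming 4.5 occurs in the data).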
To classify, compute the log-probability of the test terms under every score class; the class with the highest value is the predicted score.
def classifyNB(vec2Classify, pVec, pClass, trainCategory):
    labelSet = list(set(trainCategory))
    p = {}
    for row in labelSet:
        # log posterior (up to a constant): word log-likelihoods plus the log prior
        p[str(row)] = sum(vec2Classify * pVec[str(row)]) + log(pClass[str(row)])
    m = sorted(p.items(), key=lambda k: k[1], reverse=True)
    return float(m[0][0])  # the score with the highest posterior
Below are the routines for testing the classifier:
def testingNB():
    dataSet, labels = load_data_set()
    vocabSet, labelSet = create_vocab_list(dataSet, labels)
    trainMatrix = []
    for index, row in enumerate(dataSet):
        trainMatrix.append(set_of_words2_vec(vocabSet, labels[index], row))
    pV, pAb = trainNB0(trainMatrix, labels)
    testEntry = ['学习', '很棒', '真不错']
    testEntry = list(set(testEntry))
    thisDoc = array(set_of_words2_vec(vocabSet, 1, testEntry))
    print testEntry, 'classified as: ', classifyNB(thisDoc, pV, pAb, labels)


def test(number):
    '''
    Validate the accuracy of the classifier.
    :param number: fraction of the data to hold out as the test set
    :return:
    '''
    dataSet, labels = load_data_set()
    test_number = int(len(dataSet) * number)
    testSet = []
    for i in range(test_number):
        randIndex = int(random.uniform(0, len(dataSet)))
        testSet.append([dataSet[randIndex], labels[randIndex]])
        del (dataSet[randIndex])
        del (labels[randIndex])
    # train on the remaining data
    vocabSet, labelSet = create_vocab_list(dataSet, labels)
    trainMatrix = []
    for index, row in enumerate(dataSet):
        trainMatrix.append(set_of_words2_vec(vocabSet, labels[index], row))
    pV, pAb = trainNB0(trainMatrix, labels)
    # evaluate on the held-out samples
    errorCount = 0
    for row in testSet:
        testEntry = row[0]
        testEntry = list(set(testEntry))
        thisDoc = array(set_of_words2_vec(vocabSet, 1, testEntry))
        ret = classifyNB(thisDoc, pV, pAb, labels)
        if ret != row[1]:
            print "classification error", row[1], ret
            errorCount += 1
    print 'the error rate is: ', float(errorCount) / len(testSet)


test(0.1)
# testingNB()
Unfortunately the test results are far from ideal. Is Chinese text simply not suited to being segmented this way, or did one of the details above go wrong? Pointers from more experienced readers are welcome; take it as a learning exercise!
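One way to tell whether the problem lies in the data or in the hand-rolled implementation is to cross-check against a library classifier. Here is a minimal sketch (not part of the original case) that runs scikit-learn's MultinomialNB over the same result1.txt, assuming scikit-learn is installed:

# coding=utf-8
# Cross-check with scikit-learn; a rough baseline, not the method used above.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

docs, scores = [], []
for line in open('result1.txt'):
    parts = line.strip().split(',')
    if len(parts) < 2:
        continue  # skip empty or malformed lines
    docs.append(' '.join(parts[1:]))  # space-separated terms, as CountVectorizer expects
    scores.append(parts[0])           # keep the score as a string class label

split = int(len(docs) * 0.9)          # simple 90/10 split, no shuffling, for brevity
vec = CountVectorizer()
X_train = vec.fit_transform(docs[:split])
X_test = vec.transform(docs[split:])

clf = MultinomialNB()
clf.fit(X_train, scores[:split])
print clf.score(X_test, scores[split:])  # fraction of exactly matched scores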
[Case download link](http://download.csdn.net/detail/u010154424/9602826)

Reposted from blog.csdn.net/u010154424/article/details/52203503