Machine Learning in Action, Part 3: The Naive Bayes Algorithm

[A naive Bayes classifier gives a best-guess class and, at the same time, a probability estimate for that guess.]

 

It is called 'naive' because the whole formalization makes only the most primitive, simplest assumptions.

Works with: nominal data

Naive Bayes is part of Bayesian decision theory.

 

Bayesian decision theory:

 

Choose the decision with the highest probability.

Let p1(x, y) denote the probability that data point (x, y) belongs to class 1, and p2(x, y) the probability that it belongs to class 2. Then:

  • If p1(x, y) > p2(x, y), the class is 1
  • If p2(x, y) > p1(x, y), the class is 2

More precisely:

  • If p(c1|x, y) > p(c2|x, y), the class is c1
  • If p(c2|x, y) > p(c1|x, y), the class is c2

Here p(c|x, y) means: given the data point (x, y), the probability that the point comes from class c.
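
By Bayes' rule, p(ci|x, y) = p(x, y|ci) * p(ci) / p(x, y). The denominator p(x, y) is the same for every class, so comparing the posteriors only requires comparing p(x, y|c1)*p(c1) against p(x, y|c2)*p(c2); the classify_nb function below uses exactly this shortcut.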

 

 

Using naive Bayes for document classification

 

If each feature requires N samples, then 10 features require N^10 samples, and a vocabulary of 1000 features requires N^1000 samples: the number of samples needed explodes as the feature count grows.

If the features are mutually independent, each feature's distribution can be estimated on its own, so the sample count drops from N^1000 to 1000*N.

(Independence here means: the probability of a feature/word appearing has nothing to do with which words appear next to it.)
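
(The book's example: the chance of seeing the word 'bacon' is assumed to be unrelated to whether it appears next to 'delicious' or next to 'unhealthy', even though in real text it clearly is.)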

 

Assumptions of naive Bayes:

  • Features are mutually independent
  • Every feature is equally important

 

(Naive Bayes classifiers come in two implementation flavors:

Bernoulli model: ignores how many times a word occurs in a document and records only whether it occurs at all; every word is given equal weight

Multinomial model: also takes the number of occurrences of each word into account)

The code here uses the Bernoulli model.

 

With NumPy's matrix type, a*b is the matrix product, and element-wise multiplication is np.multiply(a, b). (With plain ndarrays, a*b is already element-wise, and a @ b / np.dot(a, b) is the matrix product.)
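
A quick check of the two operations on plain ndarrays (a minimal sketch):

import numpy as np
a = np.array([[1, 2], [3, 4]])
b = np.array([[5, 6], [7, 8]])
print(a @ b)              # matrix product: [[19 22], [43 50]]
print(np.multiply(a, b))  # element-wise, same as a * b here: [[5 12], [21 32]]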

 

 

_____________________________________________________________________

 

Set-of-words model

Treats the presence or absence of each word as a feature. (The comment-filtering and spam code below uses the set-of-words model.)

Bag-of-words model

Each word can occur multiple times, and the counts are kept.

 

————————————————————————————————————


Python implementation: filtering abusive comments on a website with naive Bayes &

a naive-Bayes spam classifier

import numpy as np
from math import log
import feedparser


# ————————————— Filtering abusive website comments —————————————


def testing_nb():
    dataset, labels = load_data_set()
    vocab_list = creat_vecablist(dataset)
    print(vocab_list)
    words_vec = []
    for i in range(len(dataset)):
        words_vec.append(set_of_words2vec(vocab_list, dataset[i]))

    print(words_vec)
    p_vec1, p_vec0, p_abusive = train_nb0(words_vec, labels)
    test_entry = ['love', 'my', 'dalmation']
    test_vec = set_of_words2vec(vocab_list, test_entry)
    print(test_entry, 'classified as:', classify_nb(test_vec, p_vec1, p_vec0, p_abusive))
    test_entry = ['stupid', 'garbage']
    test_vec = set_of_words2vec(vocab_list, test_entry)
    print(test_entry, 'classified as:', classify_nb(test_vec, p_vec1, p_vec0, p_abusive))


def classify_nb(vec2classify, p_vec1, p_vec0, p_abusive):
    # Bayes' rule: p(ci|w) = p(w|ci) * p(ci) / p(w)
    # The denominator p(w) is identical for both classes, so it is ignored.
    # Work in log space: log(p(w|ci) * p(ci)) = log(p(w|ci)) + log(p(ci))
    # Under the independence assumption, log(p(w|ci)) = sum_j w_j * log(p(w_j|ci)),
    # i.e. the dot product of vec2classify with the element-wise log vector;
    # the p_veci arguments passed in are already those element-wise logs.
    p1 = sum(np.multiply(vec2classify, p_vec1)) + log(p_abusive)
    p0 = sum(np.multiply(vec2classify, p_vec0)) + log(1 - p_abusive)
    if p1 > p0:
        return 1
    else:
        return 0


# Core naive Bayes training function
def train_nb0(train_matrix, train_category):
    '''
    :param train_matrix: training-set word vectors
    :param train_category: class label of each sample
    :return: log(p(w|c1)), log(p(w|c0)), p(c1)
    '''
    m = len(train_matrix)
    n = len(train_matrix[0])
    # probability that class == 1
    p_abusive = sum(train_category)/m

    # Initialize counts to 1 and denominators to 2 (Laplace smoothing) so that a
    # single zero probability cannot zero out the product p(w0|1)*p(w1|1)*p(w2|1)*...
    p_vec1 = np.ones(n)     # was: p_vec1 = np.zeros(n)
    p_vec0 = np.ones(n)     # was: p_vec0 = np.zeros(n)
    p_count1 = 2            # was: p_count1 = 0
    p_count0 = 2            # was: p_count0 = 0

    for i in range(m):
        if train_category[i] == 1:
            p_vec1 += train_matrix[i]
            p_count1 += sum(train_matrix[i])
        else:
            p_vec0 += train_matrix[i]
            p_count0 += sum(train_matrix[i])

    # Avoid underflow: a product of many small probabilities rounds to 0, so take
    # natural logs and use ln(a*b) = ln(a) + ln(b).
    # math.log cannot be applied to an array directly, hence the comprehensions.
    p_vec1 = p_vec1/p_count1
    p_vec0 = p_vec0/p_count0
    log_p_vec1 = [log(a) for a in p_vec1]
    log_p_vec0 = [log(a) for a in p_vec0]

    return log_p_vec1, log_p_vec0, p_abusive
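
# Note: np.log does operate element-wise on arrays, so the two comprehensions
# above could equivalently be written as (a sketch, identical behavior):
#   log_p_vec1 = np.log(p_vec1)
#   log_p_vec0 = np.log(p_vec0)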


def load_data_set():
    posting_list = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                    ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                    ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                    ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                    ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                    ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    class_vec = [0, 1, 0, 1, 0, 1]   # 1 = abusive, 0 = normal
    return posting_list, class_vec


def creat_vecablist(dataset):
    vocabset = set([])

    for vec in dataset:
        vocabset = vocabset | set(vec)   # union of the two sets

    return list(vocabset)


def set_of_words2vec(vocablist, inputset):
    # set-of-words model: mark presence/absence only
    ret_vec = [0]*len(vocablist)
    for word in inputset:
        if word in vocablist:
            ret_vec[vocablist.index(word)] = 1

    return ret_vec


def bag_of_words2vec(vocablist, inputset):
    # bag-of-words model: count occurrences
    ret_vec = [0]*len(vocablist)
    for word in inputset:
        if word in vocablist:
            ret_vec[vocablist.index(word)] += 1

    return ret_vec
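
# Quick check of the two models on a toy vocabulary (a sketch):
# vocab = ['stupid', 'dog', 'my']
# set_of_words2vec(vocab, ['my', 'dog', 'my', 'dog'])   # -> [0, 1, 1]
# bag_of_words2vec(vocab, ['my', 'dog', 'my', 'dog'])   # -> [0, 2, 2]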


# testing_nb()

# ———————————— Building the spam classifier ————————————

def text_parser(text):
    import re
    # In Python 3, re.split complains (FutureWarning / ValueError) when the pattern
    # can match the empty string, so \W* is changed to \W+ here.
    text_list = re.split(r'\W+', text)
    # keep tokens longer than two characters, lower-cased
    return [tok.lower() for tok in text_list if len(tok) > 2]
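
# Quick check on a toy sentence (not from the book's data):
# text_parser('This book is the best book on Python I have ever laid eyes upon.')
# -> ['this', 'book', 'the', 'best', 'book', 'python', 'have', 'ever', 'laid', 'eyes', 'upon']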


def spam_test():
    full_text = []
    email_lists = []
    class_label = []

    for i in range(1,26):
        # load spam emails
        email_text = text_parser(open('email/spam/%d.txt' % i).read())
        full_text.extend(email_text)
        email_lists.append(email_text)
        class_label.append(1)

        # load ham (non-spam) emails
        email_text = text_parser(open('email/ham/%d.txt' % i).read())
        full_text.extend(email_text)
        email_lists.append(email_text)
        class_label.append(0)

    # build the vocabulary
    vocab_list = creat_vecablist(email_lists)

    # build the word vector for each document
    train_vec = []
    for email_list in email_lists:
        train_vec.append(set_of_words2vec(vocab_list, email_list))

    # randomly split off 10 documents as the test set
    training_set = list(range(50))
    test_set = []
    for i in range(10):
        rand_int = np.random.randint(0, len(training_set))
        test_set.append(training_set[rand_int])   # fixed: store the document index, not the loop position
        del(training_set[rand_int])

    train_matrix = []
    label_matrix = []
    for i in training_set:
        train_matrix.append(train_vec[i])
        label_matrix.append(class_label[i])

    # train
    p_vec1, p_vec0, p_abusive = train_nb0(train_matrix, label_matrix)

    # test on the held-out documents
    error = 0
    for i in test_set:
        if classify_nb(train_vec[i], p_vec1, p_vec0, p_abusive) != class_label[i]:
            error += 1

    error_rate = float(error)/len(test_set)
    print('error rate: %f' % error_rate)
    return error_rate


# spam_test()
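
# The train/test split is random, so the error rate varies between runs;
# averaging several runs gives a steadier estimate (a sketch):
# print('average error rate:', sum(spam_test() for _ in range(10)) / 10)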


 

——————————————————————————————————

 

RSS (Really Simple Syndication) is an XML-based format.

A publisher defines a feed and publishes content in that format; others can then pull that content through the feed.

Reading a feed is much like reading XML.
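
With feedparser, a parsed feed behaves like a nested dict. A minimal sketch (the URL is a placeholder, and 'summary' is the per-entry field the code below assumes the feed provides):

import feedparser
feed = feedparser.parse('http://example.com/index.rss')
print(len(feed['entries']))           # number of posts in the feed
print(feed['entries'][0]['summary'])  # text of the first post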


Python implementation of classification over RSS feeds:

# ——————————————————— Classification over RSS feeds ———————————————————

def cal_most_freq(vocab_list, dataset):
    import operator
    freq_dict = {}
    for token in vocab_list:
        freq_dict[token] = dataset.count(token)
    sorted_freq = sorted(freq_dict.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_freq[:30]
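
# Why drop the 30 most frequent words (see local_words below)? A large share of
# tokens are function words ('the', 'and', ...) that occur in both classes and
# mostly add noise; removing them is a simple stand-in for a full stop-word list.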


def local_words(feed1, feed0):

    doc_list = []
    class_list = []
    full_text = []
    min_len = min(len(feed1['entries']), len(feed0['entries']))

    for i in range(min_len):
        word_list = text_parser(feed1['entries'][i]['summary'])
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(1)
        word_list = text_parser(feed0['entries'][i]['summary'])   # fixed: was entries[0], must walk feed0's entries too
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(0)

    vocab_list = creat_vecablist(doc_list)
    top30_words = cal_most_freq(vocab_list, full_text)
    for p in top30_words:
        if p[0] in vocab_list:
            vocab_list.remove(p[0])
    training_set = list(range(2*min_len))
    test_set = []
    for i in range(10):
        rand_index = int(np.random.uniform(0, len(training_set)))
        test_set.append(training_set[rand_index])
        del(training_set[rand_index])
    train_mat = []
    train_class = []
    for doc_index in training_set:
        train_mat.append(bag_of_words2vec(vocab_list, doc_list[doc_index]))
        train_class.append(class_list[doc_index])
    # train_nb0 returns the class-1 log vector first
    p1v, p0v, pspam = train_nb0(np.array(train_mat), np.array(train_class))
    error_count = 0
    for doc_index in test_set:
        word_vector = bag_of_words2vec(vocab_list, doc_list[doc_index])
        if classify_nb(np.array(word_vector), p1v, p0v, pspam) != class_list[doc_index]:
            error_count += 1

    print('the error rate:', float(error_count)/len(test_set))
    return vocab_list, p1v, p0v


def get_top_words(ny, sf):
    vocab_list, pny, psf = local_words(ny, sf)
    top_ny = []
    top_sf = []
    for i in range(len(psf)):
        if psf[i] > -6.0:
            top_sf.append((vocab_list[i], psf[i]))
        if pny[i] > -6.0:
            top_ny.append((vocab_list[i], pny[i]))
    sorted_sf = sorted(top_sf, key=lambda pair: pair[1], reverse=True)
    print('*******SFSFSFSF*******')
    for item in sorted_sf:
        print(item[0])
    # print the NY side as well (the original snippet stopped after SF,
    # leaving top_ny unused)
    sorted_ny = sorted(top_ny, key=lambda pair: pair[1], reverse=True)
    print('*******NYNYNYNY*******')
    for item in sorted_ny:
        print(item[0])


# Note: these Craigslist feeds may no longer be served; if so, substitute any
# two RSS feeds whose entries carry a 'summary' field.
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
get_top_words(ny, sf)




Reprinted from blog.csdn.net/ll523587181/article/details/78934546