[The classifier outputs the most probable class for an instance, together with an estimate of the probability of that guess.]
It is called 'naive' because the whole formalization makes only the most primitive, simplest assumptions.
Works with: nominal data.
Naive Bayes is part of Bayesian decision theory.
Bayesian decision theory:
choose the decision with the higher probability.
Let p1(x, y) denote the probability that data point (x, y) belongs to class 1, and p2(x, y) the probability that it belongs to class 2:
- if p1(x, y) > p2(x, y), the class is 1
- if p2(x, y) > p1(x, y), the class is 2
More precisely:
- if p(c1|x, y) > p(c2|x, y), the class is c1
- if p(c2|x, y) > p(c1|x, y), the class is c2
where p(c|x, y) means: given the data point (x, y), the probability that the point came from class c.
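A minimal sketch of this rule, with made-up numbers for the priors and class-conditional likelihoods: by Bayes' rule p(ci|x, y) = p(x, y|ci)*p(ci)/p(x, y), and since the denominator is the same for every class it can be ignored when comparing.

prior = {1: 0.5, 2: 0.5}          # hypothetical p(c1), p(c2)
likelihood = {1: 0.02, 2: 0.005}  # hypothetical p(x, y | ci) for one point

def decide(prior, likelihood):
    # pick the class maximizing p(x, y|c) * p(c); p(x, y) cancels out
    return max(prior, key=lambda c: likelihood[c] * prior[c])

print(decide(prior, likelihood))  # -> 1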
Document classification with naive Bayes
If each feature requires N samples, then 10 features require N^10 samples, and a vocabulary of 1000 features requires N^1000 samples. The number of samples needed grows rapidly with the number of features.
If the features are mutually independent, the number of samples drops from N^1000 to 1000*N.
(Independence: the probability that a feature/word occurs is unrelated to which other words it appears next to.)
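This is exactly what independence buys: the joint probability factorizes into per-word terms, so only one probability per word has to be estimated. A toy sketch with made-up numbers:

# Under independence: p(w1, ..., wn | c) = p(w1|c) * ... * p(wn|c)
p_word_given_c = {'stupid': 0.15, 'dog': 0.10, 'worthless': 0.05}  # hypothetical
doc = ['stupid', 'dog']
likelihood = 1.0
for w in doc:
    likelihood *= p_word_given_c[w]
print(likelihood)  # 0.15 * 0.10 -> 0.015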
Assumptions made by naive Bayes:
- features are mutually independent
- every feature is equally important
(A naive Bayes classifier has two common implementations:
Bernoulli model: ignores how many times a word occurs in a document, only whether it occurs at all, and treats words as equally weighted.
Multinomial model: takes the number of occurrences of each word into account.)
The Bernoulli model is used here.
NumPy products: for plain ndarrays, a * b and np.multiply(a, b) are both element-wise; the true matrix product is a @ b or np.dot(a, b). (Only for the np.matrix type does * mean the matrix product.)
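A quick check of the difference, since the classifier below relies on the element-wise product:

import numpy as np

a = np.array([[1, 2], [3, 4]])
b = np.array([[5, 6], [7, 8]])
print(a * b)              # element-wise: [[ 5 12] [21 32]]
print(np.multiply(a, b))  # same element-wise product
print(a @ b)              # matrix product: [[19 22] [43 50]]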
_____________________________________________________________________
Set-of-words model
Treats the presence or absence of each word as a feature. (The comment-filtering code in this section uses the set-of-words model.)
Bag-of-words model
Each word can occur multiple times, and the count is recorded.
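A small illustration of the difference with a made-up three-word vocabulary:

vocab = ['dog', 'stupid', 'my']
doc = ['stupid', 'stupid', 'my']
set_vec = [1 if w in doc else 0 for w in vocab]  # set-of-words: [0, 1, 1]
bag_vec = [doc.count(w) for w in vocab]          # bag-of-words: [0, 2, 1]
print(set_vec, bag_vec)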
————————————————————————————————————
Python implementation: naive-Bayes filtering of abusive website comments &
a naive-Bayes spam classifier
import numpy as np
from math import log
import feedparser
# ————————— Filtering abusive website comments —————————
def testing_nb():
    dataset, labels = load_data_set()
    vocab_list = creat_vecablist(dataset)
    print(vocab_list)
    words_vec = []
    for i in range(len(dataset)):
        words_vec.append(set_of_words2vec(vocab_list, dataset[i]))
    print(words_vec)
    p_vec1, p_vec0, p_abusive = train_nb0(words_vec, labels)
    test_entry = ['love', 'my', 'dalmation']
    test_vec = set_of_words2vec(vocab_list, test_entry)
    print(test_entry, 'classified as:', classify_nb(test_vec, p_vec1, p_vec0, p_abusive))
    test_entry = ['stupid', 'garbage']
    test_vec = set_of_words2vec(vocab_list, test_entry)
    print(test_entry, 'classified as:', classify_nb(test_vec, p_vec1, p_vec0, p_abusive))
def classify_nb(vec2classify, p_vec1, p_vec0, p_abusive):
    # By Bayes' rule: p(ci|w) = p(w|ci)*p(ci)/p(w)
    # The denominator p(w) is the same for both classes, so it is ignored.
    # Take logs to avoid underflow: log(p(w|ci)*p(ci)) = log(p(w|ci)) + log(p(ci))
    # p(w|ci) is the probability of the input vector w given class ci; under
    # the independence assumption log(p(w|ci)) = sum_j w_j * log(p(w_j|ci)),
    # i.e. the dot product of vec2classify with the element-wise logs.
    # p_vec1 and p_vec0 already hold those element-wise logs (see train_nb0).
    p1 = sum(np.multiply(vec2classify, p_vec1)) + log(p_abusive)
    p0 = sum(np.multiply(vec2classify, p_vec0)) + log(1 - p_abusive)
    if p1 > p0:
        return 1
    else:
        return 0
# Core naive Bayes training function
def train_nb0(train_matrix, train_category):
    '''
    :param train_matrix: training sample vectors
    :param train_category: class label of each sample
    :return: log(p(w|c1)), log(p(w|c0)), p(c1)
    '''
    m = len(train_matrix)
    n = len(train_matrix[0])
    # probability of class = 1
    p_abusive = sum(train_category) / m
    # Initialize counts to 1 and denominators to 2 (Laplace smoothing) so that
    # a single zero count cannot make the product p(w0|1)*p(w1|1)*... equal 0.
    p_vec1 = np.ones(n)  # instead of np.zeros(n)
    p_vec0 = np.ones(n)  # instead of np.zeros(n)
    p_count1 = 2  # instead of 0
    p_count0 = 2  # instead of 0
    for i in range(m):
        if train_category[i] == 1:
            p_vec1 += train_matrix[i]
            p_count1 += sum(train_matrix[i])
        else:
            p_vec0 += train_matrix[i]
            p_count0 += sum(train_matrix[i])
    # To avoid underflow (products of many small probabilities round to 0),
    # take natural logs: ln(a*b) = ln(a) + ln(b).
    # math.log cannot be applied to an array directly, hence the comprehensions
    # (np.log(p_vec1 / p_count1) would be the vectorized alternative).
    p_vec1 = p_vec1 / p_count1
    p_vec0 = p_vec0 / p_count0
    log_p_vec1 = [log(a) for a in p_vec1]
    log_p_vec0 = [log(a) for a in p_vec0]
    return log_p_vec1, log_p_vec0, p_abusive
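A quick illustration of why that initialization (Laplace smoothing) matters, with made-up counts:

import numpy as np

counts = np.array([0, 3, 1])  # hypothetical word counts within one class
total = counts.sum()
print(counts / total)              # unsmoothed: [0.   0.75 0.25] -- the 0 would
                                   # zero out the product (or give -inf in log space)
print((counts + 1) / (total + 2))  # smoothed as in train_nb0: no zeros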
def load_data_set():
    posting_list = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                    ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                    ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                    ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                    ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                    ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    class_vec = [0, 1, 0, 1, 0, 1]  # 1 = abusive, 0 = normal
    return posting_list, class_vec

def creat_vecablist(dataset):
    vocabset = set()
    for vec in dataset:
        vocabset = vocabset | set(vec)  # union of the two sets
    return list(vocabset)

def set_of_words2vec(vocablist, inputset):
    # set-of-words model: mark 1 if the word occurs at all
    ret_vec = [0] * len(vocablist)
    for word in inputset:
        if word in vocablist:
            ret_vec[vocablist.index(word)] = 1
    return ret_vec

def bag_of_words2vec(vocablist, inputset):
    # bag-of-words model: count how many times each word occurs
    ret_vec = [0] * len(vocablist)
    for word in inputset:
        if word in vocablist:
            ret_vec[vocablist.index(word)] += 1
    return ret_vec
# testing_nb()
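Uncommenting testing_nb() should print the vocabulary and training vectors, then classify ['love', 'my', 'dalmation'] as 0 (normal) and ['stupid', 'garbage'] as 1 (abusive), the expected result for this example data.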
# ———————————— Building the spam classifier ————————————
def text_parser(text):
    import re
    # In Python 3, re.split complains (FutureWarning, then ValueError) if the
    # pattern can match the empty string, so \W* is changed to \W+ here.
    text_list = re.split(r'\W+', text)
    return [tok.lower() for tok in text_list if len(tok) > 2]
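For example (the sample sentence is made up):

print(text_parser('Hi there, this is a TEST sentence!!'))
# -> ['there', 'this', 'test', 'sentence']  (tokens shorter than 3 chars are dropped)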
def spam_test():
    full_text = []
    email_lists = []
    class_label = []
    for i in range(1, 26):
        # load the spam emails
        email_text = text_parser(open('email/spam/%d.txt' % i).read())
        full_text.extend(email_text)
        email_lists.append(email_text)
        class_label.append(1)
        # load the ham (non-spam) emails
        email_text = text_parser(open('email/ham/%d.txt' % i).read())
        full_text.extend(email_text)
        email_lists.append(email_text)
        class_label.append(0)
    # build the vocabulary
    vocab_list = creat_vecablist(email_lists)
    # build the training matrix
    train_vec = []
    for email_list in email_lists:
        train_vec.append(set_of_words2vec(vocab_list, email_list))
    # randomly split into training and test sets (hold-out cross-validation)
    training_set = list(range(50))
    test_set = []
    for i in range(10):
        rand_int = np.random.randint(0, len(training_set))
        test_set.append(training_set[rand_int])  # store the document id, not the index
        del(training_set[rand_int])
    train_matrix = []
    label_matrix = []
    for i in training_set:
        train_matrix.append(train_vec[i])
        label_matrix.append(class_label[i])
    # train
    p_vec1, p_vec0, p_abusive = train_nb0(train_matrix, label_matrix)
    # test
    error = 0
    for i in test_set:
        if classify_nb(train_vec[i], p_vec1, p_vec0, p_abusive) != class_label[i]:
            error += 1
    error_rate = float(error) / len(test_set)
    print('error rate: %f' % error_rate)
    return error_rate
# spam_test()
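Because the train/test split is random, the error rate varies between runs; averaging over several runs gives a more stable estimate. A minimal sketch (average_error is a hypothetical helper, not from the original):

def average_error(runs=10):
    # repeat the random hold-out split and average the error rates
    return np.mean([spam_test() for _ in range(runs)])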
——————————————————————————————————
RSS (Really Simple Syndication): an XML-based format.
A publisher defines a feed and publishes content in that format; others can then fetch the content through the feed.
Reading a feed is similar to reading XML.
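With the feedparser library, each entry exposes its text through the 'summary' field, which is exactly how the code below indexes the feeds. A minimal access sketch (the URL is a placeholder):

import feedparser

feed = feedparser.parse('http://example.com/index.rss')  # placeholder URL
print(len(feed['entries']))               # number of posts in the feed
if feed['entries']:
    print(feed['entries'][0]['summary'])  # text of the first post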
Python implementation of classification based on RSS data sources:
# ——————————————— Classification based on RSS feeds ———————————————
def cal_most_freq(vocab_list, dataset):
    # return the 30 most frequent words in dataset as (word, count) pairs
    import operator
    freq_dict = {}
    for token in vocab_list:
        freq_dict[token] = dataset.count(token)
    sorted_freq = sorted(freq_dict.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_freq[:30]
def local_words(feed1, feed0):
    doc_list = []
    class_list = []
    full_text = []
    min_len = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(min_len):
        word_list = text_parser(feed1['entries'][i]['summary'])
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(1)
        word_list = text_parser(feed0['entries'][i]['summary'])  # [i], not [0]
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(0)
    vocab_list = creat_vecablist(doc_list)
    # remove the 30 most frequent words: they are mostly stop-word-like
    # fillers that carry little class information
    top30_words = cal_most_freq(vocab_list, full_text)
    for p in top30_words:
        if p[0] in vocab_list:
            vocab_list.remove(p[0])
    training_set = list(range(2 * min_len))
    test_set = []
    for i in range(10):
        rand_index = int(np.random.uniform(0, len(training_set)))
        test_set.append(training_set[rand_index])
        del(training_set[rand_index])
    train_mat = []
    train_class = []
    for doc_index in training_set:
        train_mat.append(bag_of_words2vec(vocab_list, doc_list[doc_index]))
        train_class.append(class_list[doc_index])
    # train_nb0 returns the class-1 vector first; name the results accordingly
    p1v, p0v, p_class1 = train_nb0(np.array(train_mat), np.array(train_class))
    error_count = 0
    for doc_index in test_set:
        word_vector = bag_of_words2vec(vocab_list, doc_list[doc_index])
        if classify_nb(np.array(word_vector), p1v, p0v, p_class1) != class_list[doc_index]:
            error_count += 1
    print('the error rate:', float(error_count) / len(test_set))
    return vocab_list, p1v, p0v
def get_top_words(ny, sf):
    vocab_list, pny, psf = local_words(ny, sf)
    top_ny = []
    top_sf = []
    # keep words whose log-probability exceeds the threshold
    for i in range(len(psf)):
        if psf[i] > -6.0:
            top_sf.append((vocab_list[i], psf[i]))
        if pny[i] > -6.0:
            top_ny.append((vocab_list[i], pny[i]))
    sorted_sf = sorted(top_sf, key=lambda pair: pair[1], reverse=True)
    print('*******SFSFSFSF*******')
    for item in sorted_sf:
        print(item[0])
    sorted_ny = sorted(top_ny, key=lambda pair: pair[1], reverse=True)
    print('*******NYNYNYNY*******')
    for item in sorted_ny:
        print(item[0])

# Note: these Craigslist feeds may no longer be served; any two RSS feeds can be substituted.
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
get_top_words(ny, sf)