all_doc = ['Chinese Beijing Chinese', 'Chinese Chinese Shanghai', 'Chinese Macao', 'Tokyo Japan Chinese']
all_target = [1,1,1,0]
class SimpleNB(object):
    '''
    A simple naive Bayes classifier.
    '''
    def __init__(self):
        '''
        The model parameters are defined at initialization time.
        They are the quantities used when classifying a document;
        learning their values from the training data is what "fitting" the model means.
        '''
        self.total_word_num = 0  # total number of words in the corpus
        self.c1_word_num = 0     # number of words in class 1
        self.c2_word_num = 0     # number of words in class 2
        self.c1_dic = {}         # per-word occurrence counts for class 1
        self.c2_dict = {}        # per-word occurrence counts for class 2
        self.smooth_num = 0      # smoothing term: the number of distinct words in the corpus
    def fit(self, all_doc, all_target):
        '''
        Train the model.
        :param all_doc: list of documents (space-separated word strings)
        :param all_target: list of class labels (1 or 0), one per document
        '''
        c1 = []
        c2 = []
        # Split the documents by class into c1 and c2.
        for doc, flag in zip(all_doc, all_target):
            if flag == 1:
                c1.append(doc)
            else:
                c2.append(doc)
        # Count how often each word occurs in class 1.
        for doc in c1:
            s_words = doc.split(' ')
            for w in s_words:
                r_in_dic = self.c1_dic.get(w)
                if r_in_dic:
                    self.c1_dic[w] += 1
                else:
                    self.c1_dic[w] = 1
        # Count how often each word occurs in class 2.
        for doc in c2:
            s_words = doc.split(' ')
            for w in s_words:
                if w in self.c2_dict.keys():
                    self.c2_dict[w] += 1
                else:
                    self.c2_dict[w] = 1
        # Total number of words in each class.
        self.c1_word_num = sum(self.c1_dic.values())
        self.c2_word_num = sum(self.c2_dict.values())
        # Adding the two class totals gives the total word count of the corpus.
        self.total_word_num = self.c1_word_num + self.c2_word_num
        # Smoothing term for the denominator: the number of distinct words in the corpus.
        self.smooth_num = len(set(list(self.c1_dic.keys()) + list(self.c2_dict.keys())))
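    # A small sanity check (an assumption worked out from the toy corpus at the top of this file):
    # after fit(all_doc, all_target) the learned parameters should be
    #   c1_word_num = 8, c2_word_num = 3, total_word_num = 11,
    #   smooth_num = 6 (the vocabulary Chinese, Beijing, Shanghai, Macao, Tokyo, Japan).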
    def predict(self, doc_str):
        # Predict which class a new document belongs to.
        # Split the new document into words.
        all_words = doc_str.split(' ')
        c1_cond_p = 1  # running likelihood of the new document under class c1
        c2_cond_p = 1  # running likelihood of the new document under class c2
        for w in all_words:
            c1_occ_num = self.c1_dic.get(w)
            if c1_occ_num is None:
                c1_occ_num = 0
            c2_occ_num = self.c2_dict.get(w)
            if c2_occ_num is None:
                c2_occ_num = 0
            # Accumulate the add-one-smoothed likelihood under class 1,
            # e.g. P(Chinese|yes)^3 * P(Tokyo|yes) * P(Japan|yes) for 'Chinese Chinese Chinese Tokyo Japan'.
            c1_cond_p *= (c1_occ_num + 1) / (self.c1_word_num + self.smooth_num)
            # Accumulate the add-one-smoothed likelihood under class 2,
            # e.g. P(Chinese|no)^3 * P(Tokyo|no) * P(Japan|no).
            c2_cond_p *= (c2_occ_num + 1) / (self.c2_word_num + self.smooth_num)
        # Multiply each likelihood by the class prior P(yes) or P(no),
        # estimated here as each class's share of the corpus words (8/11 and 3/11 for the toy corpus).
        result_c1 = c1_cond_p * (self.c1_word_num / self.total_word_num)
        result_c2 = c2_cond_p * (self.c2_word_num / self.total_word_num)
        if result_c1 >= result_c2:
            return 1
        else:
            return 0
s_nb = SimpleNB()
s_nb.fit(all_doc,all_target)
print(s_nb.predict('Tokyo'))
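# A minimal worked check, assuming the toy corpus above (the extra test sentence below is the
# one implied by the P(Chinese|yes)^3 * P(Tokyo|yes) * P(Japan|yes) comment, not training data):
#   P(Chinese|c1) = (5+1)/(8+6) = 6/14,  P(Tokyo|c1) = P(Japan|c1) = 1/14
#   P(Chinese|c2) = (1+1)/(3+6) = 2/9,   P(Tokyo|c2) = P(Japan|c2) = 2/9
# so 'Chinese Chinese Chinese Tokyo Japan' scores higher under class 1 and should return 1,
# while 'Tokyo' alone scores higher under class 2, so the print above should output 0.
print(s_nb.predict('Chinese Chinese Chinese Tokyo Japan'))  # expected: 1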