原生版的朴素贝叶斯公式

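朴素贝叶斯分类公式（先验概率乘以各单词条件概率的乘积，取得分最高的类别）：
$$\hat{c} = \arg\max_{c} \; P(c) \prod_{i=1}^{n} P(w_i \mid c)$$
带拉普拉斯（加一）平滑的条件概率，其中 $N_{w,c}$ 为单词 $w$ 在类别 $c$ 中出现的次数，$N_c$ 为类别 $c$ 的单词总数，$|V|$ 为语料库中不重复单词的个数：
$$P(w \mid c) = \frac{N_{w,c} + 1}{N_c + |V|}$$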
# Toy training corpus: space-separated documents and their labels, aligned by
# index. A label of 1 puts the document in class 1; any other value, class 2.
all_doc = [
    'Chinese Beijing Chinese',
    'Chinese Chinese Shanghai',
    'Chinese Macao',
    'Tokyo Japan Chinese',
]
all_target = [1, 1, 1, 0]

class SimpleNB(object):
    """Minimal two-class multinomial naive Bayes text classifier.

    Documents are whitespace-separated strings. A training label of ``1``
    assigns a document to class 1; any other label assigns it to class 2,
    for which ``predict`` returns ``0``.

    The model parameters are the counters below; training the model means
    filling them in from the training data.

    Attributes:
        total_word_num: Token count over all training documents.
        c1_word_num: Token count of the class-1 documents.
        c2_word_num: Token count of the class-2 documents.
        c1_dic: Mapping ``word -> occurrences`` within class 1.
        c2_dict: Mapping ``word -> occurrences`` within class 2.
        smooth_num: Number of distinct words in the training corpus; added
            to the denominator for Laplace (add-one) smoothing.
    """

    def __init__(self):
        """Create an untrained model with all counters at zero."""
        self.total_word_num = 0
        self.c1_word_num = 0
        self.c2_word_num = 0

        self.c1_dic = {}
        self.c2_dict = {}

        self.smooth_num = 0

    def fit(self, all_doc, all_target):
        """Train the model from labelled documents.

        :param all_doc: iterable of whitespace-separated document strings.
        :param all_target: iterable of labels aligned with ``all_doc``;
            ``1`` means class 1, anything else means class 2.
        :return: None. The learned counts are stored on the instance.
        """
        # Start from a clean slate so that calling fit() again retrains the
        # model instead of piling new counts on top of the previous run.
        self.c1_dic = {}
        self.c2_dict = {}

        for doc, flag in zip(all_doc, all_target):
            counts = self.c1_dic if flag == 1 else self.c2_dict
            # split() without an argument drops the empty tokens that
            # split(' ') would produce for repeated or trailing spaces.
            for word in doc.split():
                counts[word] = counts.get(word, 0) + 1

        # Tokens per class, and over the whole corpus.
        self.c1_word_num = sum(self.c1_dic.values())
        self.c2_word_num = sum(self.c2_dict.values())
        self.total_word_num = self.c1_word_num + self.c2_word_num
        # Vocabulary size: distinct words across both classes.
        self.smooth_num = len(set(self.c1_dic) | set(self.c2_dict))

    def predict(self, doc_str):
        """Classify a new document.

        :param doc_str: whitespace-separated document string.
        :return: ``1`` if class 1 scores at least as high as class 2
            (ties go to class 1), otherwise ``0``.
        :raises ValueError: if the model has not been trained on any words.
        """
        if self.total_word_num == 0:
            raise ValueError('SimpleNB.predict() called before fit()')

        words = doc_str.split()
        c1_score = self._log_score(words, self.c1_dic, self.c1_word_num)
        c2_score = self._log_score(words, self.c2_dict, self.c2_word_num)
        return 1 if c1_score >= c2_score else 0

    def _log_score(self, words, word_counts, class_word_num):
        """Return log(P(class) * prod P(word | class)) for one class."""
        # Function-scope import: the file has no module-level import block.
        import math

        # A class with no training tokens has prior 0 and can never win.
        if class_word_num == 0:
            return float('-inf')

        # The prior is the class's share of training tokens, which gives
        # 8/11 and 3/11 on the tutorial corpus.
        # NOTE(review): textbook multinomial NB uses the share of training
        # *documents* instead; token share is kept to preserve the results
        # this class has always produced.
        score = math.log(class_word_num / self.total_word_num)

        # Summing logs instead of multiplying probabilities keeps long
        # documents from underflowing to 0.0 for both classes.
        denominator = class_word_num + self.smooth_num
        for word in words:
            score += math.log((word_counts.get(word, 0) + 1) / denominator)
        return score

# Train on the sample corpus and print the predicted class for a one-word
# document.
s_nb = SimpleNB()
s_nb.fit(all_doc, all_target)
print(s_nb.predict('Tokyo'))

猜你喜欢

转载自blog.csdn.net/weixin_44274975/article/details/88732110
今日推荐