朴素贝叶斯+拉普拉斯平滑代码实现-方法二

首先导入包:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import scorer
import numpy as np

数据的读取:



# Watermelon dataset (Zhou Zhihua, "Machine Learning"): six categorical
# features plus the label column 好瓜 ("good melon", values 是/否).
_columns = ["色泽", "根蒂", "敲声", "纹理", "脐部", "触感", "好瓜"]
_rows = [
    ["青绿", "蜷缩", "浊响", "清晰", "凹陷", "硬滑", "是"],
    ["乌黑", "蜷缩", "沉闷", "清晰", "凹陷", "硬滑", "是"],
    ["乌黑", "蜷缩", "浊响", "清晰", "凹陷", "硬滑", "是"],
    ["青绿", "蜷缩", "沉闷", "清晰", "凹陷", "硬滑", "是"],
    ["浅白", "蜷缩", "浊响", "清晰", "凹陷", "硬滑", "是"],
    ["青绿", "稍蜷", "浊响", "清晰", "稍凹", "软粘", "是"],
    ["乌黑", "稍蜷", "浊响", "稍糊", "稍凹", "软粘", "是"],
    ["乌黑", "稍蜷", "浊响", "清晰", "稍凹", "硬滑", "是"],
    ["乌黑", "稍蜷", "沉闷", "稍糊", "稍凹", "硬滑", "否"],
    ["青绿", "硬挺", "清脆", "清晰", "平坦", "软粘", "否"],
    ["浅白", "硬挺", "清脆", "模糊", "平坦", "硬滑", "否"],
    ["浅白", "蜷缩", "浊响", "模糊", "平坦", "软粘", "否"],
    ["青绿", "稍蜷", "浊响", "稍糊", "凹陷", "硬滑", "否"],
    ["浅白", "稍蜷", "沉闷", "稍糊", "凹陷", "硬滑", "否"],
    ["乌黑", "稍蜷", "浊响", "清晰", "稍凹", "软粘", "否"],
    ["浅白", "蜷缩", "浊响", "模糊", "平坦", "硬滑", "否"],
    ["青绿", "蜷缩", "沉闷", "稍糊", "稍凹", "硬滑", "否"],
]
datasets = pd.DataFrame(_rows, columns=_columns)


计算出好瓜的概率:

def fit_fun(datasets):
    """Estimate the naive-Bayes model for the positive class (好瓜 == "是").

    For every value v of every feature column, computes the Laplace-smoothed
    conditional probability
        P(v | 是) = (count(v and 是) + 1) / (count(是) + n_distinct_values),
    and the smoothed class prior P(是) = (count(是) + 1) / (N + 2).

    NOTE(review): feature values are used directly as dict keys, which
    assumes no value string appears in two different columns (true for this
    dataset) — confirm before reusing on other data.

    Args:
        datasets: DataFrame whose last column is the label 好瓜 ("是"/"否").

    Returns:
        (good_el_dic, P_dic, P_good): dict of P(v|是) rounded to 3 decimals,
        an empty dict (unused, kept for interface compatibility), and the
        prior P(是) rounded to 2 decimals.
    """
    good_el_dic = {}   # P(value | 好瓜=是)
    P_dic = {}         # never populated; returned to keep the interface
    n_good = len(datasets[datasets["好瓜"] == "是"])
    for col in datasets.columns.to_list()[0:-1]:
        # Per-value label counts and distinct-value count, hoisted out of
        # the inner loop (the original recomputed them per feature value).
        counts = datasets.groupby(col)["好瓜"].value_counts()
        n_values = len(datasets[col].unique())
        for val in datasets[col].unique():
            if "是" in counts[val]:
                good_el_dic[val] = round((counts[val]["是"] + 1) / (n_good + n_values), 3)
            else:
                # Value never co-occurs with 是: Laplace smoothing keeps
                # the probability strictly positive.
                good_el_dic[val] = round(1 / (n_good + n_values), 3)
    P_good = round((n_good + 1) / (len(datasets) + 2), 2)
    print("P(*|好瓜):", good_el_dic)
    print("\nP:", P_good)
    return good_el_dic, P_dic, P_good
# Fit the positive-class model on the full dataset; P is the smoothed prior P(好瓜=是).
good_el_dic, P_dic, P = fit_fun(datasets)

输出:

P(*|好瓜): {'青绿': 0.364, '乌黑': 0.455, '浅白': 0.182, '蜷缩': 0.545, '稍蜷': 0.364, '硬挺': 0.091, '浊响': 0.636, '沉闷': 0.273, '清脆': 0.091, '清晰': 0.727, '稍糊': 0.182, '模糊': 0.091, '凹陷': 0.545, '稍凹': 0.364, '平坦': 0.091, '硬滑': 0.7, '软粘': 0.3}

P: 0.47

计算坏瓜的概率:

def fit_fun(datasets):
    """Estimate the naive-Bayes model for the negative class (好瓜 == "否").

    Mirrors the positive-class version: Laplace-smoothed P(v | 否) for every
    feature value v, plus the smoothed prior P(否) = (count(否) + 1) / (N + 2).

    NOTE(review): this redefines fit_fun and shadows the good-melon version
    defined earlier in the file; the name is kept for script compatibility.

    Args:
        datasets: DataFrame whose last column is the label 好瓜 ("是"/"否").

    Returns:
        (bad_el_dic, P_bad): dict of P(v|否) rounded to 3 decimals and the
        prior P(否) rounded to 3 decimals.
    """
    bad_el_dic = {}   # P(value | 好瓜=否)
    n_bad = len(datasets[datasets["好瓜"] == "否"])
    for col in datasets.columns.to_list()[0:-1]:
        # Hoisted out of the inner loop, as in the positive-class function.
        counts = datasets.groupby(col)["好瓜"].value_counts()
        n_values = len(datasets[col].unique())
        for val in datasets[col].unique():
            if "否" in counts[val]:
                bad_el_dic[val] = round((counts[val]["否"] + 1) / (n_bad + n_values), 3)
            else:
                # BUG FIX: the original wrote `P_bad[i] = ...` here, but
                # P_bad did not exist yet (NameError if this branch ran, and
                # later P_bad is a float). The smoothed probability belongs
                # in bad_el_dic like every other entry.
                bad_el_dic[val] = round(1 / (n_bad + n_values), 3)
    P_bad = round((n_bad + 1) / (len(datasets) + 2), 3)
    print("P(*|坏瓜):", bad_el_dic)
    print("\nP_bad:", P_bad)
    return bad_el_dic, P_bad
# Fit the negative-class model; P_ is the complementary prior 1 - P(好瓜=是).
bad_el_dic, P_bad = fit_fun(datasets)
P_ = 1-P

输出:

P(*|坏瓜): {'青绿': 0.333, '乌黑': 0.25, '浅白': 0.417, '蜷缩': 0.333, '稍蜷': 0.417, '硬挺': 0.25, '浊响': 0.417, '沉闷': 0.333, '清脆': 0.25, '清晰': 0.25, '稍糊': 0.417, '模糊': 0.333, '凹陷': 0.25, '稍凹': 0.333, '平坦': 0.417, '硬滑': 0.636, '软粘': 0.364}

P_bad: 0.526

单个数据的预测:

好瓜概率预测:

# Naive-Bayes score for one sample: class prior times the product of the
# per-feature likelihoods P(value | 是).
test = ["乌黑", "稍蜷", "浊响", "清晰", "稍凹", "软粘"]
P_good = P
for feature in test:
    P_good *= good_el_dic[feature]
print("预测为好瓜的概率:", P_good)

坏瓜概率预测:

# Same scoring against the negative-class model, starting from the prior 1 - P(是).
P_bad = P_
for feature in test:
    P_bad *= bad_el_dic[feature]
print("预测为坏瓜的概率:", P_bad)  # ~0.0006981899836275 on the full dataset

整体预测:

# Resubstitution check: classify every training row and compare with the labels.
y_pre = []
for row_idx in range(0, len(datasets)):
    # All feature columns (the trailing 好瓜 label column is dropped);
    # `:-1` replaces the original's obscure `:0-1` slice.
    sample = datasets.iloc[row_idx, :-1].to_list()
    P_g = P    # running score for class 是
    P_b = P_   # running score for class 否
    # FIX: the original reused loop variable `i` for both the row index and
    # the feature value, and walked the sample twice; one pass updates both.
    for value in sample:
        P_g = P_g * good_el_dic[value]
        P_b = P_b * bad_el_dic[value]
    y_pre.append("是" if P_g > P_b else "否")
y_test = datasets["好瓜"]
# Bare expression: in a notebook this displays the per-row correctness mask.
y_test == pd.Series(y_pre)

输出:

0      True
1      True
2      True
3      True
4      True
5      True
6     False
7      True
8      True
9      True
10     True
11     True
12    False
13     True
14    False
15     True
16     True
dtype: bool

如有大佬看出错误,请指正。

Guess you like

Origin blog.csdn.net/weixin_51756104/article/details/121246614