朴素贝叶斯实现

1 朴素贝叶斯自编程实现

import numpy as np
import pandas as pd

%config ZMQInteractiveShell.ast_node_interactivity='all'

class NaiveBayes():
    """Naive Bayes classifier for categorical features with additive smoothing.

    Probabilities are estimated by counting. ``lambda_`` is added to every
    count: 0 gives the maximum-likelihood estimate, 1 gives Laplace smoothing.
    """

    def __init__(self, lambda_):
        self.lambda_ = lambda_  # smoothing coefficient added to every count
        self.y_types = None  # sorted distinct class labels, set by fit()
        self.y_types_count = None  # class label -> number of training samples
        self.y_types_proba = None  # class label -> smoothed prior probability
        # (feature position, feature value, class label) -> smoothed
        # conditional probability of that value given the class
        self.x_types_proba = dict()

    def fit(self, X_train, y_train):
        """Estimate class priors and per-feature conditional probabilities.

        Args:
            X_train: training samples, anything ``pandas.DataFrame`` accepts,
                one row per sample and one column per feature.
            y_train: class labels, one per row of ``X_train``; only the first
                column is used if it is two-dimensional.
        """
        self.y_types = np.unique(y_train)  # sorted distinct class labels
        X = pd.DataFrame(X_train)
        labels = pd.DataFrame(y_train)[0]

        self.y_types_count = labels.value_counts()
        self.y_types_proba = (self.y_types_count + self.lambda_) / (
            labels.shape[0] + len(self.y_types) * self.lambda_
        )

        # Start from an empty table so refitting never keeps stale entries.
        self.x_types_proba = dict()
        # Features are keyed by position because predict() enumerates X_new.
        for idx in range(X.shape[1]):
            column = X.iloc[:, idx]
            # The smoothing denominator must use every value the feature can
            # take (over the whole training set), not only the values that
            # happen to occur inside one class; otherwise the conditionals of
            # a class do not sum to 1.
            x_values = column.unique()
            n_values = len(x_values)
            for j in self.y_types:
                in_class = (labels == j).values
                # Values absent from this class get an explicit count of 0 so
                # that predict() finds an entry instead of raising KeyError.
                counts = column[in_class].value_counts().reindex(x_values, fill_value=0)
                denominator = self.y_types_count[j] + n_values * self.lambda_
                for value, count in counts.items():
                    self.x_types_proba[(idx, value, j)] = (count + self.lambda_) / denominator

    def predict(self, X_new):
        """Classify one sample and print the score of every class.

        The printed score is the prior times the product of the per-feature
        conditionals; it is not normalised over the classes.

        Args:
            X_new: one sample as a sequence of feature values, in the same
                column order as the training data.

        Returns:
            The class label with the highest score.

        Raises:
            KeyError: if a feature value was never seen for that feature
                during ``fit``.
        """
        res = []
        for y in self.y_types:
            p_xy = 1
            for idx, x in enumerate(X_new):
                p_xy *= self.x_types_proba[(idx, x, y)]
            res.append(self.y_types_proba[y] * p_xy)
        for label, score in zip(self.y_types, res):
            print("[{}]对应的概率：{:.2%}".format(label, score))
        # Label whose score is the largest.
        return self.y_types[np.argmax(res)]

def main():
    """Fit the hand-written classifier on a 15-sample toy set and classify one sample."""
    # The two feature columns, zipped into [x1, x2] rows below.
    first_feature = [1] * 5 + [2] * 5 + [3] * 5
    second_feature = list("SMMSS" "SMMLL" "LMMLL")
    X_train = np.array([[a, b] for a, b in zip(first_feature, second_feature)])
    y_train = np.array([-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1])

    clf = NaiveBayes(lambda_=0.2)
    clf.fit(X_train, y_train)

    # Sample to classify; predict() also prints the per-class scores.
    X_new = np.array([2, 'S'])
    y_predict = clf.predict(X_new)
    report = "{}被分类为:{}"
    print(report.format(X_new, y_predict))
    
# Run the demo only when this code is executed as a script, not when imported.
if __name__ == '__main__':
    main()

2 朴素贝叶斯的sklearn实现

import numpy as np
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn import preprocessing  # 预处理

def main():
    """Classify one sample with sklearn's MultinomialNB on one-hot encoded features."""
    # The two feature columns, zipped into [x1, x2] rows below.
    first_feature = [1] * 5 + [2] * 5 + [3] * 5
    second_feature = list("SMMSS" "SMMLL" "LMMLL")
    raw_train = np.array([[a, b] for a, b in zip(first_feature, second_feature)])
    y_train = np.array([-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1])

    # One-hot encode both categorical columns into a dense 0/1 matrix.
    enc = preprocessing.OneHotEncoder(categories='auto')
    X_train = enc.fit(raw_train).transform(raw_train).toarray()
    print(X_train)

    # A tiny alpha keeps the estimate close to the unsmoothed one.
    clf = MultinomialNB(alpha=0.0000001)
    clf.fit(X_train, y_train)

    # The new sample goes through the same fitted encoder before prediction.
    X_new = enc.transform(np.array([[2, 'S']])).toarray()
    y_predict = clf.predict(X_new)
    print("------------------------------------")
    report = "{}被分类为:{}"
    print(report.format(X_new, y_predict))
    print(clf.predict_proba(X_new))
    
# Run the sklearn demo only when this code is executed as a script, not when imported.
if __name__ == '__main__':
    main()

参考：

[1] 深度之眼统计学习方法集训营课后练习

[2] 《统计学习方法》李航

猜你喜欢