理论

记 $c$ 为类标， $\mathbf{x}$ 为输入，由贝叶斯公式， $P(c|\mathbf{x}) = \frac{P(c)P(\mathbf{x}|c)}{P(\mathbf{x})}$ . 朴素贝叶斯分类器采用“属性条件独立性假设”，即在给定类别 $c$ 的条件下各属性相互独立，于是 $P(\mathbf{x}|c) = \prod\limits_{i=1}^dP(x_i|c)$ ，其中 $d$ 为属性数目. 对于所有类别来说， $P(\mathbf{x})$ 相同，因此朴素贝叶斯分类器对 $\mathbf{x}$ 的类标判别为 $c_\mathbf{x}=\arg \max\limits_{c\in\mathcal{Y}}P(c)\prod\limits_{i=1}^dP(x_i|c)$ .

令 $D_c$ 表示训练集 $D$ 中第 $c$ 类样本组成的集合， $N$ 表示类别数，则类先验概率（拉普拉斯平滑）

$P(c) = \frac{|D_c|+1}{|D|+N}$

对于离散属性，令 $D_{c,x_i}$ 表示 $D_c$ 中第 $i$ 个属性取值为 $x_i$ 的样本组成的集合， $N_i$ 为第 $i$ 个属性的可能取值数，则类条件概率（拉普拉斯平滑）

$P(x_i|c) = \frac {|D_{c,x_i}|+1}{|D_c|+N_i}$

对于连续属性，假设服从正态分布，即 $p(x_i|c)\sim\mathcal{N}(\mu_{c,i}, \sigma_{c,i}^2)$ ，用概率密度表示

$p(x_i|c)=\frac{1}{\sqrt{2\pi}\sigma_{c,i}}\exp(-\frac{(x_i-\mu_{c,i})^2}{2{\sigma_{c,i}^2}})$

代码

# -*- coding: utf-8 -*-
"""
Created on Mon Aug 27 21:44:30 2018

Naive Bayesian Classifier
From 'Machine Learning, Zhihua Zhou' Ch7
Model: P166 problem 7.3
        With Laplacian Smoothing
Dataset: P84 watermelon_3.0 (watermelon_3.0.npz)

@author: weiyx15
"""

import numpy as np

class Naive_Bayesian_Classifier:
    """Naive Bayes classifier with Laplacian smoothing.

    Each sample consists of ``d1`` discrete attributes (integer coded,
    starting at 0) followed by ``d2`` continuous attributes, which are
    modelled with one normal distribution per (class, attribute) pair.

    Attributes set by ``load_data``:
        d1: number of discrete attributes.
        d2: number of continuous attributes.
        k:  number of classes (labels are expected to be ``0 .. k-1``).
        xs: number of possible values of every discrete attribute.
        x:  training samples, one row per sample.
        y:  training labels.
        m:  number of training samples.

    Attributes set by ``train``:
        py:    smoothed class priors, shape ``(k,)``.
        pxy:   flat list of smoothed conditional probability tables; the
               table of discrete attribute ``j`` under class ``c`` lives at
               index ``j * k + c``.
        mu:    per-class mean of every continuous attribute, ``(k, d2)``.
        sigma: per-class sample standard deviation, ``(k, d2)``.
    """

    def __init__(self, filename='watermelon_3.0.npz'):
        """Load the training set stored in the ``.npz`` archive *filename*."""
        self.load_data(filename)

    def Gausian_density(self, x, mu, sigma):
        """Return the density of the normal distribution N(mu, sigma^2) at x."""
        z = (x - mu) / sigma
        return np.exp(-0.5 * z * z) / (np.sqrt(2 * np.pi) * sigma)

    def _log_gaussian_density(self, x, mu, sigma):
        """Return the natural log of the normal density N(mu, sigma^2) at x.

        Evaluated analytically so that inputs far away from the mean yield a
        large negative number instead of log(0) = -inf after underflow.
        """
        z = (x - mu) / sigma
        return -0.5 * z * z - np.log(sigma) - 0.5 * np.log(2 * np.pi)

    def _pxy_index(self, c, j):
        """Return the position in ``self.pxy`` of attribute j under class c."""
        return j * int(self.k) + c

    def load_data(self, filename):
        """Read the training set from the ``.npz`` archive *filename*.

        The archive is read through the positional keys ``arr_0`` ..
        ``arr_5`` (see the class docstring for their meaning).
        """
        # The context manager closes the underlying file handle once every
        # array has been read.
        with np.load(filename) as dic:
            self.d1 = int(dic['arr_0'])  # number of discrete property
            self.d2 = int(dic['arr_1'])  # number of continuous property
            self.k = int(dic['arr_2'])   # number of categories
            self.xs = dic['arr_3']       # discrete property range
            self.x = dic['arr_4']        # training data
            self.y = dic['arr_5']        # training label
        self.m = self.x.shape[0]         # training data size

    def train(self):
        """Estimate priors, conditional tables and Gaussian parameters.

        Layout of ``self.pxy`` (one list per row, property-major):

            p(x0=0|c0), p(x0=1|c0), ..., p(x0=x0m|c0)
            p(x0=0|c1), p(x0=1|c1), ..., p(x0=x0m|c1)
            ...
            p(x0=0|ck), p(x0=1|ck), ..., p(x0=x0m|ck)

            p(x1=0|c0), p(x1=1|c0), ..., p(x1=x1m|c0)
            ...

        With Laplacian smoothing:
            P(c)       = (|D_c| + 1) / (|D| + k)
            P(x_j | c) = (|D_{c,x_j}| + 1) / (|D_c| + N_j)

        A class with fewer than two training samples has no defined sample
        standard deviation; NumPy then stores ``nan`` in ``self.sigma``.
        """
        d1, d2, k, m = int(self.d1), int(self.d2), int(self.k), int(self.m)
        x = np.asarray(self.x)
        labels = np.asarray(self.y).astype(int).reshape(-1)
        class_masks = [labels == c for c in range(k)]
        class_counts = np.array([mask.sum() for mask in class_masks],
                                dtype=float)

        # class priors
        self.py = (class_counts + 1) / (m + k)

        # deal with discrete property: start every cell at 1 (smoothing),
        # count occurrences, then normalise each table.
        table = [[1] * int(self.xs[j]) for j in range(d1) for _ in range(k)]
        for i in range(m):
            c = labels[i]
            for j in range(d1):
                table[self._pxy_index(c, j)][int(x[i, j])] += 1
        for j in range(d1):
            for c in range(k):
                idx = self._pxy_index(c, j)
                denom = class_counts[c] + int(self.xs[j])
                table[idx] = [cell / denom for cell in table[idx]]
        self.pxy = table

        # deal with continuous property: fit one Gaussian per class and
        # attribute using only the samples that belong to that class.
        self.mu = np.zeros((k, d2))
        self.sigma = np.zeros((k, d2))
        for c in range(k):
            class_rows = x[class_masks[c]]
            for j in range(d2):
                sample = class_rows[:, d1 + j].astype(float)
                self.mu[c][j] = np.mean(sample)
                self.sigma[c][j] = np.std(sample, ddof=1)

    def test(self, xt):
        """Return the most probable class index for the sample *xt*.

        *xt* holds the ``d1`` discrete attribute codes followed by the
        ``d2`` continuous attribute values. Scores are accumulated as log
        probabilities; ties keep the lowest class index. ``-1`` is returned
        only if no class obtains a score greater than ``-inf`` (e.g. the
        scores are ``nan``).
        """
        d1, d2, k = int(self.d1), int(self.d2), int(self.k)
        best_score = -np.inf
        best_class = -1
        for c in range(k):
            score = np.log(self.py[c])
            for j in range(d1):
                score += np.log(self.pxy[self._pxy_index(c, j)][int(xt[j])])
            for j in range(d2):
                score += self._log_gaussian_density(
                    xt[d1 + j], self.mu[c][j], self.sigma[c][j])
            if score > best_score:
                best_score = score
                best_class = c
        return best_class
    
if __name__ == '__main__':
    # Script entry point: fit the classifier on the bundled data set and
    # classify one sample.
    # test data: 'Machine Learning, Zhihua Zhou' P151
    sample = [1, 0, 0, 0, 0, 0, 0.697, 0.46]
    nbc = Naive_Bayesian_Classifier()
    nbc.train()
    ans = nbc.test(sample)

周志华《机器学习》Ch7. 贝叶斯分类器：朴素贝叶斯分类器的python实现

理论

代码

猜你喜欢