理论
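记 $c$ 为类标,$\boldsymbol{x} = (x_1, \dots, x_d)$ 为输入,由贝叶斯公式,$P(c \mid \boldsymbol{x}) = \dfrac{P(c)\,P(\boldsymbol{x} \mid c)}{P(\boldsymbol{x})}$. 朴素贝叶斯分类器假设在给定类别的条件下每个属性相互独立,$P(c \mid \boldsymbol{x}) = \dfrac{P(c)}{P(\boldsymbol{x})} \prod_{i=1}^{d} P(x_i \mid c)$. 对于所有类别来说,$P(\boldsymbol{x})$ 相同,因此朴素贝叶斯分类器对 $\boldsymbol{x}$ 的类标判别 $h_{nb}(\boldsymbol{x}) = \arg\max_{c} P(c) \prod_{i=1}^{d} P(x_i \mid c)$.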
令 $D_c$ 表示训练集 $D$ 中第 $c$ 类样本组成的集合,$N$ 表示类别数,则类先验概率(拉普拉斯平滑) $\hat{P}(c) = \dfrac{|D_c| + 1}{|D| + N}$.
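对于离散属性,令 $D_{c,x_i}$ 表示类标为 $c$、第 $i$ 个属性取值为 $x_i$ 的样本组成的集合,$N_i$ 为第 $i$ 个属性的可能取值数,则类条件概率(拉普拉斯平滑) $\hat{P}(x_i \mid c) = \dfrac{|D_{c,x_i}| + 1}{|D_c| + N_i}$.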
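对于连续属性,假设 $p(x_i \mid c)$ 服从正态分布,即 $p(x_i \mid c) \sim \mathcal{N}(\mu_{c,i}, \sigma_{c,i}^2)$,用概率密度表示 $p(x_i \mid c) = \dfrac{1}{\sqrt{2\pi}\,\sigma_{c,i}} \exp\!\left(-\dfrac{(x_i - \mu_{c,i})^2}{2\sigma_{c,i}^2}\right)$.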
代码
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 27 21:44:30 2018
Naive Bayesian Classifier
From 'Machine Learning, Zhihua Zhou' Ch7
Model: P166 problem 7.3
With Laplacian Smoothing
Dataset: P84 watermelon_3.0 (watermelon_3.0.npz)
@author: weiyx15
"""
import numpy as np
class Naive_Bayesian_Classifier:
    """Naive Bayes classifier for mixed discrete / continuous attributes.

    Discrete attributes use Laplacian-smoothed frequency estimates; each
    continuous attribute is modelled by one Gaussian per class.

    Every row of ``x`` (and every sample passed to ``test``) holds the ``d1``
    discrete attribute codes first, each an integer in ``0 .. xs[j] - 1``,
    followed by the ``d2`` continuous values.

    Attributes set by ``train``:
        py[c]                 smoothed prior P(c)
        pxy[c * d1 + j][v]    smoothed P(x_j = v | c)  (class-major rows)
        mu[c][j], sigma[c][j] Gaussian parameters of continuous attribute j
                              within class c
    """

    def __init__(self, filename='watermelon_3.0.npz'):
        """Load the training set from *filename* (an ``.npz`` archive)."""
        self.load_data(filename)

    def Gausian_density(self, x, mu, sigma):
        """Return the normal density N(mu, sigma^2) evaluated at *x*."""
        return 1/np.sqrt(2*np.pi)/sigma*np.exp(-(x-mu)*(x-mu)/2/sigma/sigma)

    def load_data(self, filename):
        """Read the training set from an ``.npz`` archive.

        The archive is read through the keys ``arr_0`` .. ``arr_5``:
        number of discrete attributes, number of continuous attributes,
        number of classes, value count of each discrete attribute,
        training samples, training labels.
        """
        # The context manager closes the underlying zip file once every
        # array has been pulled into memory.
        with np.load(filename) as archive:
            # Scalars come back as arrays; store them as plain ints so they
            # can be used directly in range() and shape tuples.
            self.d1 = int(np.asarray(archive['arr_0']).item())  # discrete attributes
            self.d2 = int(np.asarray(archive['arr_1']).item())  # continuous attributes
            self.k = int(np.asarray(archive['arr_2']).item())   # number of classes
            self.xs = np.asarray(archive['arr_3'], dtype=int).ravel()  # values per discrete attribute
            self.x = archive['arr_4']   # training data
            self.y = archive['arr_5']   # training labels
        self.m = self.x.shape[0]        # training data size

    def train(self):
        """Estimate ``py``, ``pxy``, ``mu`` and ``sigma`` from the training set.

        Raises:
            ValueError: if a label is outside ``0 .. k - 1`` or a discrete
                attribute code is outside ``0 .. xs[j] - 1``.
        """
        k, d1, d2 = int(self.k), int(self.d1), int(self.d2)
        data = np.asarray(self.x, dtype=float)
        labels = np.asarray(self.y).astype(int).ravel()
        if labels.size and (labels.min() < 0 or labels.max() >= k):
            raise ValueError(
                'labels must lie in 0..{}'.format(k - 1))

        class_count = np.bincount(labels, minlength=k).astype(float)

        # Discrete attributes: P(x_j = v | c) = (|D_{c,v}| + 1) / (|D_c| + N_j).
        # Rows are appended class-major so that row c * d1 + j matches the
        # lookup performed in test().
        self.pxy = []
        for c in range(k):
            members = data[labels == c]
            for j in range(d1):
                n_values = int(self.xs[j])
                counts = np.bincount(members[:, j].astype(int),
                                     minlength=n_values)
                if counts.size > n_values:
                    raise ValueError(
                        'attribute {} has a value outside 0..{}'.format(
                            j, n_values - 1))
                smoothed = (counts + 1.0) / (class_count[c] + n_values)
                self.pxy.append(smoothed.tolist())

        # Class prior: P(c) = (|D_c| + 1) / (|D| + N).
        self.py = (class_count + 1.0) / (self.m + k)

        # Continuous attributes: per-class sample mean and (ddof=1) standard
        # deviation. The statistics must be taken over the samples of class c
        # only; a class with fewer than two samples yields NaN here.
        self.mu = np.zeros((k, d2))
        self.sigma = np.zeros((k, d2))
        if d2 > 0:
            for c in range(k):
                continuous = data[labels == c][:, d1:d1 + d2]
                self.mu[c] = continuous.mean(axis=0)
                self.sigma[c] = continuous.std(axis=0, ddof=1)

    def test(self, xt):
        """Return the index of the class with the highest log-posterior.

        *xt* is one sample laid out like a training row. Returns ``-1`` when
        no class obtains a score greater than ``-inf``.
        """
        d1, d2 = int(self.d1), int(self.d2)
        best_score = -np.inf
        best_class = -1
        for c in range(int(self.k)):
            # Sum logs instead of multiplying probabilities to avoid underflow.
            score = np.log2(self.py[c])
            for j in range(d1):
                score = score + np.log2(self.pxy[d1 * c + j][int(xt[j])])
            for j in range(d2):
                score = score + np.log2(self.Gausian_density(
                    xt[d1 + j], self.mu[c][j], self.sigma[c][j]))
            if score > best_score:
                best_score = score
                best_class = c
        return best_class
if __name__ == '__main__':
    # Sample to classify, taken from 'Machine Learning, Zhihua Zhou' P151.
    sample = [1, 0, 0, 0, 0, 0, 0.697, 0.46]
    classifier = Naive_Bayesian_Classifier()
    classifier.train()
    ans = classifier.test(sample)