import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')
# Read the data; the CSV file has no header row.
data = pd.read_csv('data.csv', encoding='utf8', header=None)
arr = data.values

# Preprocessing: replace column 1 with integer codes from a LabelEncoder and
# print which code stands for which original value.
encoder = preprocessing.LabelEncoder()
labels = encoder.fit_transform(arr[:, 1])
arr[:, 1] = labels
for code, original_value in enumerate(encoder.classes_):
    print(code, '-->', original_value)

# The last column is the target; every other column is used as a feature.
x = arr[:, 0:-1]
y = arr[:, -1]
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1
)
# Naive Bayes for categorical features. The same estimator is evaluated twice
# below: with maximum-likelihood estimates (c = 0) and with additive
# (Laplace) smoothing (c = 1).


def _fit_naive_bayes(x, y, c=0):
    """Estimate class counts and per-feature conditional probabilities.

    Args:
        x: 2-D NumPy array of training samples, one row per sample and one
            categorical feature per column.
        y: Sequence of class labels aligned with the rows of ``x``.
        c: Additive smoothing constant. ``0`` yields maximum-likelihood
            estimates, ``1`` yields Laplace smoothing.

    Returns:
        A ``(py, pxy)`` pair. ``py[label]`` is the number of training samples
        carrying ``label`` (keys in order of first appearance).
        ``pxy[label][j][value]`` is the estimate of
        P(feature j == value | label) for every value of feature ``j`` that
        occurs anywhere in ``x``.
    """
    py = dict(Counter(y))
    # One pass over the data: how often each (label, feature, value) occurs.
    joint = Counter(
        (label, j, value)
        for row, label in zip(x, y)
        for j, value in enumerate(row)
    )
    pxy = {label: {} for label in py}
    for j in range(x.shape[1]):
        values = np.unique(x[:, j])
        for label, total in py.items():
            # Smoothing adds c pseudo-counts for each distinct feature value.
            denominator = total + len(values) * c
            pxy[label][j] = {
                value: (joint[(label, j, value)] + c) / denominator
                for value in values
            }
    return py, pxy


def _predict(x, py, pxy, c=0):
    """Return the most probable class label for every row of ``x``.

    Args:
        x: Iterable of feature vectors laid out like the training data.
        py: Class counts as returned by ``_fit_naive_bayes``.
        pxy: Conditional probability tables as returned by
            ``_fit_naive_bayes``.
        c: The smoothing constant the model was fitted with.

    Returns:
        A list with one predicted label per row. Ties go to the label that
        appears first in ``py``.
    """
    n = sum(py.values())  # Priors are always normalised by the training size.
    k = len(py)
    predictions = []
    for vector in x:
        scores = {}
        for label, total in py.items():
            p = (total + c) / (n + k * c)
            for j, value in enumerate(vector):
                table = pxy[label][j]
                # A value never seen in training gets the estimate of a zero
                # count (exactly 0 when c == 0) instead of raising KeyError.
                unseen = c / (total + len(table) * c)
                p = p * table.get(value, unseen)
            scores[label] = p
        predictions.append(max(scores, key=scores.get))
    return predictions


def _accuracy(predicted, actual):
    """Return the fraction of ``predicted`` labels that equal ``actual``."""
    # Compare for equality so the score is right for any label encoding,
    # not only for labels in {-1, +1}.
    hits = sum(1 for guess, truth in zip(predicted, actual) if guess == truth)
    return hits / len(actual)


# Naive Bayes (maximum-likelihood estimation, no smoothing).
py, pxy = _fit_naive_bayes(x_train, y_train)
predict = _predict(x_train, py, pxy)
train_acc = _accuracy(predict, y_train)
predict = _predict(x_test, py, pxy)
test_acc = _accuracy(predict, y_test)
print('朴素贝叶斯:')
print('Train acc = ',train_acc,' Test acc = ',test_acc)

# Bayesian estimation (Laplace smoothing with c = 1).
c = 1
py, pxy = _fit_naive_bayes(x_train, y_train, c)
predict = _predict(x_train, py, pxy, c)
train_acc = _accuracy(predict, y_train)
predict = _predict(x_test, py, pxy, c)
test_acc = _accuracy(predict, y_test)
print('贝叶斯估计:')
print('Train acc = ',train_acc,' Test acc = ',test_acc)