# Python(朴素贝叶斯/贝叶斯估计)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import warnings
# Silence every warning category for the remainder of the run.
warnings.filterwarnings('ignore')

# Load the raw table. header=None means the first row is data, so columns
# are addressed by position only.
data = pd.read_csv('data.csv', header=None, encoding='utf8')
arr = data.values

# Replace column 1 with integer codes and print each code next to the
# original value it stands for.
encoder = preprocessing.LabelEncoder()
labels = encoder.fit_transform(arr[:, 1])
for code, category in enumerate(encoder.classes_):
    print(code, '-->', category)
arr[:, 1] = labels

# The last column is the target; every column before it is a feature.
x = arr[:, :-1]
y = arr[:, -1]

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1
)

# Naive Bayes classifier for categorical features.
#
# smoothing = 0 is the maximum-likelihood estimate (reported as "朴素贝叶斯");
# smoothing = 1 is Laplace-smoothed Bayesian estimation (reported as "贝叶斯估计").


def fit_naive_bayes(x, y, smoothing=0):
    """Estimate class priors and per-feature conditional probabilities.

    Args:
        x: 2-D array of categorical feature values, one row per sample.
            Every value must be hashable because it is used as a dict key.
        y: 1-D sequence of class labels aligned with the rows of ``x``.
        smoothing: Additive smoothing constant. 0 yields plain relative
            frequencies; a positive value yields the smoothed estimate
            ``(count + smoothing) / (total + n_values * smoothing)``.

    Returns:
        A dict with three entries:

        * ``'priors'``: ``{label: P(label)}``, keyed in order of first
          appearance in ``y``.
        * ``'likelihoods'``: ``{label: [ {value: P(value | label)}, ... ]}``
          with one inner dict per feature column. Only (value, label) pairs
          that occur in the data are stored.
        * ``'defaults'``: ``{label: [p, ...]}``, the probability used for any
          value missing from the matching ``'likelihoods'`` dict. It equals
          the formula above with ``count = 0``, so it is 0 when
          ``smoothing`` is 0.

    Raises:
        ValueError: If ``x`` and ``y`` have different lengths.
    """
    if len(x) != len(y):
        raise ValueError('x and y must contain the same number of samples')

    n_features = x.shape[1]
    class_counts = {}  # label -> number of samples with that label
    pair_counts = {}   # label -> one {value: count} dict per feature
    seen_values = [set() for _ in range(n_features)]

    # A single pass over the rows collects every count that is needed.
    for row, label in zip(x, y):
        if label not in class_counts:
            class_counts[label] = 0
            pair_counts[label] = [{} for _ in range(n_features)]
        class_counts[label] += 1
        for j, value in enumerate(row):
            seen_values[j].add(value)
            counts = pair_counts[label][j]
            counts[value] = counts.get(value, 0) + 1

    n_samples = len(y)
    n_classes = len(class_counts)
    priors = {}
    likelihoods = {}
    defaults = {}
    for label, count in class_counts.items():
        priors[label] = (count + smoothing) / (n_samples + n_classes * smoothing)
        likelihoods[label] = []
        defaults[label] = []
        for j in range(n_features):
            # The number of distinct values of feature j scales the smoothing term.
            denominator = count + len(seen_values[j]) * smoothing
            likelihoods[label].append({
                value: (hits + smoothing) / denominator
                for value, hits in pair_counts[label][j].items()
            })
            defaults[label].append(smoothing / denominator)

    return {'priors': priors, 'likelihoods': likelihoods, 'defaults': defaults}


def predict_naive_bayes(model, x):
    """Return the most probable label for every row of ``x``.

    Args:
        model: The dict returned by ``fit_naive_bayes``.
        x: Iterable of feature rows with the same column layout as the
            training data.

    Returns:
        A list with one predicted label per row. When several labels share
        the highest score, the one that appeared first in the training labels
        wins. A feature value that was never observed in training uses the
        model's default probability instead of raising ``KeyError``.
    """
    priors = model['priors']
    likelihoods = model['likelihoods']
    defaults = model['defaults']

    predictions = []
    for row in x:
        scores = {}
        for label, prior in priors.items():
            score = prior
            for value, table, fallback in zip(row, likelihoods[label], defaults[label]):
                score *= table.get(value, fallback)
            scores[label] = score
        predictions.append(max(scores, key=scores.get))
    return predictions


def accuracy(predicted, actual):
    """Return the fraction of positions where ``predicted`` equals ``actual``.

    Labels are compared with ``==``, so any label encoding works (the
    previous sign-of-product check was only correct for labels -1 and +1).

    Raises:
        ValueError: If ``actual`` is empty.
    """
    total = len(actual)
    if total == 0:
        raise ValueError('cannot compute accuracy of an empty label set')
    hits = sum(1 for guess, truth in zip(predicted, actual) if guess == truth)
    return hits / total


# Guarded so the helpers above can be imported without running the evaluation.
if __name__ == '__main__':
    # One model per smoothing constant. Priors always come from the training
    # set, including when scoring the test set.
    for title, c in (('朴素贝叶斯:', 0), ('贝叶斯估计:', 1)):
        model = fit_naive_bayes(x_train, y_train, c)
        train_acc = accuracy(predict_naive_bayes(model, x_train), y_train)
        test_acc = accuracy(predict_naive_bayes(model, x_test), y_test)
        print(title)
        print('Train acc = ',train_acc,' Test acc = ',test_acc)

# 猜你喜欢
#
# 转载自 blog.csdn.net/qinlan1994/article/details/82919246