Naive Bayes: Algorithm Implementation

Naive Bayes is typically used when every feature takes values from a discrete set. Given a training set and its labels, estimate the probability of each class label, then the conditional probability of each feature value under each class. For a sample to be classified, compute the (unnormalized) posterior under each class from its features and predict the class with the largest value.
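
Written out, this is the usual maximum a posteriori rule (standard textbook notation, with x^(j) the j-th feature of the input and c_k the k-th class):

    \hat{y} = \arg\max_{c_k} P(Y = c_k) \prod_{j=1}^{n} P(X^{(j)} = x^{(j)} \mid Y = c_k)

The features are assumed conditionally independent given the class, which is what lets the likelihood factor into this product; the evidence P(X = x) is the same for every class and is dropped.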

Bayesian estimation is usually applied to keep any estimated probability from being zero: add lambda to each count in the numerator and S_j * lambda to the denominator, where S_j is the number of values feature j can take. Log probabilities are used so that products of many small probabilities do not underflow and become indistinguishable.
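
In the standard notation for this estimator (Lidstone smoothing; \lambda = 1 gives Laplace smoothing), the smoothed conditional probability is

    P_\lambda(X^{(j)} = a_{jl} \mid Y = c_k) = \frac{\sum_{i=1}^{N} I(x_i^{(j)} = a_{jl},\, y_i = c_k) + \lambda}{\sum_{i=1}^{N} I(y_i = c_k) + S_j \lambda}

where I(\cdot) is the indicator function and N the number of training samples. Working in log space then turns products of conditionals into sums, which do not underflow the way products of many numbers below 1 do.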

Two applications follow: filtering abusive posts on a message board, and filtering spam email.

  1. Build a vocabulary list from the training data.
  2. Turn each training document into a vector over that vocabulary (1 if a word is present, 0 if not; or accumulate counts for a bag-of-words model).
  3. From these vectors compute the quantities needed for classification: the class priors and the per-class probability of each feature (use log, or non-zero initial counts, so that no probability is zero at classification time).
  4. To classify a new input, compute the posterior under each class and predict the class with the largest value.

The script below walks through these four steps on the message-board example.
import numpy as np

"""过滤恶意留言"""
def loadDataSet():
    postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0,1,0,1,0,1]    #1 is abusive, 0 not
    return postingList,classVec


def createVocabList(dataSet):  # dataSet: m documents, each a list of tokens
    vocabSet = set()  #create empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document) #union of the two sets
    return list(vocabSet)


def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
    return returnVec


def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1  # count repeated occurrences of the same word
    return returnVec
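
A quick check of what the two vectorizers return (vocab and doc here are made-up inputs for illustration):

vocab = ['dog', 'stupid', 'my']
doc = ['my', 'dog', 'stupid', 'dog']
print(setOfWords2Vec(vocab, doc))    # [1, 1, 1]  -- presence only
print(bagOfWords2VecMN(vocab, doc))  # [2, 1, 1]  -- 'dog' counted twice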

def trainNB0(trainMatrix,trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = np.sum(trainCategory)/float(numTrainDocs)
    p0Num = np.ones(numWords); p1Num = np.ones(numWords)  # init counts to 1 (Laplace smoothing)
    p0Denom = 2.0; p1Denom = 2.0                          # init denominators to 2.0 (each word indicator takes 2 values)
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += np.sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += np.sum(trainMatrix[i])
    p1Vect = np.log(p1Num/p1Denom)          # take log to avoid floating-point underflow
    p0Vect = np.log(p0Num/p0Denom)          # take log to avoid floating-point underflow
    return p0Vect,p1Vect,pAbusive
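
trainNB0 hard-codes Laplace smoothing (initial counts of 1.0 and a denominator offset of 2.0, following Machine Learning in Action). A minimal sketch of the same training step with the smoothing parameter made explicit; this generalized version is my own addition, not part of the original post:

def trainNB_lambda(trainMatrix, trainCategory, lam=1.0):
    """Like trainNB0, but with an explicit Lidstone smoothing parameter.
    lam=1.0 reproduces trainNB0 exactly."""
    trainMatrix = np.asarray(trainMatrix)
    trainCategory = np.asarray(trainCategory)
    pAbusive = trainCategory.mean()        # prior P(class = 1) for 0/1 labels
    mat1 = trainMatrix[trainCategory == 1]
    mat0 = trainMatrix[trainCategory == 0]
    # the 2*lam offset keeps the book's "+2.0" denominator when lam = 1
    p1Vect = np.log((mat1.sum(axis=0) + lam) / (mat1.sum() + 2*lam))
    p0Vect = np.log((mat0.sum(axis=0) + lam) / (mat0.sum() + 2*lam))
    return p0Vect, p1Vect, pAbusive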


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1)    #element-wise mult
    p0 = np.sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

if __name__ == '__main__':
    postingList, classVec = loadDataSet()
    vocabSet = createVocabList(postingList)
    trainMatrix =[]
    for postinDoc in postingList:
        trainMatrix.append(setOfWords2Vec(vocabSet, postinDoc))

    p0, p1, pAb = trainNB0(trainMatrix,classVec)
    t1 = ['love', 'my', 'dalmation']
    t2 = ['stupid', 'garbage']
    t = [t1,t2]
    for ti in t:  # classify the two test posts, not the training documents
        vec = setOfWords2Vec(vocabSet, ti)
        result = classifyNB(vec, p0, p1, pAb)
        print(ti, 'classified as:', result)  # t1 -> 0 (normal), t2 -> 1 (abusive)

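The spam filter below is a second script in its own file; it imports the functions above, which are assumed to be saved as ex04_1_bayes.py (matching the import on the next lines).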
import numpy as np
import re
import os
from ex04_1_bayes import *

"""过滤垃圾邮件"""
def textParse(bigString):    #input is big string, #output is word list
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
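
A quick check of the tokenizer (the input string is made up for illustration):

print(textParse('Hello, Mr. Smith: machine learning in action!'))
# -> ['hello', 'smith', 'machine', 'learning', 'action']  ('Mr' and 'in' are too short)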

def spamTest():
    path_ham = r'H:\ML\MachineLearninginAction\04bayes\email\ham'
    path_spam = r'H:\ML\MachineLearninginAction\04bayes\email\spam'
    ham_files = os.listdir(path_ham)
    spam_files = os.listdir(path_spam)
    num_of_ham = len(ham_files)
    num_of_spam = len(spam_files)  # list the files in each directory
    data_set = []
    labels_set = [0]*num_of_ham
    labels_set.extend([1]*num_of_spam)  # labels: 0 = ham, 1 = spam
    # encoding='latin-1' is an addition here: some emails in this corpus
    # contain bytes that are not valid UTF-8
    for i in range(num_of_ham):
        with open(os.path.join(path_ham, ham_files[i]), encoding='latin-1') as f:
            data_set.append(textParse(f.read()))
    for i in range(num_of_spam):
        with open(os.path.join(path_spam, spam_files[i]), encoding='latin-1') as f:
            data_set.append(textParse(f.read()))  # tokenize every email into the data set

    vocab_set = createVocabList(data_set)  # build the vocabulary
    train_mat = []
    for data in data_set:
        train_mat.append(setOfWords2Vec(vocab_set,data))
    rand_index = np.arange(len(train_mat))
    np.random.shuffle(rand_index)  # shuffle the indices
    train_mat = np.array(train_mat)
    labels_set = np.array(labels_set)
    train_mat_select = train_mat[rand_index[:40]]
    labels_set_select = labels_set[rand_index[:40]]  # first 40 shuffled emails as the training set
    p0, p1, pAb = trainNB0(train_mat_select, labels_set_select)  # fit the model
    test_set_select = train_mat[rand_index[40:]]
    test_labels_select = labels_set[rand_index[40:]]  # remaining 10 as the test set
    test_results = []
    for test_vec in test_set_select:
        test_results.append(classifyNB(test_vec, p0, p1, pAb))  # classify each test email
    print(test_results)
    print(test_labels_select)
    acc = np.mean(np.array(test_results) == test_labels_select)  # fraction classified correctly
    print(acc)
    return acc


if __name__ == '__main__':
    spamTest()
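
A single random 40/10 split gives a noisy accuracy estimate, so it is common (Machine Learning in Action does this too) to average over several random splits. A minimal sketch, assuming spamTest returns its accuracy as in the version above, used in place of the driver block:

if __name__ == '__main__':
    accs = [spamTest() for _ in range(10)]
    print('mean accuracy over 10 random splits: %.3f' % np.mean(accs))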