Naive Bayes - Algorithm

Naive Bayes is typically applied to discrete features, i.e., each feature takes values from a finite set. Given a labeled training set, we estimate the prior probability of each class label and the conditional probability of each feature value given each class. To classify a new sample, we compute the posterior probability of each class from the sample's features and predict the class with the maximum posterior.

Bayesian estimation (Laplace smoothing) is usually used to prevent any probability from being 0: a constant lambda is added to each feature's count in the numerator and Sj * lambda (Sj = number of possible values of feature j) to the denominator. Log probabilities are used so that products of many small probabilities do not underflow and become indistinguishable.

Applications: filtering malicious posts on a message board, and filtering junk e-mail.

  1. Build a vocabulary list from the training documents.
  2. Convert each training document into a binary row vector over the vocabulary (1 if the word is present, 0 if not; or use additive word counts for the bag-of-words model).
  3. From the row vectors, compute the prior probability of each class and the conditional probability of each feature within each class (use logs, and non-zero initial counts, so that no class-conditional probability is 0).
  4. For a new input, compute the posterior probability of each class and predict the class with the maximum posterior.
import numpy as np

"""过滤恶意留言"""
def loadDataSet():
    """Return the toy message-board corpus and its labels.

    Returns:
        postingList: list of tokenized posts (each post is a list of words).
        classVec: parallel label list, 1 = abusive post, 0 = normal.
    """
    sentences = [
        'my dog has flea problems help please',
        'maybe not take him to dog park stupid',
        'my dalmation is so cute I love him',
        'stop posting stupid worthless garbage',
        'mr licks ate my steak how to stop him',
        'quit buying worthless dog food stupid',
    ]
    postingList = [line.split() for line in sentences]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 is abusive, 0 not
    return postingList, classVec


def createVocabList(dataSet):
    """Return the list of unique words appearing in any document of dataSet."""
    vocab = set()
    for document in dataSet:
        vocab.update(document)  # accumulate the union of all word sets
    return list(vocab)


def setOfWords2Vec(vocabList, inputSet):
    """Encode inputSet as a binary presence vector over vocabList.

    Words not in the vocabulary are silently ignored.
    """
    present = set(inputSet)
    return [1 if word in present else 0 for word in vocabList]


def bagOfWords2VecMN(vocabList, inputSet):
    """Encode inputSet as a word-count (bag-of-words) vector over vocabList."""
    counts = {}
    for token in inputSet:
        # The same word may occur several times; tally every occurrence.
        counts[token] = counts.get(token, 0) + 1
    return [counts.get(word, 0) for word in vocabList]

def trainNB0(trainMatrix, trainCategory):
    """Estimate naive-Bayes parameters with Laplace (add-one) smoothing.

    Args:
        trainMatrix: document-term matrix, one row per document.
        trainCategory: parallel 0/1 label vector (1 = abusive).

    Returns:
        (p0Vect, p1Vect, pAbusive): per-class log word probabilities and
        the prior probability of class 1.
    """
    mat = np.asarray(trainMatrix)
    labels = np.asarray(trainCategory)
    pAbusive = np.sum(labels) / float(len(mat))
    rows_1 = mat[labels == 1]
    rows_0 = mat[labels == 0]
    # +1 in the numerator and +2 in the denominator keep every probability
    # non-zero; logs prevent underflow when many small terms are multiplied.
    p1Vect = np.log((rows_1.sum(axis=0) + 1.0) / (rows_1.sum() + 2.0))
    p0Vect = np.log((rows_0.sum(axis=0) + 1.0) / (rows_0.sum() + 2.0))
    return p0Vect, p1Vect, pAbusive


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Return 1 if vec2Classify scores higher under class 1, else 0.

    Each score is a log posterior up to a shared constant:
    log P(class) plus the dot product of the document vector with the
    class's log word probabilities.
    """
    score1 = np.log(pClass1) + np.sum(vec2Classify * p1Vec)
    score0 = np.log(1.0 - pClass1) + np.sum(vec2Classify * p0Vec)
    return 1 if score1 > score0 else 0

if __name__ == '__main__':
    # Train the naive-Bayes message filter on the toy corpus.
    postingList, classVec = loadDataSet()
    vocabSet = createVocabList(postingList)
    trainMatrix = [setOfWords2Vec(vocabSet, postinDoc) for postinDoc in postingList]
    p0, p1, pAb = trainNB0(trainMatrix, classVec)

    # Classify the two held-out sample documents.
    # BUGFIX: the original built t1/t2 but then looped over postingList,
    # so the test samples were never classified.
    test_docs = [['love', 'my', 'dalmation'],
                 ['stupid', 'garbage']]
    for doc in test_docs:
        doc_vec = setOfWords2Vec(vocabSet, doc)
        result = classifyNB(doc_vec, p0, p1, pAb)
        print(result)

 

import numpy as np
import re
import os
from ex04_1_bayes import *

"""过滤垃圾邮件"""
def textParse(bigString):
    """Split raw text into lowercase word tokens longer than two characters."""
    # findall(r'\w+') yields the same tokens as splitting on \W+, without
    # the empty edge strings (which the length filter would drop anyway).
    return [tok.lower() for tok in re.findall(r'\w+', bigString) if len(tok) > 2]

def spamTest():
    """Train a naive-Bayes spam filter on the MLiA email corpus and print
    predictions, true labels, and accuracy on a random held-out split.

    NOTE(review): paths are hard-coded to a local Windows layout, and the
    40-train / rest-test split assumes the canonical 25 ham + 25 spam corpus.
    """
    path_ham = r'H:\ML\MachineLearninginAction\04bayes\email\ham'
    path_spam = r'H:\ML\MachineLearninginAction\04bayes\email\spam'
    ham_files = os.listdir(path_ham)
    spam_files = os.listdir(path_spam)
    # Labels parallel the read order below: 0 = ham first, then 1 = spam.
    labels_set = [0] * len(ham_files) + [1] * len(spam_files)
    data_set = []
    for folder, names in ((path_ham, ham_files), (path_spam, spam_files)):
        for name in names:
            # errors='ignore': the canonical corpus contains a file with a
            # non-UTF-8 byte that would otherwise crash the read.
            with open(os.path.join(folder, name), errors='ignore') as f:
                data_set.append(textParse(f.read()))

    vocab_set = createVocabList(data_set)  # build the vocabulary
    train_mat = np.array([setOfWords2Vec(vocab_set, data) for data in data_set])
    labels_set = np.array(labels_set)
    rand_index = np.random.permutation(len(train_mat))  # shuffle indices
    train_idx, test_idx = rand_index[:40], rand_index[40:]
    p0, p1, pAb = trainNB0(train_mat[train_idx], labels_set[train_idx])

    # BUGFIX: the original stored each prediction in a variable named `re`,
    # shadowing the imported `re` module.
    test_results = [classifyNB(row, p0, p1, pAb) for row in train_mat[test_idx]]
    print(test_results)
    print(labels_set[test_idx])
    acc = float(np.mean(np.array(test_results) == labels_set[test_idx]))
    print(acc)


if __name__ == '__main__':
    # Entry point for the spam-filter script (second half of this file).
    spamTest()

 

Published 46 original articles · won praise 0 · Views 1044

Guess you like

Origin blog.csdn.net/weixin_37680513/article/details/103026643