Naive Bayes Method-Spam Classification

This article builds a spam classification model based on Naive Bayes; the data set consists of English-language emails.

The email contents are saved as txt files and split into a training set (train) and a test set (test).
In the training set, normal mail is stored in the folder pos and spam is stored in the folder neg.
Likewise, the test samples to be classified are placed under pos or neg inside the test folder. If a sample is spam, its category is 0; otherwise its category is 1.

Naive Bayes spam classification rests on one theorem and one assumption:
  Bayes' theorem: computing the posterior p(c|x) is reduced to computing the likelihood p(x|c), because $p(c \mid x) = \frac{p(x \mid c)\, p(c)}{p(x)}$.
  Conditional independence assumption: the n features of x are conditionally independent given the class c, i.e. $p(x \mid c) = \prod_{i=1}^{n} p(x_i \mid c)$.
For details, please see: [Supervised Learning] Naive Bayes Method .
Data on GitHub: Data and Code .

1 Mail classification algorithm steps

1 Build a vocabulary list (collect every word that appears in the training emails.)
2 Build a vocabulary vector for each email (use the full vocabulary to record which words each email contains.)
3 Calculate the prior probability p(c) of each class
4 Calculate the conditional probability of each vocabulary word given the class
5 Classify the test samples

Complete program:

# -*- coding: utf-8 -*-
# @Time    : 2020/4/16 21:09
# @Author  : Zudy

'''
1. Spam email classification based on Naive Bayes
'''

from sklearn import datasets
from time import time
import numpy
import re
import os
import random

def load_data(folder_path):
    """Load a folder-per-category text corpus and print a short summary.

    The folder is read with ``sklearn.datasets.load_files``. Of the returned
    Bunch, this function itself only reads ``data`` (raw document contents)
    and ``target_names`` (category names taken from the sub-directories).

    Args:
        folder_path: Directory whose sub-directories each hold one category.

    Returns:
        The Bunch returned by ``load_files``, unmodified (no stop-word
        removal is applied).
    """
    # NOTE(review): load_files is called with its default arguments; confirm
    # that the document order it returns matches what the positional
    # per-class counting elsewhere in this file expects.
    print("Loading dataset ...")
    started_at = time()
    bunch = datasets.load_files(folder_path)

    # Report corpus size, then how long loading took.
    doc_total = len(bunch.data)
    category_total = len(bunch.target_names)
    print("summary: {0} documents in {1} categories.".format(doc_total, category_total))
    print("Load data in {0}seconds".format(time() - started_at))
    return bunch

# Build the vocabulary of a corpus and return it as a list.
def word_create(ori_data):
    """Collect the distinct lower-cased tokens of every document.

    Each document is decoded as UTF-8 (if it is not already ``str``), runs of
    whitespace/punctuation are replaced by a single space, and the text is
    lower-cased and split on whitespace.

    Args:
        ori_data: Object with a ``data`` attribute holding the documents
            (``bytes`` or ``str``).

    Returns:
        A list of the unique tokens; the order is unspecified because the
        tokens are gathered in a set.

    Raises:
        UnicodeDecodeError: If a bytes document is not valid UTF-8.
    """
    print("\nVectorzing dataset ...")
    # Raw string: same pattern as before, without invalid "\s"-style string
    # escapes. Compiled once instead of once per document.
    separators = re.compile(r"[\s+\.\!\/_,$%^*(+\"\'-]+|[+——!,。?、~@#¥%……&*()<>]+")
    vocabulary = set()
    vectorTime = time()
    for doc in ori_data.data:
        # Documents normally arrive as bytes; accept already-decoded text too.
        if not isinstance(doc, str):
            doc = str(doc, encoding="utf-8")
        vocabulary.update(separators.sub(" ", doc).lower().split())
    # Report the elapsed time (not the start timestamp) and the vocabulary size.
    print("Vectorzing time:{0}\nThe number of word_dictionary:{1}".format(time() - vectorTime, len(vocabulary)))
    return list(vocabulary)

def doc_represent(wordDic,ori_data):
    """Encode every document as a binary word-presence vector.

    Documents are tokenised exactly as in ``word_create`` (UTF-8 decode,
    punctuation replaced by spaces, lower-cased, split on whitespace).

    Args:
        wordDic: Sequence of vocabulary words; position ``k`` becomes column ``k``.
        ori_data: Object with a ``data`` attribute holding the documents
            (``bytes`` or ``str``).

    Returns:
        Integer ``numpy`` array of shape ``(len(ori_data.data), len(wordDic))``
        where entry ``[d][k]`` is 1 if ``wordDic[k]`` occurs in document ``d``
        and 0 otherwise. Tokens that are not in ``wordDic`` are ignored.

    Raises:
        UnicodeDecodeError: If a bytes document is not valid UTF-8.
    """
    separators = re.compile(r"[\s+\.\!\/_,$%^*(+\"\'-]+|[+——!,。?、~@#¥%……&*()<>]+")
    # The builtin int replaces the removed ``numpy.int`` alias (same dtype).
    doc_re = numpy.zeros((len(ori_data.data), len(wordDic)), dtype=int)
    # Word -> column lookup table; the first occurrence wins, like list.index.
    column_of = {}
    for column, word in enumerate(wordDic):
        column_of.setdefault(word, column)
    representTime = time()
    for row, doc in enumerate(ori_data.data):
        if not isinstance(doc, str):
            doc = str(doc, encoding="utf-8")
        for word in separators.sub(" ", doc).lower().split():
            column = column_of.get(word)
            if column is not None:
                # Mark the word as present in this document.
                doc_re[row][column] = 1
    # Elapsed time is "now - start", so it is reported as a positive number.
    print("Represent doc time:{0}\nThe number of doc:{1}".format(time() - representTime, len(doc_re)))
    return doc_re

def pre_probabilty(ori_data, normal_count=None, spam_count=None):
    """Return the smoothed prior probabilities ``[P(normal), P(spam)]``.

    Each prior is ``(class_count + 1) / (len(ori_data.data) + 2)``.

    Args:
        ori_data: Object with a ``data`` attribute; only its length is used.
        normal_count: Number of normal emails. When omitted, the module-level
            ``normal`` is used.
        spam_count: Number of spam emails. When omitted, the module-level
            ``spam`` is used.

    Returns:
        A two-element list: the prior of normal mail, then the prior of spam.
    """
    # Fall back to the globals set by the script entry point so existing
    # one-argument calls keep working.
    if normal_count is None:
        normal_count = normal
    if spam_count is None:
        spam_count = spam
    denominator = len(ori_data.data) + 2.0
    return [(normal_count + 1.0) / denominator, (spam_count + 1.0) / denominator]

# Count, per class, how many emails contain each vocabulary word.
def wordNum_email(email_repre, wordDic, normal_count=None, spam_count=None):
    """Sum the word-presence columns separately for normal and spam emails.

    Rows ``0 .. normal_count-1`` of ``email_repre`` are treated as normal
    emails and the following ``spam_count`` rows as spam.

    Args:
        email_repre: 2-D array-like of per-email word indicators with at
            least ``len(wordDic)`` columns.
        wordDic: Vocabulary; only its length is used.
        normal_count: Number of leading rows that are normal emails. When
            omitted, the module-level ``normal`` is used.
        spam_count: Number of following rows that are spam. When omitted,
            the module-level ``spam`` is used.

    Returns:
        Integer array of shape ``(2, len(wordDic))``: row 0 holds the counts
        over normal emails, row 1 the counts over spam emails.

    Raises:
        IndexError: If ``email_repre`` has fewer rows or columns than the
            counts and vocabulary require.
    """
    vocab_size = len(wordDic)
    # The builtin int replaces the removed ``numpy.int`` alias (same dtype).
    num_word = numpy.zeros((2, vocab_size), dtype=int)
    if vocab_size == 0:
        return num_word
    # Fall back to the globals set by the script entry point so existing
    # two-argument calls keep working.
    if normal_count is None:
        normal_count = normal
    if spam_count is None:
        spam_count = spam
    labelled = normal_count + spam_count
    if labelled == 0:
        return num_word
    matrix = numpy.asarray(email_repre)
    if matrix.ndim != 2 or matrix.shape[0] < labelled or matrix.shape[1] < vocab_size:
        raise IndexError(
            "email_repre must provide at least {0} rows and {1} columns".format(labelled, vocab_size)
        )
    # Column sums replace the element-by-element Python loops.
    num_word[0] = matrix[:normal_count, :vocab_size].sum(axis=0)
    num_word[1] = matrix[normal_count:labelled, :vocab_size].sum(axis=0)
    return num_word

# Conditional probability of each vocabulary word given the class.
def con_probabilty(email_repre,wordDic):
    """Return smoothed per-class word probabilities, rounded to 8 decimals.

    The per-class word counts come from ``wordNum_email``. Entry ``[0][k]`` is
    ``(count in normal mail + 1) / (normal + 2)`` and entry ``[1][k]`` is
    ``(count in spam + 1) / (spam + 2)``, where ``normal`` and ``spam`` are
    module-level document counts.

    Args:
        email_repre: Per-email word indicator matrix, passed to ``wordNum_email``.
        wordDic: Vocabulary list.

    Returns:
        ``numpy.double`` array of shape ``(2, len(wordDic))``.
    """
    counts = wordNum_email(email_repre, wordDic)
    probabilities = numpy.zeros((2, len(wordDic)), dtype=numpy.double)
    # Walk both count rows in lockstep, one vocabulary column at a time.
    for column, (normal_hits, spam_hits) in enumerate(zip(counts[0], counts[1])):
        probabilities[0][column] = round((normal_hits + 1) / (normal + 2), 8)
        probabilities[1][column] = round((spam_hits + 1) / (spam + 2), 8)
    return probabilities

# Number of documents stored for one category.
def class_num(path,class_name):
    """Count the files under ``path + "/" + class_name``, including nested folders.

    Args:
        path: Root directory of the data set.
        class_name: Name of the category sub-directory.

    Returns:
        Total number of files found by ``os.walk``; 0 when the directory does
        not exist or holds no files.
    """
    category_dir = path + "/" + class_name
    # Every walk step yields the file names of one directory; add up their counts.
    return sum(len(file_names) for _root, _dirs, file_names in os.walk(category_dir))

#测试
def test_spam(test_repre,pre_pro,con_pro):
    email_pro = numpy.zeros((len(test_repre),2),dtype = numpy.double)
    email_judge = []
    normal_num = 0
    spam_num = 0
    for i in range(len(test_repre)):
        email_pro[i][0] = round(pre_pro[0],8)
        email_pro[i][1] = round(pre_pro[1],8)
        for j in range(len(test_repre[0])):
            if test_repre[i][j] != 0:
                email_pro[i][0] *= con_pro[0][j]
                email_pro[i][1] *= con_pro[1][j]
        if email_pro[i][0] > email_pro[i][1] :
            email_judge.append(0)
        elif email_pro[i][0] < email_pro[i][1] :
            email_judge.append(1)
        else :
            if random.random() > 0.5:
                email_judge.append(1)
            else:
                email_judge.append(0)
    for i in range(normal_test):
        if email_judge[i] == 0:
            normal_num +=1
    for i in range(normal_test,len(test_repre)):
        if email_judge[i] == 1:
            spam_num +=1
    print("email_judge=")
    print(email_judge)
    print("normal_num="+str(normal_num)+"\nspam_num="+str(spam_num))
    return (normal_num + spam_num)/len(test_repre)

if __name__ == "__main__":
    # Locations of the training and test corpora.
    train_path = "D:/Python/Python_learning/Book_code/LH_mechine_learning/bayes/spamDataset/email/train1"
    test_path = "D:/Python/Python_learning/Book_code/LH_mechine_learning/bayes/spamDataset/email/test1"
    train_list = load_data(train_path)
    test_list = load_data(test_path)

    # Training-set sizes per class. These module-level names are read by
    # pre_probabilty, wordNum_email and con_probabilty.
    normal = class_num(train_path, "pos")  # number of normal emails
    spam = class_num(train_path, "neg")  # number of spam emails

    # Train: vocabulary -> document vectors -> prior and conditional probabilities.
    vocabulary = word_create(train_list)
    train_matrix = doc_represent(vocabulary, train_list)
    priors = pre_probabilty(train_list)
    likelihoods = con_probabilty(train_matrix, vocabulary)
    print("\npreProbablity:", priors)
    print("conProbablity:", likelihoods)

    # Test: vectorise the test emails with the training vocabulary.
    test_matrix = doc_represent(vocabulary, test_list)
    normal_test = class_num(test_path, "pos")  # normal test emails; read by test_spam
    spam_test = class_num(test_path, "neg")  # spam test emails

    # Accuracy on the test set.
    test_accuracy = test_spam(test_matrix, priors, likelihoods)
    print("test accuracy", test_accuracy, sep="\n")

2 Test results

Loading dataset ...
summary: 43 documents in 2 categories.
Load data in 0.008994102478027344seconds
Loading dataset ...
summary: 36 documents in 2 categories.
Load data in 0.007995843887329102seconds

Vectorzing dataset ...
Vectorzing time:1587043343.3496442
The number of word_dictionary:2426
Represent doc time:-0.5976784229278564
The number of doc:43

preProbablity: [0.5777777777777777, 0.4222222222222222]
conProbablity: [[0.59259259 0.07407407 0.03703704 ... 0.11111111 0.07407407 0.03703704]
 [0.7        0.05       0.15       ... 0.05       0.05       0.1       ]]
Represent doc time:-0.5137045383453369
The number of doc:36
email_judge=
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1]
normal_num=16
spam_num=13
test accuracy
0.8055555555555556

References:
Link: Spam classification based on Naive Bayes .

Guess you like

Origin blog.csdn.net/qq_41709378/article/details/105565577