4.3 Naive Bayes classification in practice: news classification and spam recognition

1. Classifying the 20 newsgroups dataset with sklearn

You may hit a timeout when fetch_20newsgroups first downloads the 20newsgroups dataset; my solution was to route the download through a proxy/VPN.
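A hedged alternative to switching networks: scikit-learn downloads the archive with urllib, which honors the standard proxy environment variables, so you can point the fetch at a local proxy from inside the script. A minimal sketch (the proxy address below is hypothetical; data_home just caches the files in a directory of your choice):

import os
from sklearn.datasets import fetch_20newsgroups

# hypothetical local proxy; replace with whatever proxy you actually run
os.environ['HTTP_PROXY'] = 'http://127.0.0.1:7890'
os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:7890'

# cache the downloaded dataset under ./sklearn_data so it is only fetched once
news = fetch_20newsgroups(subset='all', data_home='./sklearn_data')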

from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
# feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

if __name__ == '__main__':
    # load the dataset
    news = fetch_20newsgroups(subset='all')
    # features and target
    X = news.data
    y = news.target
    # split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

    # feature extraction: fit TF-IDF on the training text only
    tfidf = TfidfVectorizer()
    X_train = tfidf.fit_transform(X_train)
    # print the vocabulary (get_feature_names() was removed in scikit-learn 1.2)
    print(tfidf.get_feature_names_out())
    X_test = tfidf.transform(X_test)

    # multinomial naive Bayes with Laplace smoothing (alpha=1.0)
    model = MultinomialNB(alpha=1.0)
    # X_train is a large sparse matrix; densifying it with toarray() can exhaust memory
    print(X_train.shape)
    # train
    model.fit(X_train, y_train)
    # predict
    y_pred = model.predict(X_test)
    # accuracy
    print('accuracy =', model.score(X_test, y_test))


accuracy = 0.832507958967103
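A single accuracy number hides how the 20 classes differ, so it can be worth printing a per-class report as well. A minimal sketch, assuming the script above has already run (news, y_test and y_pred are still in scope):

from sklearn.metrics import classification_report

# precision, recall and F1 for each of the 20 newsgroups
print(classification_report(y_test, y_pred, target_names=news.target_names))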

2. Spam recognition

Spam email dataset download, extraction code: 1234

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


# build the vocabulary list
def createVocabList(dataSet):
    vocabSet = set()
    for document in dataSet:
        # union of the running vocabulary and this document's words
        vocabSet = vocabSet | set(document)
    # return the vocabulary in sorted order
    return sorted(list(vocabSet))
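# For example (hypothetical input):
#   createVocabList([['hi', 'bob'], ['hi', 'sue']])  ->  ['bob', 'hi', 'sue']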


# preprocess the body of an email into tokens
def textParse(bigString):
    import re
    # split on runs of non-word characters (the original r'\W*' pattern
    # splits on empty matches in Python 3.7+ and yields single characters)
    listOfTokens = re.split(r'\W+', bigString)
    # keep tokens longer than 2 characters, lowercased
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
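# For example (hypothetical input):
#   textParse('Hello, World! ML is fun')  ->  ['hello', 'world', 'fun']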


# bag-of-words model: map a document to a vector of word counts
def bagOfWords2Vec(vocabList, inputSet):
    # initialize a vector with the same length as the vocabulary
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        # count occurrences at the word's position in the vocabulary,
        # skipping words that are not in the vocabulary
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec
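# For example (hypothetical input):
#   bagOfWords2Vec(['bob', 'hi', 'sue'], ['hi', 'hi', 'bob'])  ->  [1, 2, 0]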


# read the emails
def loaddata():
    docList = []
    classList = []

    # 25 emails in each class (files 1.txt .. 25.txt)
    num = 26
    for i in range(1, num):
        # read a ham (non-spam) email and label it 0; a permissive encoding
        # guards against non-UTF-8 bytes in some messages
        wordList = textParse(open('data/email/ham/%d.txt' % i, encoding='ISO-8859-1').read())
        docList.append(wordList)
        classList.append(0)

        # read a spam email and label it 1
        wordList = textParse(open('data/email/spam/%d.txt' % i, encoding='ISO-8859-1').read())
        docList.append(wordList)
        classList.append(1)

    vocabList = createVocabList(docList)

    # turn every document into a bag-of-words count vector
    X = []
    for docIndex in range(len(docList)):
        X.append(bagOfWords2Vec(vocabList, docList[docIndex]))

    return X, classList, vocabList
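# Note: with 25 ham + 25 spam emails, X ends up as a 50 x len(vocabList)
# count matrix and classList holds the 50 matching labels.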


if __name__ == '__main__':
    # read the emails
    X, y, vocabList = loaddata()

    # split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

    # multinomial naive Bayes with Laplace smoothing
    model = MultinomialNB(alpha=1.0)
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)
    print('accuracy =', accuracy_score(y_test, y_hat))

accuracy = 1.0
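With only 50 emails in total, the held-out test set has just 15 messages, so a perfect score here is not surprising. To classify a new message, you can reuse the same preprocessing pipeline; a minimal sketch, assuming the script above has run (model and vocabList in scope, and the message text below is made up):

# an invented message, just to exercise the pipeline
msg = 'Win a free prize now, click here to claim your reward'
vec = bagOfWords2Vec(vocabList, textParse(msg))
print('spam' if model.predict([vec])[0] == 1 else 'ham')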

Origin blog.csdn.net/weixin_46649052/article/details/112546134