4.3 Clasificación ingenua bayesiana práctica-clasificación de noticias, reconocimiento de spam

1. Clasificación de noticias con sklearn (conjunto de datos 20newsgroups)

Es posible encontrarse con un tiempo de espera agotado al descargar el conjunto de datos 20newsgroups; mi solución fue usar un proxy/VPN para realizar la descarga.

from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
# 特征抽取
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

if __name__ == '__main__':
    # Load the full 20 newsgroups corpus (downloaded on first use).
    news = fetch_20newsgroups(subset='all')
    # Raw documents and their category labels.
    X = news.data
    y = news.target
    # Hold out 30% of the documents for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

    # TF-IDF feature extraction: fit the vocabulary on the training
    # split only, then apply the same vocabulary to the test split.
    tfidf = TfidfVectorizer()
    X_train = tfidf.fit_transform(X_train)
    # Print the learned vocabulary.
    # get_feature_names() was deprecated in scikit-learn 1.0 and
    # removed in 1.2; get_feature_names_out() is the replacement.
    print(tfidf.get_feature_names_out())
    X_test = tfidf.transform(X_test)

    # Multinomial naive Bayes with Laplace smoothing (alpha=1.0).
    # NOTE: keep X_train sparse — calling .toarray() on the TF-IDF
    # matrix (~18k docs x >100k features) would need tens of GB of
    # RAM, and MultinomialNB accepts sparse input directly.
    model = MultinomialNB(alpha=1.0)
    # Train.
    model.fit(X_train, y_train)
    # Predict on the held-out documents.
    y_pred = model.predict(X_test)
    # Test-set accuracy.
    print('准确率=', model.score(X_test, y_test))


准确率= 0.832507958967103

2. Reconocimiento de spam

Código de reconocimiento de spam:

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


# 获得词汇列表
# Build the vocabulary: the sorted union of every word that appears
# in any document of the corpus.
def createVocabList(dataSet):
    words = set()
    for document in dataSet:
        # Accumulate each document's words into the running union.
        words.update(document)
    # Return the vocabulary in natural sort order.
    return sorted(words)


# 对邮件内容进行预处理
# Tokenize raw e-mail text: split on runs of non-word characters,
# keep only tokens longer than two characters, lower-cased.
def textParse(bigString):
    import re
    # \W+ (one or more) is required here. The original r'\W*' can
    # match the empty string, and on Python 3.7+ re.split then cuts
    # between every character, so no multi-letter token survives.
    listOfTokens = re.split(r'\W+', bigString)
    # Keep tokens longer than 2 characters, normalized to lower case.
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]


# 词袋模型
# Bag-of-words model: map a document onto a count vector aligned
# with vocabList (one slot per vocabulary word).
def bag0fWords2Vec(vocabList, inputSet):
    # Precompute word -> position once; calling list.index() inside
    # the loop would make this O(len(doc) * len(vocab)).
    position = {word: i for i, word in enumerate(vocabList)}
    # Vector of counts, same length as the vocabulary.
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        # Skip out-of-vocabulary words instead of crashing: the
        # original list.index() raised ValueError on unseen words.
        i = position.get(word)
        if i is not None:
            returnVec[i] += 1
    return returnVec


# 读取邮件
# Load the e-mail corpus: 25 messages each from data/email/ham and
# data/email/spam. Returns (X, classList, vocabList) where X is a
# list of bag-of-words count vectors, classList the labels, and
# vocabList the sorted vocabulary built from all documents.
# NOTE(review): files under ham/ get label 1 and files under spam/
# get label 0 — the usual convention is spam=1, and the original
# comments had the two directories swapped; confirm which labeling
# downstream consumers expect before changing it.
def loaddata():
    docList = []
    classList = []

    num = 26
    for i in range(1, num):
        # Files under data/email/ham -> label 1.
        # 'with' guarantees the handle is closed (the original
        # leaked one open file per message).
        with open('data/email/ham/%d.txt' % i) as f:
            docList.append(textParse(f.read()))
        classList.append(1)

        # Files under data/email/spam -> label 0.
        with open('data/email/spam/%d.txt' % i) as f:
            docList.append(textParse(f.read()))
        classList.append(0)

    vocabList = createVocabList(docList)

    # Vectorize every document against the shared vocabulary.
    X = [bag0fWords2Vec(vocabList, doc) for doc in docList]

    return X, classList, vocabList


if __name__ == '__main__':
    # Build the bag-of-words dataset from the e-mail corpus.
    X, y, vocabList = loaddata()

    # Hold out 30% of the messages for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.7)

    # Multinomial naive Bayes with Laplace smoothing.
    classifier = MultinomialNB(alpha=1.0)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    print('accuracy =', accuracy_score(y_test, predictions))

accuracy = 1.0

Supongo que te gusta

Origin blog.csdn.net/weixin_46649052/article/details/112546134
Recomendado
Clasificación