自然语言处理--PCA+LDA构建垃圾消息过滤器

PCA 和 LDA 结合起来有助于创建一个精确的、泛化能力强的短消息分类模型,这样面对新的短消息时出错概率大幅降低:

import numpy as np
import pandas as pd
from nlpia.data.loaders import get_data
from nltk.tokenize.casual import casual_tokenize
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA

# Load the SMS spam dataset from the nlpia package as a DataFrame
# (columns: `spam` label 0/1, `text` message body).
pd.options.display.width = 120
sms = get_data('sms-spam')
# Tag each row label with one '!' per spam flag so spam rows are easy to
# spot when the DataFrame is printed.
index = [f"sms{i}{'!' * flag}" for i, flag in enumerate(sms.spam)]
sms.index = index
print(sms.head(6))

# Turn every message into a TF-IDF row vector using the casual tokenizer.
tfidf = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_docs = tfidf.fit_transform(raw_documents=sms.text).toarray()
# Vocabulary size: number of distinct 1-gram tokens casual_tokenize produced
# (9232 on this dataset).
print(len(tfidf.vocabulary_))
# Mean-center the bag-of-words vectors column-wise before any modeling.
tfidf_docs = pd.DataFrame(tfidf_docs)
tfidf_docs -= tfidf_docs.mean()

# Fit a one-component LDA classifier directly on the high-dimensional
# TF-IDF vectors and evaluate it on its own training data.
lda = LDA(n_components=1)
lda = lda.fit(tfidf_docs, sms.spam)
sms['lda_spaminess'] = lda.predict(tfidf_docs)
# RMSE of the 0/1 predictions against the labels.
# BUG FIX: the original used .sum() ** .5, which is the L2 norm of the
# error vector, not a root-*mean*-square error; RMSE takes the mean first.
print(((sms.spam - sms.lda_spaminess) ** 2.).mean() ** .5)  # rmse
# Number of correctly classified messages, out of the total set size.
print((sms.spam == sms.lda_spaminess).sum())
print(len(sms))

# Cross-validate the LDA model on the raw TF-IDF features (5 folds).
lda = LDA(n_components=1)
scores = cross_val_score(lda, tfidf_docs, sms.spam, cv=5)
# Held-out accuracy is far worse than the in-sample fit above --
# never get excited about a model's performance on its own training set.
mean_acc = scores.mean()
spread = scores.std() * 2
print("Accuracy: {:.2f} (+/-{:.2f})".format(mean_acc, spread))

# Hold back one third of the dataset as a proper test set.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_docs, sms.spam, test_size=0.33, random_state=271828)
lda = LDA(n_components=1).fit(X_train, y_train)
print(lda)
# Test-set accuracy, rounded to 3 decimal places.
print(lda.score(X_test, y_test).round(3))

# Next, check whether LSA+LDA yields a more accurate, better-generalizing
# model. LSA step: project the 9232-dim TF-IDF vectors down to 16 PCA
# topic dimensions.
pca = PCA(n_components=16).fit(tfidf_docs)
pca_topicvectors = pca.transform(tfidf_docs)
columns = [f'topic{n}' for n in range(pca.n_components)]
pca_topicvectors = pd.DataFrame(pca_topicvectors, columns=columns, index=index)

# Repeat the held-out evaluation, now on the 16-dim topic vectors.
X_train, X_test, y_train, y_test = train_test_split(
    pca_topicvectors.values, sms.spam, test_size=0.3, random_state=271828)
lda = LDA(n_components=1)
print(lda.fit(X_train, y_train))
print(lda.score(X_test, y_test).round(3))
# 10-fold cross-validation on the topic vectors.
lda = LDA(n_components=1)
scores = cross_val_score(lda, pca_topicvectors, sms.spam, cv=10)
mean_acc, spread = scores.mean(), scores.std() * 2
print("Accuracy: {:.3f} (+/-{:.3f})".format(mean_acc, spread))

猜你喜欢

转载自blog.csdn.net/fgg1234567890/article/details/112439694
今日推荐