文本分类(2)-基于传统机器学习方法进行文本分类

传统机器学习的文本分类通常提取TFIDF或者词袋特征,然后给模型进行训练,传统的机器学习的分类模型有很多,比如逻辑回归、支持向量机、多层感知机、贝叶斯等等。利用传统机器学习方法进行文本分类的基本思路:获取数据、数据预处理(上一篇博客已经讲过了https://blog.csdn.net/weixin_44766179/article/details/89855100)、特征提取、模型训练、预测。
下面利用传统机器学习方法实现垃圾邮件分类任务。

import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
import warnings

# Silence library warnings to keep the script output readable.
warnings.filterwarnings('ignore')

# SMS spam dataset: column v1 holds the label, column v2 the raw message text.
data_file = './spam.csv'

# latin1 decoding avoids UnicodeDecodeError on the non-UTF-8 bytes in this file.
df = pd.read_csv(data_file, encoding='latin1')

labels = df['v1']
texts = df['v2']

def clear_data(text):
    """Normalize a sequence of raw English messages for bag-of-words features.

    Per message:
      1. lower-case and expand common contractions (plain substring replace),
      2. replace every non-alphabetic character with a space,
      3. tokenize, drop English stopwords, Porter-stem each remaining token.

    Parameters
    ----------
    text : iterable of str
        Raw messages.

    Returns
    -------
    list of str
        One cleaned, space-joined string per input message.
    """
    # Ordered contraction expansions, applied in the same order as the
    # original chained .replace() calls.  NOTE: the original mapped
    # "i've" -> " i have" (stray leading space); normalized here — the later
    # whitespace collapse makes the two versions produce identical output.
    contractions = [
        ("it's", "it is"), ("i'm", "i am"), ("he's", "he is"),
        ("she's", "she is"), ("we're", "we are"), ("they're", "they are"),
        ("you're", "you are"), ("that's", "that is"), ("this's", "this is"),
        ("can't", "can not"), ("don't", "do not"), ("doesn't", "does not"),
        ("we've", "we have"), ("i've", "i have"), ("isn't", "is not"),
        ("won't", "will not"), ("hasn't", "has not"), ("wasn't", "was not"),
        ("weren't", "were not"), ("let's", "let us"),
    ]
    non_letters = re.compile(r"[^a-zA-Z]")  # compiled once, reused per message
    stem_porter = PorterStemmer()           # word-form normalization
    # set: O(1) membership test per token (the original kept a list -> O(n))
    stop_words = set(stopwords.words("english"))

    cleaned = []
    for item in text:
        item = item.lower()
        for pattern, expansion in contractions:
            item = item.replace(pattern, expansion)
        # Strip punctuation/digits, then collapse whitespace runs.
        item = ' '.join(non_letters.sub(" ", item).split())
        words_token = word_tokenize(item)   # tokenize
        words = [stem_porter.stem(w) for w in words_token if w not in stop_words]
        cleaned.append(' '.join(words))

    return cleaned

# Normalize the raw messages (lowercase, expand contractions, strip, stem).
texts = clear_data(texts)

# Encode the string class labels into integer ids (presumably 'ham'/'spam'
# for this dataset — TODO confirm against spam.csv).
le = LabelEncoder()
labels = le.fit_transform(labels)
# TF-IDF feature extraction
def features_extraction(text):
    """Fit a TfidfVectorizer on *text* and return dense TF-IDF features.

    Parameters
    ----------
    text : iterable of str
        Cleaned documents, one string per sample.

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_features)
        Dense TF-IDF matrix.
    """
    vector = TfidfVectorizer()
    # .toarray() yields a plain ndarray; the original .todense() returns the
    # deprecated numpy.matrix type, which newer scikit-learn versions reject.
    return vector.fit_transform(text).toarray()

# Build the TF-IDF design matrix for the cleaned messages.
features = features_extraction(texts)

# 80/20 train/test split; fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=2)
# Logistic regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression().fit(x_train, y_train)

y_pred = lr.predict(x_test)

print('accuracy_lr : ', accuracy_score(y_test, y_pred))  # output: 0.9524663677130045
# Support vector machine (linear kernel)
from sklearn.svm import SVC

svc = SVC(kernel='linear').fit(x_train, y_train)

y_pred = svc.predict(x_test)

print('accuracy_svm: ', accuracy_score(y_test, y_pred))  # output: 0.9739910313901345
# Multi-layer perceptron, two hidden layers of 100 units each.
# NOTE(review): no random_state is set, so this score may vary between runs.
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(100, 100,)).fit(x_train, y_train)

y_pred = mlp.predict(x_test)

print('accuracy_mlp: ', accuracy_score(y_test, y_pred))  #   output: 0.9748878923766816
# Multinomial naive Bayes (suited to term-frequency-style features)
from sklearn.naive_bayes import MultinomialNB

mb = MultinomialNB().fit(x_train, y_train)

y_pred = mb.predict(x_test)

print('accuracy_mb: ', accuracy_score(y_test, y_pred))  # output: 0.9623318385650225
发布了51 篇原创文章 · 获赞 74 · 访问量 24万+

猜你喜欢

转载自blog.csdn.net/weixin_44766179/article/details/90019284