全监督词分类算法

前言

场景
根据关键词来判断一个标题属于什么类型的文章
例如
《小米雷军用苹果被吐槽,官方回应:不用就是不关心产品》标题里出现 小米、苹果,则大概率认为该文章的主题是手机,而此处就是要用算法找出【小米、苹果】这类词及其对应的主题
常用方法
基于统计、监督分类模型(贝叶斯…)、半监督、无监督模型(词向量…)
代码+语料+结果
https://github.com/AryeYellow/NLP/tree/master/classification/word_clf

基于统计

单标签多分类

from collections import Counter
from jieba import cut

def statistics(X, Y, tokenizer=None):
    """Print a markdown table of each word's majority label and its share.

    For every word in the corpus, count the number of texts containing it
    (``amount``), find the most frequent label among those texts
    (``label`` / ``max_freq``) and print the ratio as ``probability``.

    Args:
        X: list of raw text strings.
        Y: list of labels aligned with X (one label per text).
        tokenizer: callable splitting a text into words; defaults to
            jieba's ``cut`` (module-level import) for backward compatibility.
    """
    if tokenizer is None:
        tokenizer = cut  # jieba segmentation, imported at file level
    # Tokenize each text exactly once and keep the token sets, so the
    # per-label count below matches on token boundaries.  The original
    # counted `total` over tokens but `frequency` via raw substring
    # containment (`w in X[i]`), which can inflate probability past 100%
    # when one token is a substring of another text.
    token_sets = [set(tokenizer(x)) for x in X]
    print('|'.join(['word', 'label', 'max_freq', 'amount', 'probability']),
          '|'.join('-' * 5), sep='\n')
    for w, total in Counter(w for ts in token_sets for w in ts).most_common():
        c = Counter(y for ts, y in zip(token_sets, Y) if w in ts)
        label, frequency = c.most_common()[0]
        print(w, label, frequency, total, '%.1f%%' % (frequency / total * 100), sep='|')

# Toy corpus: two phone-related texts and one fruit-related text,
# with one label per text; run the per-word statistics report.
texts = ['小米小米', '苹果和橙', '小米和苹果']
labels = ['phone', 'fruit', 'phone']
statistics(texts, labels)
word label max_freq amount probability
小米 phone 2 2 100.0%
苹果 fruit 1 2 50.0%
和 fruit 1 2 50.0%
橙 fruit 1 1 100.0%

多标签多分类

from collections import Counter
import jieba

def statistics(X, Y):
    """Print a markdown table of per-word, per-label occurrence shares.

    Multi-label variant: each sample may carry several labels.  For each
    word, ``frequency`` is the number of samples containing it, and each
    label column shows what share of those samples carries that label.

    Args:
        X: list of token collections (e.g. sets of segmented words).
        Y: list of label lists aligned with X.
    """
    length = len(Y)
    # Collect the full label set so every label gets its own column.
    labels = sorted(set(i for y in Y for i in y))
    # The separator row needs one cell per column: word + frequency + one
    # per label.  The original hard-coded 5 cells ('-'*5), which is only
    # correct when there are exactly 3 labels.
    print('|'.join(['word', 'frequency', *('【%s】' % i for i in labels)]),
          '|'.join('-' * (2 + len(labels))), sep='\n')
    for w, total in Counter(w for x in X for w in x).most_common():
        # Labels of every sample whose token collection contains w.
        c = Counter(y for i in range(length) if w in X[i] for y in Y[i])
        print(w, total, *('%d%%' % (c[i] / total * 100) for i in labels), sep='|')

# Register the name so jieba keeps it as a single token.
jieba.add_word('刘诗诗')
# Toy corpus; each text carries one or more labels.
texts = ['刘诗诗吃苹果', '苹果手机', '刘诗诗代言小米']
# Pre-tokenize into sets so membership tests run on word boundaries.
X = [set(jieba.cut(text)) for text in texts]
Y = [['entertain', 'fruit'], ['phone'], ['entertain', 'phone']]
statistics(X, Y)
word frequency 【entertain】 【fruit】 【phone】
刘诗诗 2 100.0% 50.0% 50.0%
苹果 2 50.0% 50.0% 50.0%
吃 1 100.0% 100.0% 0.0%
手机 1 0.0% 0.0% 100.0%
代言 1 100.0% 0.0% 100.0%
小米 1 100.0% 0.0% 100.0%

机器学习

from jieba import cut
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
# from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
from collections import Counter

# Load the toy corpus: aligned texts and single labels.
X = ['小米小米', '苹果和橙', '小米和苹果']
Y = ['phone', 'fruit', 'phone']

# Vectorize: TF-IDF over jieba-segmented tokens.
vectorizer = TfidfVectorizer(tokenizer=cut)
x = vectorizer.fit_transform(X)

# Classifier (swap in the commented alternatives to compare).
clf = MultinomialNB()
# clf = LogisticRegression()
# clf = DecisionTreeClassifier()
# clf = RandomForestClassifier()
clf.fit(x, Y)

# Score every corpus word with the trained model.
words = Counter(word for text in X for word in cut(text))
print('word', 'freq', 'label', 'probability', sep=' | ')
print('-|-|-|-')
for word, freq in words.most_common():
    # Vectorize each word once and reuse it for both the hard prediction
    # and the class probabilities (the original transformed twice).
    vec = vectorizer.transform([word])
    pred = clf.predict(vec)[0]
    probability = max(clf.predict_proba(vec)[0])
    print(word, freq, pred, probability, sep=' | ')
word freq label probability
小米 3 phone 0.8140295398557026
苹果 2 phone 0.6383233682898345
和 2 phone 0.6383233682898345
橙 1 phone 0.5025766173291208

补充

阅读扩展:半监督词分类算法

https://blog.csdn.net/Yellow_python/article/details/100940617

全部分析结果比较

from jieba import cut
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from collections import Counter

# Load the toy corpus.
X = ['小米小米', '苹果和橙', '小米和苹果']
Y = ['phone', 'fruit', 'phone']

# Vectorize texts with TF-IDF over jieba tokens.
vectorizer = TfidfVectorizer(tokenizer=cut)
x = vectorizer.fit_transform(X)

# Fit one model of each kind on the same features.
models = [
    MultinomialNB().fit(x, Y),
    LogisticRegression().fit(x, Y),  # on sklearn 0.22+, pass solver='liblinear' for stable results
    DecisionTreeClassifier().fit(x, Y),
    RandomForestClassifier().fit(x, Y),
]

# Word frequencies over the whole corpus.
words = Counter(word for text in X for word in cut(text))

# Predict every word (plus the empty string as a baseline) with each model.
import sklearn; print(sklearn.__version__)  # record the sklearn version for reproducibility
print('word', *[m.__class__.__name__ for m in models], sep=' | ')
print('-|-|-|-|-')
for w, _ in words.most_common() + [('', 0)]:
    # Transform each word once and reuse it across predict/predict_proba
    # for every model (the original re-vectorized on every call, i.e.
    # 2 * len(models) transforms per word).
    vec = vectorizer.transform([w])
    print(w, *[m.predict(vec)[0] + ' {:.2f}%'.format(max(m.predict_proba(vec)[0]) * 100)
               for m in models], sep=' | ')
word MultinomialNB LogisticRegression DecisionTreeClassifier RandomForestClassifier
小米 phone 81.40% phone 67.98% phone 100.00% phone 84.00%
苹果 phone 63.83% phone 54.48% phone 100.00% phone 59.00%
和 phone 63.83% phone 54.48% phone 100.00% phone 70.00%
橙 phone 50.26% fruit 52.98% fruit 100.00% fruit 56.00%
phone 66.67% phone 55.30% phone 100.00% phone 67.00%
上述结果分析
贝叶斯和逻辑回归较稳定,决策树和随机森林结果不稳定;
逻辑回归和随机森林效果较好
决策树输出值只有0和1,非常不稳定
不同版本的逻辑回归不一样,建议设置 solver='liblinear'

猜你喜欢

转载自blog.csdn.net/Yellow_python/article/details/106243002