全监督词分类算法

前言

场景
根据关键词来判断一个标题属于什么类型的文章
例如
《小米雷军用苹果被吐槽,官方回应:不用就是不关心产品》标题里出现 小米、苹果,则大概率认为该文章的主题是手机,而此处就是要用算法找出【小米、苹果】这类词及其对应的主题
常用方法
基于统计、监督分类模型(贝叶斯…)、半监督、无监督模型(词向量…)
代码+语料+结果
https://github.com/AryeYellow/NLP/tree/master/classification/word_clf

基于统计

单标签多分类

from collections import Counter
from jieba import cut

def statistics(X, Y, tokenizer=None):
    """Print a markdown table of each word's majority label and its share.

    For every word in the corpus, count the number of texts containing it
    (``amount``), find the most frequent label among those texts
    (``label`` / ``max_freq``) and print the ratio as ``probability``.

    Args:
        X: list of raw text strings.
        Y: list of labels aligned with X (one label per text).
        tokenizer: callable splitting a text into words; defaults to
            jieba's ``cut`` (module-level import) for backward compatibility.
    """
    if tokenizer is None:
        tokenizer = cut  # jieba segmentation, imported at file level
    # Tokenize each text exactly once and keep the token sets, so the
    # per-label count below matches on token boundaries.  The original
    # counted `total` over tokens but `frequency` via raw substring
    # containment (`w in X[i]`), which can inflate probability past 100%
    # when one token is a substring of another text.
    token_sets = [set(tokenizer(x)) for x in X]
    print('|'.join(['word', 'label', 'max_freq', 'amount', 'probability']),
          '|'.join('-' * 5), sep='\n')
    for w, total in Counter(w for ts in token_sets for w in ts).most_common():
        c = Counter(y for ts, y in zip(token_sets, Y) if w in ts)
        label, frequency = c.most_common()[0]
        print(w, label, frequency, total, '%.1f%%' % (frequency / total * 100), sep='|')

# Toy corpus: two phone-related texts and one fruit-related text,
# with one label per text; run the per-word statistics report.
texts = ['小米小米', '苹果和橙', '小米和苹果']
labels = ['phone', 'fruit', 'phone']
statistics(texts, labels)
word label max_freq amount probability
小米 phone 2 2 100.0%
苹果 fruit 1 2 50.0%
和 fruit 1 2 50.0%
橙 fruit 1 1 100.0%

多标签多分类

from collections import Counter
import jieba

def statistics(X, Y):
    """Print a markdown table of per-word, per-label occurrence shares.

    Multi-label variant: each sample may carry several labels.  For each
    word, ``frequency`` is the number of samples containing it, and each
    label column shows what share of those samples carries that label.

    Args:
        X: list of token collections (e.g. sets of segmented words).
        Y: list of label lists aligned with X.
    """
    length = len(Y)
    # Collect the full label set so every label gets its own column.
    labels = sorted(set(i for y in Y for i in y))
    # The separator row needs one cell per column: word + frequency + one
    # per label.  The original hard-coded 5 cells ('-'*5), which is only
    # correct when there are exactly 3 labels.
    print('|'.join(['word', 'frequency', *('【%s】' % i for i in labels)]),
          '|'.join('-' * (2 + len(labels))), sep='\n')
    for w, total in Counter(w for x in X for w in x).most_common():
        # Labels of every sample whose token collection contains w.
        c = Counter(y for i in range(length) if w in X[i] for y in Y[i])
        print(w, total, *('%d%%' % (c[i] / total * 100) for i in labels), sep='|')

# Register the name so jieba keeps it as a single token.
jieba.add_word('刘诗诗')
# Toy corpus; each text carries one or more labels.
texts = ['刘诗诗吃苹果', '苹果手机', '刘诗诗代言小米']
# Pre-tokenize into sets so membership tests run on word boundaries.
X = [set(jieba.cut(text)) for text in texts]
Y = [['entertain', 'fruit'], ['phone'], ['entertain', 'phone']]
statistics(X, Y)
word frequency 【entertain】 【fruit】 【phone】
刘诗诗 2 100.0% 50.0% 50.0%
苹果 2 50.0% 50.0% 50.0%
吃 1 100.0% 100.0% 0.0%
手机 1 0.0% 0.0% 100.0%
代言 1 100.0% 0.0% 100.0%
小米 1 100.0% 0.0% 100.0%

机器学习

from jieba import cut
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
# from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
from collections import Counter

# Load the toy corpus: aligned texts and single labels.
X = ['小米小米', '苹果和橙', '小米和苹果']
Y = ['phone', 'fruit', 'phone']

# Vectorize: TF-IDF over jieba-segmented tokens.
vectorizer = TfidfVectorizer(tokenizer=cut)
x = vectorizer.fit_transform(X)

# Classifier (swap in the commented alternatives to compare).
clf = MultinomialNB()
# clf = LogisticRegression()
# clf = DecisionTreeClassifier()
# clf = RandomForestClassifier()
clf.fit(x, Y)

# Score every corpus word with the trained model.
words = Counter(word for text in X for word in cut(text))
print('word', 'freq', 'label', 'probability', sep=' | ')
print('-|-|-|-')
for word, freq in words.most_common():
    # Vectorize each word once and reuse it for both the hard prediction
    # and the class probabilities (the original transformed twice).
    vec = vectorizer.transform([word])
    pred = clf.predict(vec)[0]
    probability = max(clf.predict_proba(vec)[0])
    print(word, freq, pred, probability, sep=' | ')
word freq label probability
小米 3 phone 0.8140295398557026
苹果 2 phone 0.6383233682898345
和 2 phone 0.6383233682898345
橙 1 phone 0.5025766173291208

补充

阅读扩展:半监督词分类算法

https://blog.csdn.net/Yellow_python/article/details/100940617

全部分析结果比较

from jieba import cut
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from collections import Counter

# Load the toy corpus.
X = ['小米小米', '苹果和橙', '小米和苹果']
Y = ['phone', 'fruit', 'phone']

# Vectorize texts with TF-IDF over jieba tokens.
vectorizer = TfidfVectorizer(tokenizer=cut)
x = vectorizer.fit_transform(X)

# Fit one model of each kind on the same features.
models = [
    MultinomialNB().fit(x, Y),
    LogisticRegression().fit(x, Y),  # on sklearn 0.22+, pass solver='liblinear' for stable results
    DecisionTreeClassifier().fit(x, Y),
    RandomForestClassifier().fit(x, Y),
]

# Word frequencies over the whole corpus.
words = Counter(word for text in X for word in cut(text))

# Predict every word (plus the empty string as a baseline) with each model.
import sklearn; print(sklearn.__version__)  # record the sklearn version for reproducibility
print('word', *[m.__class__.__name__ for m in models], sep=' | ')
print('-|-|-|-|-')
for w, _ in words.most_common() + [('', 0)]:
    # Transform each word once and reuse it across predict/predict_proba
    # for every model (the original re-vectorized on every call, i.e.
    # 2 * len(models) transforms per word).
    vec = vectorizer.transform([w])
    print(w, *[m.predict(vec)[0] + ' {:.2f}%'.format(max(m.predict_proba(vec)[0]) * 100)
               for m in models], sep=' | ')
word MultinomialNB LogisticRegression DecisionTreeClassifier RandomForestClassifier
小米 phone 81.40% phone 67.98% phone 100.00% phone 84.00%
苹果 phone 63.83% phone 54.48% phone 100.00% phone 59.00%
和 phone 63.83% phone 54.48% phone 100.00% phone 70.00%
橙 phone 50.26% fruit 52.98% fruit 100.00% fruit 56.00%
phone 66.67% phone 55.30% phone 100.00% phone 67.00%
上述结果分析
贝叶斯和逻辑回归较稳定,决策树和随机森林结果不稳定;
逻辑回归和随机森林效果较好
决策树输出值只有0和1,非常不稳定
不同版本的逻辑回归不一样,建议设置 solver='liblinear'

猜你喜欢

转载自blog.csdn.net/Yellow_python/article/details/106243002