原理
TF(Term Frequency):词频
IDF(Inverse Document Frequency):逆文本频率指数
jieba(免训练)
from jieba.analyse import tfidf

sentence = '佛山市科技局发布关于发展佛山市人工智能项目的通知'

# Default keyword extraction: top words ranked by TF-IDF.
print(tfidf(sentence))
# Restrict results to selected POS tags (nouns, place names, verbs, verbal nouns).
print(tfidf(sentence, allowPOS=('n', 'ns', 'v', 'vn')))
# Additionally return each word's POS tag.
print(tfidf(sentence, allowPOS=('n', 'ns', 'v', 'vn'), withFlag=True))
# Additionally return each word's TF-IDF weight.
print(tfidf(sentence, withWeight=True))
['佛山市', '科技局', '人工智能', '通知', '发布', '关于', '项目', '发展'] ['佛山市', '科技局', '人工智能', '通知', '发布', '项目', '发展'] [pair('佛山市', 'ns'), pair('科技局', 'n'), pair('人工智能', 'n'), pair('通知', 'v'), pair('发布', 'v'), pair('项目', 'n'), pair('发展', 'vn')] [('佛山市', 2.2638012411777777), ('科技局', 1.3454353536333334), ('人工智能', 1.0508918217211112), ('通知', 0.6714233436266667), ('发布', 0.5657954481322222), ('关于', 0.5532763439699999), ('项目', 0.5425367102355555), ('发展', 0.39722939449333333)]
Python手写
from collections import Counter
from math import log10
from re import split
from jieba.posseg import dt
FLAGS = set('a an b f i j l n nr nrfg nrt ns nt nz s t v vi vn z eng'.split())
def cut(text):
    """Tokenize *text*, yielding words longer than two characters whose POS tag is in FLAGS.

    The text is first split on any run of characters that is neither a
    Latin letter, a digit, nor a CJK ideograph; each fragment is then
    segmented with jieba's POS tokenizer.
    """
    fragments = split('[^a-zA-Z0-9\u4e00-\u9fa5]+', text.strip())
    for fragment in fragments:
        tokens = dt.cut(fragment)
        yield from (t.word for t in tokens if len(t.word) > 2 and t.flag in FLAGS)
class TFIDF:
    """Keyword extractor: a word's score is its frequency in the text times its global IDF."""

    def __init__(self, idf, idf_max=0.):
        # idf: mapping word -> inverse document frequency.
        # idf_max: fallback score for out-of-vocabulary words; when falsy,
        # the largest value present in *idf* is used instead.
        self.idf = idf
        self.idf_max = idf_max or max(idf.values())

    @classmethod
    def train(cls, texts):
        """Build an IDF table from an iterable of raw documents."""
        tokenized = [set(cut(doc)) for doc in texts]
        n_docs = len(tokenized)
        vocabulary = {word for doc in tokenized for word in doc}
        # Document frequency with add-one smoothing in the denominator.
        idf = {
            word: log10(n_docs / (sum(word in doc for doc in tokenized) + 1))
            for word in vocabulary
        }
        return cls(idf, log10(n_docs))

    def get_idf(self, word):
        # Unknown words receive the maximal IDF (treated as maximally informative).
        return self.idf.get(word, self.idf_max)

    def extract(self, text, top_n=10):
        """Return the *top_n* highest-scoring words of *text*, best first."""
        scores = Counter()
        for word in cut(text):
            scores[word] += self.get_idf(word)
        return [word for word, _ in scores.most_common(top_n)]
sklearn
from re import split
from jieba.posseg import dt
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
FLAGS = set('a an b f i j l n nr nrfg nrt ns nt nz s t v vi vn z eng'.split())
def cut(text):
    """Yield POS-filtered tokens (length > 2, flag in FLAGS) from *text*.

    Splits on anything that is not a letter, digit, or CJK ideograph,
    then runs jieba POS segmentation on each piece.
    """
    for chunk in split('[^a-zA-Z0-9\u4e00-\u9fa5]+', text.strip()):
        for token in dt.cut(chunk):
            word, flag = token.word, token.flag
            if flag in FLAGS and len(word) > 2:
                yield word
class TFIDF:
    """Keyword extractor whose IDF table is fitted with scikit-learn's TfidfVectorizer."""

    def __init__(self, idf):
        # idf: mapping word -> inverse document frequency.
        self.idf = idf
        # Cache the out-of-vocabulary fallback once: the original recomputed
        # max(self.idf.values()) on every miss, which is O(|vocabulary|) per
        # lookup and raised ValueError when the table was empty.
        self.idf_max = max(idf.values(), default=0.0)

    @classmethod
    def train(cls, texts):
        """Fit an IDF table over *texts* using the shared `cut` tokenizer."""
        model = TfidfVectorizer(tokenizer=cut)
        model.fit(texts)
        # vocabulary_ maps word -> column index; idf_ is indexed by column.
        idf = {w: model.idf_[i] for w, i in model.vocabulary_.items()}
        return cls(idf)

    def get_idf(self, word):
        # Unknown words receive the maximal IDF seen during training.
        return self.idf.get(word, self.idf_max)

    def extract(self, text, top_n=10):
        """Return the *top_n* words of *text* ranked by summed IDF, best first."""
        counter = Counter()
        for w in cut(text):
            counter[w] += self.get_idf(w)
        return [i[0] for i in counter.most_common(top_n)]
gensim
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from re import split
from jieba.posseg import dt
FLAGS = set('a an b f i j l n nr nrfg nrt ns nt nz s t v vi vn z eng'.split())
def lcut(text):
    """Return the list of POS-filtered tokens (length > 2, flag in FLAGS) in *text*."""
    words = []
    # Split on non-word runs (anything that is not a letter, digit, or CJK char).
    for piece in split('[^a-zA-Z0-9\u4e00-\u9fa5]+', text.strip()):
        for tok in dt.cut(piece):
            if len(tok.word) > 2 and tok.flag in FLAGS:
                words.append(tok.word)
    return words
class TFIDF:
    """Keyword extractor backed by a gensim TfidfModel."""

    def __init__(self, dictionary, model):
        # model: a fitted gensim TfidfModel.
        # dictionary: the gensim Dictionary the model's corpus was built from.
        self.model = model
        self.doc2bow = dictionary.doc2bow
        # Invert token2id so scored ids can be mapped back to words.
        self.id2word = {idx: word for word, idx in dictionary.token2id.items()}

    @classmethod
    def train(cls, texts):
        """Build a Dictionary and TfidfModel over the tokenized *texts*."""
        docs = [lcut(t) for t in texts]
        dictionary = Dictionary(docs)
        bows = [dictionary.doc2bow(doc) for doc in docs]
        return cls(dictionary, TfidfModel(bows))

    def extract(self, text, top_n=10):
        """Return the *top_n* words of *text* by TF-IDF weight, best first."""
        bow = self.doc2bow(lcut(text))
        ranked = sorted(self.model[bow], key=lambda item: item[1], reverse=True)
        return [self.id2word[idx] for idx, _ in ranked][:top_n]
手写、sklearn、gensim结果比较
from time import time

start = time()
# Documents in policy.txt are separated by blank lines.
with open('policy.txt', encoding='utf-8') as corpus_file:
    _texts = corpus_file.read().strip().split('\n\n')
tfidf = TFIDF.train(_texts)
for _text in _texts:
    print(tfidf.extract(_text))
# Total wall-clock time for training plus extraction over the corpus.
print(time() - start)
- 手写
-
['装配式', '商品房', '财政补贴', '实施细则', '长沙市', '高新区', '建筑面积', '通讯员', '肖日凡', '内五区']
['养老院', '服务质量', '民政局', '民政部门', '质监局', '公安局', '管理系统', '公安部', '医疗卫生', '社会福利']
['老年人', '优待证', '百岁老人', '县市区', '人口普查', '登记表', '身份证', '通报批评', '工作日内', '符合条件者']
['高科技', '现代农业', '特色产业', '产业化', '长沙市', '科技局', '报上来', '苏卫群', '顺利完成', '长沙县']
['电容量', '开关站', '需求预测', 'MVA', '福建省', '变电站', '输变电', '变电所', 'KVA', '建设项目']
65.98366403579712
- sklearn
-
['装配式', '商品房', '财政补贴', '实施细则', '长沙市', '高新区', '建筑面积', '通讯员', '肖日凡', '内五区']
['养老院', '服务质量', '民政局', '民政部门', '质监局', '长沙市', '公安局', '实施方案', '管理系统', '违规行为']
['老年人', '优待证', '百岁老人', '县市区', '人口普查', '登记表', '身份证', '工作日内', '通报批评', '责任人']
['高科技', '现代农业', '特色产业', '产业化', '长沙市', '科技局', '报上来', '苏卫群', '顺利完成', '长沙县']
['电容量', '开关站', '需求预测', 'MVA', '福建省', '变电站', '输变电', '变电所', '建设项目', 'KVA']
90.46931791305542
- gensim
-
['装配式', '商品房', '财政补贴', '实施细则', '长沙市', '高新区', '建筑面积', '内五区', '减免税', '哪些项目']
['养老院', '服务质量', '民政局', '民政部门', '质监局', '公安局', '管理系统', '公安部', '社会福利', '医疗卫生']
['优待证', '老年人', '百岁老人', '县市区', '人口普查', '登记表', '身份证', '通报批评', '工作日内', '主办人']
['高科技', '现代农业', '特色产业', '产业化', '长沙市', '报上来', '苏卫群', '科技局', '顺利完成', '长沙县']
['电容量', '开关站', 'MVA', '需求预测', '福建省', '变电所', '变电站', '输变电', 'KVA', '装机容量']
65.78198957443237