原理
TF(Term Frequency):词频
IDF(Inverse Document Frequency):逆文本频率指数
jieba(免训练)
from jieba.analyse import tfidf

sentence = '佛山市科技局发布关于发展佛山市人工智能项目的通知'

# Default keyword extraction: top words ranked by TF-IDF.
print(tfidf(sentence))
# Restrict results to selected POS tags (nouns, place names, verbs, verbal nouns).
print(tfidf(sentence, allowPOS=('n', 'ns', 'v', 'vn')))
# Additionally return each word's POS tag.
print(tfidf(sentence, allowPOS=('n', 'ns', 'v', 'vn'), withFlag=True))
# Additionally return each word's TF-IDF weight.
print(tfidf(sentence, withWeight=True))
['佛山市', '科技局', '人工智能', '通知', '发布', '关于', '项目', '发展'] ['佛山市', '科技局', '人工智能', '通知', '发布', '项目', '发展'] [pair('佛山市', 'ns'), pair('科技局', 'n'), pair('人工智能', 'n'), pair('通知', 'v'), pair('发布', 'v'), pair('项目', 'n'), pair('发展', 'vn')] [('佛山市', 2.2638012411777777), ('科技局', 1.3454353536333334), ('人工智能', 1.0508918217211112), ('通知', 0.6714233436266667), ('发布', 0.5657954481322222), ('关于', 0.5532763439699999), ('项目', 0.5425367102355555), ('发展', 0.39722939449333333)]
Python手写
from collections import Counter
from math import log10
from re import split
from jieba.posseg import dt
FLAGS = set('a an b f i j l n nr nrfg nrt ns nt nz s t v vi vn z eng'.split())
def cut(text):
    """Tokenize *text*, yielding words longer than two characters whose POS tag is in FLAGS.

    The text is first split on any run of characters that is neither a
    Latin letter, a digit, nor a CJK ideograph; each fragment is then
    segmented with jieba's POS tokenizer.
    """
    fragments = split('[^a-zA-Z0-9\u4e00-\u9fa5]+', text.strip())
    for fragment in fragments:
        tokens = dt.cut(fragment)
        yield from (t.word for t in tokens if len(t.word) > 2 and t.flag in FLAGS)
class TFIDF:
    """Keyword extractor: a word's score is its frequency in the text times its global IDF."""

    def __init__(self, idf, idf_max=0.):
        # idf: mapping word -> inverse document frequency.
        # idf_max: fallback score for out-of-vocabulary words; when falsy,
        # the largest value present in *idf* is used instead.
        self.idf = idf
        self.idf_max = idf_max or max(idf.values())

    @classmethod
    def train(cls, texts):
        """Build an IDF table from an iterable of raw documents."""
        tokenized = [set(cut(doc)) for doc in texts]
        n_docs = len(tokenized)
        vocabulary = {word for doc in tokenized for word in doc}
        # Document frequency with add-one smoothing in the denominator.
        idf = {
            word: log10(n_docs / (sum(word in doc for doc in tokenized) + 1))
            for word in vocabulary
        }
        return cls(idf, log10(n_docs))

    def get_idf(self, word):
        # Unknown words receive the maximal IDF (treated as maximally informative).
        return self.idf.get(word, self.idf_max)

    def extract(self, text, top_n=10):
        """Return the *top_n* highest-scoring words of *text*, best first."""
        scores = Counter()
        for word in cut(text):
            scores[word] += self.get_idf(word)
        return [word for word, _ in scores.most_common(top_n)]
sklearn
from re import split
from jieba.posseg import dt
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
FLAGS = set('a an b f i j l n nr nrfg nrt ns nt nz s t v vi vn z eng'.split())
def cut(text):
    """Yield POS-filtered tokens (length > 2, flag in FLAGS) from *text*.

    Splits on anything that is not a letter, digit, or CJK ideograph,
    then runs jieba POS segmentation on each piece.
    """
    for chunk in split('[^a-zA-Z0-9\u4e00-\u9fa5]+', text.strip()):
        for token in dt.cut(chunk):
            word, flag = token.word, token.flag
            if flag in FLAGS and len(word) > 2:
                yield word
class TFIDF:
    """Keyword extractor whose IDF table is fitted with scikit-learn's TfidfVectorizer."""

    def __init__(self, idf):
        # idf: mapping word -> inverse document frequency.
        self.idf = idf
        # Cache the out-of-vocabulary fallback once: the original recomputed
        # max(self.idf.values()) on every miss, which is O(|vocabulary|) per
        # lookup and raised ValueError when the table was empty.
        self.idf_max = max(idf.values(), default=0.0)

    @classmethod
    def train(cls, texts):
        """Fit an IDF table over *texts* using the shared `cut` tokenizer."""
        model = TfidfVectorizer(tokenizer=cut)
        model.fit(texts)
        # vocabulary_ maps word -> column index; idf_ is indexed by column.
        idf = {w: model.idf_[i] for w, i in model.vocabulary_.items()}
        return cls(idf)

    def get_idf(self, word):
        # Unknown words receive the maximal IDF seen during training.
        return self.idf.get(word, self.idf_max)

    def extract(self, text, top_n=10):
        """Return the *top_n* words of *text* ranked by summed IDF, best first."""
        counter = Counter()
        for w in cut(text):
            counter[w] += self.get_idf(w)
        return [i[0] for i in counter.most_common(top_n)]
gensim
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from re import split
from jieba.posseg import dt
FLAGS = set('a an b f i j l n nr nrfg nrt ns nt nz s t v vi vn z eng'.split())
def lcut(text):
    """Return the list of POS-filtered tokens (length > 2, flag in FLAGS) in *text*."""
    words = []
    # Split on non-word runs (anything that is not a letter, digit, or CJK char).
    for piece in split('[^a-zA-Z0-9\u4e00-\u9fa5]+', text.strip()):
        for tok in dt.cut(piece):
            if len(tok.word) > 2 and tok.flag in FLAGS:
                words.append(tok.word)
    return words
class TFIDF:
    """Keyword extractor backed by a gensim TfidfModel."""

    def __init__(self, dictionary, model):
        # model: a fitted gensim TfidfModel.
        # dictionary: the gensim Dictionary the model's corpus was built from.
        self.model = model
        self.doc2bow = dictionary.doc2bow
        # Invert token2id so scored ids can be mapped back to words.
        self.id2word = {idx: word for word, idx in dictionary.token2id.items()}

    @classmethod
    def train(cls, texts):
        """Build a Dictionary and TfidfModel over the tokenized *texts*."""
        docs = [lcut(t) for t in texts]
        dictionary = Dictionary(docs)
        bows = [dictionary.doc2bow(doc) for doc in docs]
        return cls(dictionary, TfidfModel(bows))

    def extract(self, text, top_n=10):
        """Return the *top_n* words of *text* by TF-IDF weight, best first."""
        bow = self.doc2bow(lcut(text))
        ranked = sorted(self.model[bow], key=lambda item: item[1], reverse=True)
        return [self.id2word[idx] for idx, _ in ranked][:top_n]
手写、sklearn、gensim结果比较
from time import time

start = time()
# Documents in policy.txt are separated by blank lines.
with open('policy.txt', encoding='utf-8') as corpus_file:
    _texts = corpus_file.read().strip().split('\n\n')
tfidf = TFIDF.train(_texts)
for _text in _texts:
    print(tfidf.extract(_text))
# Total wall-clock time for training plus extraction over the corpus.
print(time() - start)
- 手写
-
['装配式', '商品房', '财政补贴', '实施细则', '长沙市', '高新区', '建筑面积', '通讯员', '肖日凡', '内五区']
['养老院', '服务质量', '民政局', '民政部门', '质监局', '公安局', '管理系统', '公安部', '医疗卫生', '社会福利']
['老年人', '优待证', '百岁老人', '县市区', '人口普查', '登记表', '身份证', '通报批评', '工作日内', '符合条件者']
['高科技', '现代农业', '特色产业', '产业化', '长沙市', '科技局', '报上来', '苏卫群', '顺利完成', '长沙县']
['电容量', '开关站', '需求预测', 'MVA', '福建省', '变电站', '输变电', '变电所', 'KVA', '建设项目']
65.98366403579712
- sklearn
-
['装配式', '商品房', '财政补贴', '实施细则', '长沙市', '高新区', '建筑面积', '通讯员', '肖日凡', '内五区']
['养老院', '服务质量', '民政局', '民政部门', '质监局', '长沙市', '公安局', '实施方案', '管理系统', '违规行为']
['老年人', '优待证', '百岁老人', '县市区', '人口普查', '登记表', '身份证', '工作日内', '通报批评', '责任人']
['高科技', '现代农业', '特色产业', '产业化', '长沙市', '科技局', '报上来', '苏卫群', '顺利完成', '长沙县']
['电容量', '开关站', '需求预测', 'MVA', '福建省', '变电站', '输变电', '变电所', '建设项目', 'KVA']
90.46931791305542
- gensim
-
['装配式', '商品房', '财政补贴', '实施细则', '长沙市', '高新区', '建筑面积', '内五区', '减免税', '哪些项目']
['养老院', '服务质量', '民政局', '民政部门', '质监局', '公安局', '管理系统', '公安部', '社会福利', '医疗卫生']
['优待证', '老年人', '百岁老人', '县市区', '人口普查', '登记表', '身份证', '通报批评', '工作日内', '主办人']
['高科技', '现代农业', '特色产业', '产业化', '长沙市', '报上来', '苏卫群', '科技局', '顺利完成', '长沙县']
['电容量', '开关站', 'MVA', '需求预测', '福建省', '变电所', '变电站', '输变电', 'KVA', '装机容量']
65.78198957443237