一.TF-IDF相关概念
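1.1词频(TF)
词频表示某个词在一篇文档中出现的频率:$TF_{w,d} = \dfrac{\text{词 } w \text{ 在文档 } d \text{ 中出现的次数}}{\text{文档 } d \text{ 的总词数}}$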
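1.2逆文件频率(IDF)
逆文件频率衡量一个词在整个语料库中的稀有程度:$IDF_{w} = \log\dfrac{\text{语料库的文档总数}}{\text{包含词 } w \text{ 的文档数} + 1}$,分母加 1 是为了避免该词不出现在任何文档时除以零。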
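1.3词频-逆文件频率(TF-IDF)
二者相乘即得到 TF-IDF:$TF\text{-}IDF_{w,d} = TF_{w,d} \times IDF_{w}$。一个词在当前文档中出现得越频繁、在其他文档中出现得越少,其 TF-IDF 值越大。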
二.代码实现
import numpy as np
from collections import Counter

# Toy corpus: each document is already segmented, with tokens separated by
# single spaces, so a plain str.split() is enough to tokenise it.
corpus = [
    '这 是 第一个 文档',
    '这是 第二个 文档',
    '这是 最后 一个 文档',
    '现在 没有 文档 了 文档',
]

# One token list per document.
words_list = [doc.split() for doc in corpus]
print(words_list)

# One Counter (token -> occurrences in that document) per document.
count_list = [Counter(words) for words in words_list]
print(count_list)
print(count_list[0].values())
import math
def tf(word, count):
    """Return the term frequency of ``word`` within a single document.

    Args:
        word: The token to look up.
        count: Mapping of token -> number of occurrences in the document
            (a ``collections.Counter`` in this script).

    Returns:
        Occurrences of ``word`` divided by the total number of tokens in
        the document, or ``0.0`` when the document contains no tokens.
    """
    total = sum(count.values())
    if not total:
        # An empty document has no terms; avoid dividing by zero.
        return 0.0
    return count[word] / total
def idf(word, count_list):
    """Return the smoothed inverse document frequency of ``word``.

    Computes ``log(N / (1 + n))`` where ``N`` is the number of documents and
    ``n`` is the number of documents that contain ``word``. The ``+ 1`` keeps
    the denominator non-zero for words that appear in no document; as a
    consequence the result is negative for a word present in every document.

    Args:
        word: The token to look up.
        count_list: One token -> occurrences mapping per document.

    Returns:
        The natural-log IDF value as a float.

    Raises:
        ValueError: If ``count_list`` is empty (the logarithm of zero is
            undefined).
    """
    total_docs = len(count_list)
    if total_docs == 0:
        raise ValueError('count_list must contain at least one document')
    n_contain = sum(1 for count in count_list if word in count)
    return math.log(total_docs / (1 + n_contain))
def tf_idf(word, count, count_list):
    """Return the TF-IDF score of ``word`` for one document of the corpus.

    Args:
        word: The token to score.
        count: Token -> occurrences mapping of the document being scored.
        count_list: Token -> occurrences mappings of every document in the
            corpus, used for the IDF part.

    Returns:
        ``tf(word, count) * idf(word, count_list)``.
    """
    term_frequency = tf(word, count)
    inverse_doc_frequency = idf(word, count_list)
    return term_frequency * inverse_doc_frequency
# Print, for every document, its words ranked by descending TF-IDF score.
for index, count in enumerate(count_list):
    print('第{}个文档的TF IDF的信息'.format(index + 1))
    scores = {word: tf_idf(word, count, count_list) for word in count}
    # Highest score first.
    sorted_word = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_word:
        print('word:{}, TF IDF :{}'.format(word, round(score, 5)))
from gensim import corpora
from gensim import models

print(words_list)

# Build the token -> integer id dictionary from the tokenised documents.
dic = corpora.Dictionary(words_list)
# Convert every document to its bag-of-words form via the dictionary.
new_corpus = [dic.doc2bow(words) for words in words_list]
print(new_corpus)
print(dic.token2id)

# Fit the TF-IDF model, persist it, then reload it from disk.
tfidf = models.TfidfModel(new_corpus)
tfidf.save('tfidf.model')
# Bind the loaded model; the return value was previously discarded, so the
# save/load round trip had no effect on the code below.
tfidf = models.TfidfModel.load('tfidf.model')

# Apply the model to every raw document of the corpus.
tf_idf_vec = []
for corpu in corpus:
    doc_bow = dic.doc2bow(corpu.lower().split())
    tf_idf_vec.append(tfidf[doc_bow])
print(tf_idf_vec)
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): per the scikit-learn docs the default token_pattern only keeps
# tokens of two or more word characters, so single-character words in this
# corpus ('这', '是', '了') are dropped from the vocabulary. Pass
# token_pattern=r'(?u)\b\w+\b' if they should be kept.
tfidf_vec = TfidfVectorizer()
tf_idf_matrix = tfidf_vec.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on old installations.
try:
    feature_names = tfidf_vec.get_feature_names_out()
except AttributeError:
    feature_names = tfidf_vec.get_feature_names()
# list() keeps the printed form a plain Python list on every version.
print(list(feature_names))
print(tfidf_vec.vocabulary_)
print(tf_idf_matrix.toarray())
三.缺点