[Template] Code for computing word importance with TF-IDF

# TF-IDF via jieba
# `txt` is not defined in this snippet; it must be supplied by earlier code.
import jieba
import jieba.analyse as ana
ana.extract_tags(txt, withWeight=True)  # the withWeight parameter controls whether the tf-idf value is shown
# TF-IDF via sklearn
# Meant as input for later modelling rather than for reading by eye.
# The stop-word removal / segmentation helper comes from the earlier templates.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# sklearn expects a list of strings in which the words are separated by spaces.
textlist = [
    " ".join(cut_word_without_stopword(doc))
    for doc in txt
]

# [Key step 1] Turn the text corpus into a term-frequency matrix.
# NOTE(review): CountVectorizer's default token pattern is documented to keep
# only tokens of two or more characters - confirm that dropping
# single-character words is acceptable for this corpus.
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(textlist)
# Inspect the vocabulary learned by the vectorizer.
vectorizer.vocabulary_

# [Key step 2] Compute tf-idf from the term-frequency matrix.
tftrans = TfidfTransformer()
tf_idf = tftrans.fit_transform(x)
# Show tf-idf in array form.
tf_idf_array = tf_idf.toarray()
tf_idf_array
# Show tf-idf in (dense) matrix form.
tf_idf_dense = tf_idf.todense()
# TF-IDF via gensim
# Prepare the corpus as `data_list` (not defined in this snippet).
# NOTE(review): corpora.Dictionary / doc2bow operate on tokenized documents,
# so data_list is presumably a list of token lists, e.g.
# [["word1", "word2"], ["word3", ...]] - confirm against the code that builds it.
from gensim import corpora, models
# Build the dictionary (token -> id mapping) for the corpus.
dic = corpora.Dictionary(data_list)
# Convert every document into a sparse bag-of-words vector.
corpus = [dic.doc2bow(doc) for doc in data_list]
# Fit the tf-idf model on the bag-of-words corpus.
tf_idf_models = models.TfidfModel(corpus)
# Apply the fitted model to the documents of interest; this has to be the
# model bound above (`tf_idf_models`), no other model name exists here.
corpus_tfidf = tf_idf_models[corpus]
# Show the tf-idf vector of one document; `n` is a placeholder index that
# the user must set beforehand.
corpus_tfidf[n]
Published 40 original articles · won praise 0 · Views 1696

Guess you like

Origin blog.csdn.net/weixin_44680262/article/details/104794195