Tokenize a text file with jieba, compute each term's TF-IDF weight, and write the terms sorted by weight in descending order.

# coding:utf-8  
import os  
import sys  
import jieba  
from sklearn import feature_extraction  
from sklearn.feature_extraction.text import TfidfTransformer  
from sklearn.feature_extraction.text import CountVectorizer
sys.path.append("../")

def main():
    """Tokenize '../your.txt' with jieba, compute TF-IDF weights for every
    term, and write 'term weight' lines to '../experiments/tfidf_your.txt'
    sorted by weight in descending order.
    """
    # Context manager guarantees the input file is closed even on error.
    with open('../your.txt', 'r') as f:
        data = f.readlines()

    # Segment every line with jieba in precise mode, then join all tokens
    # with spaces so CountVectorizer can re-split them. Collecting tokens
    # in a list and joining once avoids quadratic `+=` concatenation.
    tokens = []
    for text in data:
        tokens.extend(jieba.cut(text, cut_all=False))  # precise mode
    all_txt_cut = [" ".join(tokens) + " "]  # one "document" for the vectorizer

    # CountVectorizer builds the term-frequency matrix: element a[i][j] is
    # the count of term j in document i.
    vectorizer = CountVectorizer()
    # TfidfTransformer converts the raw counts into TF-IDF weights.
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(all_txt_cut))

    # Vocabulary of the bag-of-words model.
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older installations.
    try:
        word = vectorizer.get_feature_names_out()
    except AttributeError:
        word = vectorizer.get_feature_names()
    # weight[i][j] is the TF-IDF weight of term j in document i; we only
    # built a single document, so row 0 holds every weight.
    weight = tfidf.toarray()

    # Pair each term with its weight and sort by weight, largest first.
    words_tfidf = dict(zip(word, weight[0]))
    words_tfidf_order = sorted(words_tfidf.items(), key=lambda x: x[1], reverse=True)

    # Context manager ensures the output file is flushed and closed
    # (the original left it open, risking lost/truncated output).
    with open('../experiments/tfidf_your.txt', 'w') as f2:
        for term, score in words_tfidf_order:
            f2.write(str(term) + " " + str(score) + "\n")

if __name__ == "__main__":
    main()

Source: blog.csdn.net/qq_36663518/article/details/107630891