Python实现基于TF-IDF抽取文本数据关键词

结束了一整天的工作，来写一点东西记录一下吧。今天主要就是实现一下TF-IDF算法，这个相信对接触过文本数据处理的人来说简直就是小儿科，所以原理什么的我就不多赘述了，这里直接看代码：

#!/usr/bin/env python
#encoding:utf-8
from __future__ import division

'''
__Author__:沂水寒城
功能：  TF-IDF算法

TF-IDF实际上是：TF * IDF
                TF：词频(Term Frequency)
                IDF：逆文档频率(Inverse Document Frequency)
'''


import os
import re
import sys
import xlrd
import math
import json
import jieba
import sklearn
import numpy as np
import jieba.analyse


# Python 2 only: make UTF-8 the implicit codec for str <-> unicode conversion.
# ``sys.version_info`` is a tuple, so the major version has to be read from
# index 0; comparing the whole tuple with ``2`` is always False, which meant
# this branch never executed.
if sys.version_info[0]==2:
    reload(sys)  # restores ``setdefaultencoding``, which site.py removes at startup
    sys.setdefaultencoding( "utf-8" )



def dataPrepossing(one_line):
    """Strip punctuation and other noise symbols from a piece of text.

    Every symbol in the table below is deleted wherever it occurs. The
    symbols are removed one after another, in table order.

    Args:
        one_line: The text to clean.

    Returns:
        The text with all listed symbols removed.
    """
    noise_symbols=('，','。','（','）','-','——','\n','“','”','*','#',
                   '《','》','、','[',']','(',')','-','.','/','】','【')
    cleaned=one_line
    for symbol in noise_symbols:
        # Splitting on the symbol and gluing the pieces back together
        # deletes every occurrence of it.
        cleaned=''.join(cleaned.split(symbol))
    return cleaned


def seg(one_content, stopwords=None):
    """Tokenise text with jieba and return its distinct non-stopword tokens.

    Args:
        one_content: Text to segment; passed to ``jieba.cut`` unchanged.
        stopwords: Optional iterable of tokens to discard. ``None`` (the
            default) or an empty iterable filters nothing.

    Returns:
        A list containing each remaining token exactly once. The list is
        built from a set, so duplicates are dropped and the order is
        arbitrary.
    """
    # ``unicode`` only exists on Python 2; on Python 3 ``str`` is the text
    # type, and referencing ``unicode`` directly would raise NameError.
    try:
        text_type = unicode
    except NameError:
        text_type = str
    tokens = set(text_type(w) for w in jieba.cut(one_content, cut_all=False))
    if stopwords:
        tokens -= set(stopwords)
    return list(tokens)


def cutData2Words(dataDir='data/',save_path='cut_words.json'):
    """Segment every file in a directory and save the tokens as JSON.

    Each regular file directly inside ``dataDir`` is read as UTF-8 text and
    tokenised with ``seg``. The result is a JSON object mapping the file
    name (the part before its first ``.``) to that file's token list.
    Files that share the same prefix before the first ``.`` overwrite each
    other in the result.

    Args:
        dataDir: Directory holding the raw text files. A trailing path
            separator is not required.
        save_path: Path of the JSON file to write.

    Raises:
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    result={}
    for one_txt in os.listdir(dataDir):
        # os.path.join works whether or not dataDir ends with a separator
        one_txt_path=os.path.join(dataDir,one_txt)
        if not os.path.isfile(one_txt_path):
            # sub-directories cannot be opened as text; skip them
            continue
        one_name=one_txt.split('.')[0].strip()
        # Read bytes and decode explicitly so the result does not depend on
        # the interpreter version or the platform's default encoding.
        with open(one_txt_path,'rb') as f:
            one_data=f.read().decode('utf-8')
        result[one_name]=seg(one_data)
    with open(save_path,'w') as f:
        f.write(json.dumps(result))


def calWordsIdf(data='cut_words.json',save_path='word_idf.json'):
    """Compute an IDF value for every word of the corpus and save it as JSON.

    The input JSON maps a document name to that document's word list. For
    every word, the number of times it occurs across all lists is counted
    (one per document when each list holds a word at most once), and the
    IDF is ``log(number_of_documents / count)``.

    Args:
        data: Path of the JSON file holding the per-document word lists.
        save_path: Path of the JSON file that receives the word -> IDF map.
    """
    with open(data) as src:
        corpus=json.load(src)
    documents=list(corpus.values())

    # Count occurrences of each word over all documents
    doc_freq={}
    for tokens in documents:
        for token in tokens:
            doc_freq[token]=doc_freq.get(token,0)+1

    print('='*60)

    total_docs=len(documents)
    idf_by_word=dict(
        (token,math.log(total_docs*1.0/count))
        for token,count in doc_freq.items()
    )
    with open(save_path,'w') as dst:
        dst.write(json.dumps(idf_by_word))


def calTxtKeyWords(data='data/1.txt',topK=30,idf_path='word_idf.json'):
    """Rank the words of one text file by TF-IDF and return the best ones.

    TF is the raw number of times a word occurs in the file. The file is
    tokenised directly with ``jieba.cut`` rather than ``seg``, because
    ``seg`` returns each word only once, which would make every TF equal
    to 1 and reduce the score to the bare IDF.

    Args:
        data: Path of the UTF-8 text file to analyse.
        topK: Maximum number of (word, score) pairs to return.
        idf_path: Path of the JSON file mapping words to IDF values.

    Returns:
        A list of ``(word, tfidf)`` tuples sorted by score, highest first,
        at most ``topK`` long. Words with no IDF entry are left out.

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(idf_path) as f:
        idf_dict=json.load(f)
    # Read bytes and decode explicitly so the result does not depend on the
    # interpreter version or the platform's default encoding.
    with open(data,'rb') as f:
        content=f.read().decode('utf-8')

    # Term frequency: count every occurrence, duplicates included
    tf_dict={}
    for one_word in jieba.cut(content, cut_all=False):
        tf_dict[one_word]=tf_dict.get(one_word,0)+1

    tfidf_dict={}
    for one_word in tf_dict:
        # A word absent from the IDF table cannot be scored; skip it
        # explicitly instead of hiding every possible error.
        if one_word in idf_dict:
            tfidf_dict[one_word]=tf_dict[one_word]*idf_dict[one_word]

    sorted_list=sorted(tfidf_dict.items(),key=lambda e:e[1], reverse=True)
    # Show the five best-scoring words on stdout
    for one in sorted_list[:5]:
        print(one)
    return sorted_list[:topK]


if __name__ == '__main__':
    print('Loading.............................................')

    # Step 1: segment every file of the data set into words
    cutData2Words('data/','cut_words.json')

    # Step 2: compute the inverse document frequency of every word
    calWordsIdf('cut_words.json','word_idf.json')

    # Step 3: extract the keywords of a single document
    # NOTE(review): the original comment named 1.txt while the call uses
    # data/01.txt - confirm which file name is intended.
    calTxtKeyWords('data/01.txt',30)

测试所用的数据内容如下所示：

由于限制原因未能上传

分词结果如下所示：

由于限制原因未能上传

IDF值计算如下所示：

由于限制原因未能上传

由于限制原因未能上传自己使用的语料数据，感兴趣的话可以自己找一篇文章试试，语料集我用的很小，这里最好用一个比较大的语料数据集来计算IDF值，这样最终计算得到的TF-IDF值可靠性较高。

Together_CZ 博客专家

发布了532 篇原创文章 · 获赞 1297 · 访问量 334万+

他的留言板关注

Python实现基于TF-IDF抽取文本数据关键词

猜你喜欢