Word frequency statistics for a Word document in Python

How do you count word frequencies in a Word document? First, use the docx module to convert the Word document to a txt file; then use the jieba module to segment the text into words and count how often each word occurs. Isn't that simple?

# March 10, 2020
#Elizabeth
from docx import Document
import jieba #分词模块

# Helper function: write the text of a Word document to a txt file
def to_txt(path, txt_path='/Users/fangluping/Desktop/数据分析笔试试题/词频统计.txt'):
    """Write the paragraph text of a Word document to a UTF-8 text file.

    Args:
        path: Path of the .docx file to read.
        txt_path: Path of the text file to create or overwrite. Defaults to
            the location the script has always written to.

    Returns:
        The file object that was written to, already closed. It is returned
        only for backward compatibility; use the file on disk instead.
    """
    document = Document(path)
    # Explicit UTF-8 so the file can be read back with encoding='utf-8'
    # regardless of the platform's default encoding.
    with open(txt_path, 'w', encoding='utf-8') as txt:
        for paragraph in document.paragraphs:
            # The newline keeps the last word of one paragraph from fusing
            # with the first word of the next when the text is segmented.
            txt.write(paragraph.text + '\n')
    return txt

if __name__ == '__main__':
    from collections import Counter

    path0 = '/Users/fangluping/Desktop/数据分析笔试试题/笔试题目-V1.0.docx'
    to_txt(path0)  # export the document text to the txt file

    # Read back the file that to_txt() writes by default. The script used to
    # read '/Users/fangluping/Desktop/词频统计.txt', which is a different
    # directory from the one to_txt() writes to.
    txt_path = '/Users/fangluping/Desktop/数据分析笔试试题/词频统计.txt'
    with open(txt_path, 'r', encoding='utf-8') as f:
        txt = f.read()

    # Segment the text into words and count them, skipping single-character
    # tokens (punctuation, whitespace, one-character words).
    words = jieba.lcut(txt)
    counts = Counter(word for word in words if len(word) != 1)

    # most_common(10) returns at most 10 entries, so documents with fewer
    # than 10 distinct words no longer raise IndexError.
    for word, count in counts.most_common(10):
        print("{0:<10}{1:>5}".format(word, count))

Guess you like

Origin blog.51cto.com/14534896/2477002