年报文本分析:jieba词频统计

添词、计数

import os
import re
import time
import math
import openpyxl
import fitz
import jieba
import jieba.analyse
from collections import Counter
'''
使用Python操作PDF:常用PDF库总结 - 知乎
https://zhuanlan.zhihu.com/p/352722932
'''
def pdr_reader(file):
    """Return the text of every page of a PDF as one string.

    Args:
        file: Path of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        The ``get_text()`` output of all pages, concatenated in page order
        with no separator inserted between pages.
    """
    doc = fitz.open(file)
    try:
        # A single join avoids re-copying the growing string for every page.
        return "".join(page.get_text() for page in doc)
    finally:
        # Release the document handle even when text extraction raises.
        doc.close()
def keyword_count(content, words):
    """Count how often each keyword occurs among the jieba tokens of a text.

    Args:
        content: Text to segment with ``jieba.lcut``.
        words: Collection of keyword strings to look for.

    Returns:
        A plain dict mapping every keyword that occurs at least once to its
        number of occurrences. Keywords that never occur are absent.
    """
    # Build the lookup set once; testing membership against a list would
    # rescan the whole keyword list for every single token.
    vocabulary = frozenset(words)
    hits = Counter(
        token for token in jieba.lcut(content) if token in vocabulary
    )
    # Callers get a plain dict, exactly as before.
    return dict(hits)
def count_sum(counts):
    """Summarise a keyword-count mapping.

    Args:
        counts: Mapping of keyword -> number of occurrences.

    Returns:
        A two-element list ``[distinct, total]`` where ``distinct`` is the
        number of keys and ``total`` is the sum of all their counts.
    """
    distinct = len(counts)
    total = sum(counts.values())
    return [distinct, total]
# Load the keyword list. Keywords in the text file are delimited by the
# Chinese enumeration comma "、".
file = r"E:\Alark\Desktop\数字化关键词.txt"
# The with-block closes the handle even if reading or decoding fails.
with open(file, 'r', encoding='utf-8') as f:
    keyword_text = f.read()

# Each match is a run of word characters terminated by "、".
# NOTE: a final keyword that is not followed by "、" is not matched.
keyword_pattern = re.compile(r'\w*[\u4e00-\u9fa5]*、', re.S)
words = [
    match.group(0).replace("、", '')
    for match in keyword_pattern.finditer(keyword_text)
]

# Add every keyword to jieba's dictionary before any text is segmented.
for word in words:
    jieba.add_word(word)
#########
# Result workbook: one row per processed report, columns are
# code, year, distinct keywords, total hits, ln(distinct+1), ln(total+1), flag.
WORKBOOK_PATH = r'G:\词频统计.xlsx'
# Root folder of the reports, laid out as <root>\<year>\<category>\*.pdf.
REPORT_ROOT = r'G:\年报'
# (sub-folder name, flag value written to column 7 for that folder).
CATEGORIES = (("制造", 1), ("其他", 0))

mywb = openpyxl.load_workbook(WORKBOOK_PATH)
mysheet = mywb.active

# Resume support: collect the (code, year) pairs already in the sheet and
# remember the last occupied row so new results are appended below it.
processed = set()
row = 0
for cells in mysheet.rows:
    first = cells[0]
    if first.value is not None:
        processed.add((first.value, cells[1].value))
        # Use the real sheet row index rather than a running count, so a
        # blank row in column A can never cause existing rows to be
        # overwritten later on.
        row = first.row

order = 0
for year in range(2015, 2021):
    for category, flag in CATEGORIES:
        folder = os.path.join(REPORT_ROOT, str(year), category)
        for file in os.listdir(folder):
            stem, extension = os.path.splitext(file)
            if extension != '.pdf':
                continue
            order += 1
            print(f"{order}、{file}:processing......")
            t1 = time.monotonic()
            # The first six characters of the file name are used as the code.
            code = stem[:6]
            if (code, year) in processed:
                print(f"{order}、{file}: is already there.")
                continue
            content = pdr_reader(os.path.join(folder, file))
            # The keyword list must be passed explicitly; keyword_count has
            # no default for it.
            counts = keyword_count(content, words)
            count1, count2 = count_sum(counts)
            values = [
                code,
                year,
                count1,
                count2,
                math.log(count1 + 1),
                math.log(count2 + 1),
                flag,
            ]
            row += 1
            processed.add((code, year))
            for column, value in enumerate(values, start=1):
                mysheet.cell(row=row, column=column, value=value)
            t2 = time.monotonic()
            print(f"{order}、{file}:done.", f"{t2 - t1}s")
            # Save after every report so an interrupted run can resume
            # without losing finished work.
            mywb.save(WORKBOOK_PATH)
mywb.save(WORKBOOK_PATH)

特定章节选取

确定成功提取内容是关键
思路:返回最大中文片段==标题

猜你喜欢

转载自blog.csdn.net/qq_37639139/article/details/124840490