【Python】输出pdf的内容(写入EXCEL)

1.将pdf文件内容写入txt文件:

利用PDFminer3k模块来抽取PDF内容,包括文本、图像、曲线等:

# -*- coding: utf-8 -*-
import sys
import importlib
importlib.reload(sys)

from pdfminer.pdfparser import PDFParser,PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import *
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed

'''
解析pdf文件,获取文件中包含的各种对象
'''

# Parse a PDF file: dump its text and count its layout objects.
def parse(pdf_path):
    """Append the text of a PDF to ``test.txt`` and print object counts.

    Every page is laid out with pdfminer. The text of each horizontal text
    box found directly in a page layout is echoed to stdout and appended to
    ``test.txt`` in the current working directory. A summary of how many
    pages, images, curves and horizontal text boxes were seen is printed at
    the end.

    Args:
        pdf_path: Path of the PDF file to read.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    """
    # The context manager closes the PDF even when parsing fails part-way;
    # the file stays open until every page has been processed.
    with open(pdf_path, 'rb') as fp:
        # Create a parser on the file and an empty document, then link them.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # Initialise the document; called without a password here.
        doc.initialize()

        # Refuse to continue when the document does not allow extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout-analysing device and page interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Counters for pages, images, curves, figures and horizontal text boxes.
        num_page, num_image, num_curve, num_figure, num_TextBoxHorizontal = 0, 0, 0, 0, 0

        # Open the output once rather than once per text box. The encoding is
        # explicit so the result does not depend on the platform default,
        # which may be unable to represent some characters.
        with open(r'test.txt', 'a', encoding='utf-8') as f:
            for page in doc.get_pages():
                num_page += 1
                interpreter.process_page(page)
                # Layout (LTPage) of the page that was just processed.
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTImage):
                        num_image += 1
                    if isinstance(x, LTCurve):
                        num_curve += 1
                    if isinstance(x, LTFigure):
                        num_figure += 1
                    if isinstance(x, LTTextBoxHorizontal):
                        num_TextBoxHorizontal += 1
                        results = x.get_text()
                        print(results, end='')
                        f.write(results)

        # NOTE: num_figure is counted above but is not part of this report.
        print('对象数量:\n','页面数:%s\n'%num_page,'图片数:%s\n'%num_image,'曲线数:%s\n'%num_curve,'水平文本框:%s\n'
              %num_TextBoxHorizontal)


if __name__ == '__main__':
    # Sample input on the author's machine; point this at the PDF to convert.
    parse(r'D:\python tests\ZQfd_paiming\pdf\12.pdf')

2.利用pdf2htmlEX工具,将pdf转化为html文件,分析源码,根据格式提取出需要的内容:

下载pdf2htmlEX,将需要处理的pdf转化为html:

(1)先处理前两页(转化后的文件为13.html):

# -*- coding: UTF-8 -*-
# -*- coding: gbk -*-
from bs4 import BeautifulSoup
import re

# HTML produced by pdf2htmlEX from the first two pages of the PDF.
# The context manager makes sure the file handle is closed after reading.
with open('C:/Users/wdh/Desktop/pdf2htmlEX-win32-0.14.6-upx-with-poppler-data/13.html', 'rb') as file:
    html = file.read()  # raw bytes, handed to BeautifulSoup as-is
div_bf = BeautifulSoup(html, 'lxml')
div = div_bf.find_all('div', class_='c x0 y0 w2 h2')  # one element per page

# Opening tags of the cells that hold each column of a record.
pattern_id = re.compile(r'<div class="t m0 x8 h3 y.*? ff1 fs0 fc1 sc1 ls0 ws0">')  # student number
pattern_name = re.compile(r'<div class="t m0 x9 h3 y.*? ff1 fs0 fc1 sc1 ls0 ws0">')  # name
pattern_grade = re.compile(r'<div class="t m0 xb h3 y.*? ff1 fs0 fc1 sc1 ls0 ws0">')  # scores

# Layout-only markup that may sit between the name and the scores. Compiled
# once here instead of once per record.
pattern_useless1 = re.compile(r'<span class="_ _[0-9a-f]"></span>')
pattern_useless2 = re.compile(r'<span class="_ _[0-9a-f]"> </span>')
pattern_useless3 = re.compile(r'</div><div class="t m0 x.*? h3 y.*? ff1 fs0 fc1 sc1 ls0 ws0">')

# Parallel per-page lists: X[i][j] is field X of record j on page i.
ID = []
NAME = []
GRADE1 = []  # preliminary exam score
GRADE2 = []  # re-examination score
GRADE3 = []  # total score
COLLEGE = []
page_num = len(div)

for i in range(page_num):
    ID.append([])
    NAME.append([])
    GRADE1.append([])
    GRADE2.append([])
    GRADE3.append([])
    COLLEGE.append([])

    str_each = str(div[i])  # markup of the current page

    per_id = pattern_id.findall(str_each)
    per_name = pattern_name.findall(str_each)
    per_grade = pattern_grade.findall(str_each)
    per_num = len(per_id)  # number of records on this page
    for j in range(per_num):
        # The student number is the five characters right after the opening
        # tag. Using the matched tag's own length (instead of a fixed offset)
        # keeps this correct however many digits the "y" class has.
        start_id = str_each.find(per_id[j]) + len(per_id[j])
        student_id = str_each[start_id:start_id + 5]
        ID[i].append(student_id)

        # The name runs from the end of its opening tag to the next '<'.
        start_name = str_each.find(per_name[j]) + len(per_name[j])
        end_name = str_each.find('<', start_name)
        name = str_each[start_name:end_name]
        NAME[i].append(name.replace(' ', ''))  # drop spaces inside the name

        # start_grade is the index of the '>' that closes the score cell's
        # opening tag; the three scores sit at fixed offsets behind it.
        start_grade = str_each.find(per_grade[j]) + len(per_grade[j]) - 1
        grade1 = str_each[start_grade + 1:start_grade + 4]
        grade2 = str_each[start_grade + 31:start_grade + 36]
        grade3 = str_each[start_grade + 63:start_grade + 68]
        GRADE1[i].append(grade1)
        GRADE2[i].append(grade2)
        GRADE3[i].append(grade3)

        # The college/major text lies between the name and the scores,
        # sometimes in the name's div and sometimes in a div of its own.
        # Strip the layout-only markup; every match is looked up in the
        # untouched slice first and then removed in order.
        college_info = str_each[end_name:start_grade + 1]
        useless = (pattern_useless1.findall(college_info)
                   + pattern_useless2.findall(college_info)
                   + pattern_useless3.findall(college_info))
        for each in useless:
            college_info = college_info.replace(each, '')
        COLLEGE[i].append(college_info)

        # Print one table row, padding name and college with ideographic
        # spaces (U+3000) so that CJK columns line up. A negative repeat
        # count yields an empty string, i.e. no padding for over-long values.
        name_pad = '\u3000' * (10 - len(NAME[i][j]))
        college_pad = '\u3000' * (30 - len(COLLEGE[i][j]))
        print('|%-10s|%s' % (ID[i][j], NAME[i][j]), end='')
        print(name_pad, end='')
        print('|%s' % COLLEGE[i][j], end='')
        print(college_pad, end='')
        print('|%-7s|%-7s|%-7s|' % (GRADE1[i][j], GRADE2[i][j], GRADE3[i][j]))

(2)再处理全部168页(转化后的文件为zong1.html),各页结构不完全相同,使用的方法也有很大不同:

效果截图:

# -*- coding: UTF-8 -*-
# -*- coding: gbk -*-
from bs4 import BeautifulSoup
import re
def _content_start(text, search_from, span_marker, div_markers):
    """Return the index where the text of the nearest following cell begins.

    Searches ``text`` from ``search_from`` for ``span_marker`` (a complete
    opening ``<span ...>`` tag) and for every prefix in ``div_markers`` (the
    beginning of an opening ``<div ...`` tag). Whichever is found first wins:
    for the span the content starts right after the marker, for a div it
    starts right after the ``>`` that closes the tag.

    If none of the markers is found the returned index is meaningless.
    """
    span_at = text.find(span_marker, search_from)
    div_hits = [pos for pos in (text.find(marker, search_from) for marker in div_markers)
                if pos != -1]
    div_at = min(div_hits) if div_hits else -1
    if span_at != -1 and (div_at == -1 or span_at < div_at):
        return span_at + len(span_marker)
    return text.find('>', div_at) + 1


# HTML produced by pdf2htmlEX from the complete PDF.
# The context manager makes sure the file handle is closed after reading.
with open('C:/Users/wdh/Desktop/pdf2htmlEX-win32-0.14.6-upx-with-poppler-data/zong1.html', 'rb') as file:
    html = file.read()  # raw bytes, handed to BeautifulSoup as-is
div_bf = BeautifulSoup(html, 'lxml')
div = div_bf.find_all('div', class_='pf w0 h0')  # one element per page

# Opening tag of the cell that starts each record (the student number).
pattern_id = re.compile(r'<div class="t m0 x4 h3 y[0-9a-f]+ ff2 fs0 fc0 sc1 ls1 ws0">')
# Layout markup that is stripped from the college/major text.
pattern_college1 = re.compile(r'<div class=".*?"')
pattern_college2 = re.compile(r'<span class="_ _[0-9a-f]+">')

# Parallel per-page lists: X[i][j] is field X of record j on page i.
ID = []
NAME = []
GRADE1 = []   # preliminary exam score
GRADE2 = []   # re-examination score
GRADE3 = []   # total score
COLLEGE = []  # college / major
REMARKS = []  # remarks; extraction is not implemented, each page stays empty

page_num = len(div)

for i in range(page_num):
    ID.append([])
    NAME.append([])
    GRADE1.append([])
    GRADE2.append([])
    GRADE3.append([])
    COLLEGE.append([])
    REMARKS.append([])

    str_each = str(div[i])  # markup of the current page

    per_id = pattern_id.findall(str_each)
    per_college1 = pattern_college1.findall(str_each)
    per_college2 = pattern_college2.findall(str_each)
    per_num = len(per_id)  # number of records on this page

    for j in range(per_num):
        # The student number is the five characters right after the opening
        # tag. Using the matched tag's own length (instead of a fixed offset)
        # keeps this correct however many digits the "y" class has.
        start_id = str_each.find(per_id[j]) + len(per_id[j])
        student_id = str_each[start_id:start_id + 5]
        ID[i].append(student_id)

        # Only the student number is laid out regularly. The name either
        # follows it in a <span class="ff1"> or sits in an x4 div of its own.
        start_name = _content_start(
            str_each, start_id, '<span class="ff1">', ['<div class="t m0 x4 h3 y'])
        end_name = str_each.find('</', start_name)
        name = str_each[start_name:end_name]
        # Drop spaces and stray '>' and widen the middle dot used in some names.
        NAME[i].append(name.replace(' ', '').replace('>', '').replace('·', '\u3000'))

        # The scores either follow the college text in a <span class="ff2">
        # or sit in an xb / x10 div; the nearest of the three is used, also
        # when only the x10 variant is present.
        start_grade = _content_start(
            str_each, start_id, '<span class="ff2">',
            ['<div class="t m0 xb h3 y', '<div class="t m0 x10 h3 y'])

        GRADE1[i].append(str_each[start_grade:start_grade + 3].replace('>', ''))

        # The other two scores sit at fixed offsets when present. Slicing
        # (rather than indexing) avoids an IndexError at the end of the page.
        if str_each[start_grade + 30:start_grade + 31].isdigit():
            GRADE2[i].append(str_each[start_grade + 30:start_grade + 35])
            GRADE3[i].append(str_each[start_grade + 62:start_grade + 67])
        else:
            GRADE2[i].append('-')
            GRADE3[i].append('-')

        # The college/major text lies between the name and the scores;
        # remove the layout markup found on this page from it.
        college = str_each[end_name:start_grade]
        for each in per_college1:
            college = college.replace(each, '')
        for each in per_college2:
            college = college.replace(each, '')
        # NOTE(review): as written, the last two replacements map each
        # parenthesis to itself; presumably full-width parentheses were
        # meant - confirm against the original script.
        college = college.replace('<span class="ff2">','').replace('</span>','').replace('</div>','').replace('>','').replace(' ','').replace('(','(').replace(')',')')
        COLLEGE[i].append(college)

        # Print one table row, padding name and college with ideographic
        # spaces (U+3000) so that CJK columns line up. A negative repeat
        # count yields an empty string, i.e. no padding for over-long values.
        name_pad = '\u3000' * (12 - len(NAME[i][j]))
        college_pad = '\u3000' * (30 - len(COLLEGE[i][j]))
        print('|%-10s|%s' % (ID[i][j], NAME[i][j]), end='')
        print(name_pad, end='')
        print('|%s' % COLLEGE[i][j], end='')
        print(college_pad, end='')
        print('|%-7s|%-7s|%-7s|' % (GRADE1[i][j], GRADE2[i][j], GRADE3[i][j]))

3.在上述代码的基础上添加几行,使用xlwt将字典写入excel中:

效果截图:

# coding=utf-8

from xlwt import *

# 需要xlwt库的支持
# -*- coding: UTF-8 -*-
# -*- coding: gbk -*-
from bs4 import BeautifulSoup
import re
def _content_start(text, search_from, span_marker, div_markers):
    """Return the index where the text of the nearest following cell begins.

    Searches ``text`` from ``search_from`` for ``span_marker`` (a complete
    opening ``<span ...>`` tag) and for every prefix in ``div_markers`` (the
    beginning of an opening ``<div ...`` tag). Whichever is found first wins:
    for the span the content starts right after the marker, for a div it
    starts right after the ``>`` that closes the tag.

    If none of the markers is found the returned index is meaningless.
    """
    span_at = text.find(span_marker, search_from)
    div_hits = [pos for pos in (text.find(marker, search_from) for marker in div_markers)
                if pos != -1]
    div_at = min(div_hits) if div_hits else -1
    if span_at != -1 and (div_at == -1 or span_at < div_at):
        return span_at + len(span_marker)
    return text.find('>', div_at) + 1


# HTML produced by pdf2htmlEX from the complete PDF.
# The context manager makes sure the file handle is closed after reading.
with open('C:/Users/wdh/Desktop/pdf2htmlEX-win32-0.14.6-upx-with-poppler-data/zong1.html', 'rb') as file:
    html = file.read()  # raw bytes, handed to BeautifulSoup as-is
div_bf = BeautifulSoup(html, 'lxml')
div = div_bf.find_all('div', class_='pf w0 h0')  # one element per page

# Opening tag of the cell that starts each record (the student number).
pattern_id = re.compile(r'<div class="t m0 x4 h3 y[0-9a-f]+ ff2 fs0 fc0 sc1 ls1 ws0">')
# Layout markup that is stripped from the college/major text.
pattern_college1 = re.compile(r'<div class=".*?"')
pattern_college2 = re.compile(r'<span class="_ _[0-9a-f]+">')

# Parallel per-page lists: X[i][j] is field X of record j on page i.
ID = []
NAME = []
GRADE1 = []   # preliminary exam score
GRADE2 = []   # re-examination score
GRADE3 = []   # total score
COLLEGE = []  # college / major
REMARKS = []  # remarks; extraction is not implemented, each page stays empty

page_num = len(div)

# Workbook created with UTF-8 as its string encoding, holding one sheet
# named 'data'.
file_excel = Workbook(encoding='utf-8')
table = file_excel.add_sheet('data')
# Rows to export, keyed by student number; the first entry is the header row.
# NOTE(review): records sharing the same five-digit number overwrite each
# other here - confirm the numbers are unique across all pages.
data = {'考生编号 (后五位)':['姓名','拟录取院系/专业','初试成绩','复试成绩','总成绩']}

for i in range(page_num):
    ID.append([])
    NAME.append([])
    GRADE1.append([])
    GRADE2.append([])
    GRADE3.append([])
    COLLEGE.append([])
    REMARKS.append([])

    str_each = str(div[i])  # markup of the current page

    per_id = pattern_id.findall(str_each)
    per_college1 = pattern_college1.findall(str_each)
    per_college2 = pattern_college2.findall(str_each)
    per_num = len(per_id)  # number of records on this page

    for j in range(per_num):
        # The student number is the five characters right after the opening
        # tag. Using the matched tag's own length (instead of a fixed offset)
        # keeps this correct however many digits the "y" class has.
        start_id = str_each.find(per_id[j]) + len(per_id[j])
        student_id = str_each[start_id:start_id + 5]
        ID[i].append(student_id)

        # Only the student number is laid out regularly. The name either
        # follows it in a <span class="ff1"> or sits in an x4 div of its own.
        start_name = _content_start(
            str_each, start_id, '<span class="ff1">', ['<div class="t m0 x4 h3 y'])
        end_name = str_each.find('</', start_name)
        name = str_each[start_name:end_name]
        # Drop spaces and stray '>' and widen the middle dot used in some names.
        NAME[i].append(name.replace(' ', '').replace('>', '').replace('·', '\u3000'))

        # The scores either follow the college text in a <span class="ff2">
        # or sit in an xb / x10 div; the nearest of the three is used, also
        # when only the x10 variant is present.
        start_grade = _content_start(
            str_each, start_id, '<span class="ff2">',
            ['<div class="t m0 xb h3 y', '<div class="t m0 x10 h3 y'])

        GRADE1[i].append(str_each[start_grade:start_grade + 3].replace('>', ''))

        # The other two scores sit at fixed offsets when present. Slicing
        # (rather than indexing) avoids an IndexError at the end of the page.
        if str_each[start_grade + 30:start_grade + 31].isdigit():
            GRADE2[i].append(str_each[start_grade + 30:start_grade + 35])
            GRADE3[i].append(str_each[start_grade + 62:start_grade + 67])
        else:
            GRADE2[i].append('-')
            GRADE3[i].append('-')

        # The college/major text lies between the name and the scores;
        # remove the layout markup found on this page from it.
        college = str_each[end_name:start_grade]
        for each in per_college1:
            college = college.replace(each, '')
        for each in per_college2:
            college = college.replace(each, '')
        # NOTE(review): as written, the last two replacements map each
        # parenthesis to itself; presumably full-width parentheses were
        # meant - confirm against the original script.
        college = college.replace('<span class="ff2">','').replace('</span>','').replace('</div>','').replace('>','').replace(' ','').replace('(','(').replace(')',')')
        COLLEGE[i].append(college)

        # Remaining columns of this record, in header order.
        target = [NAME[i][j], COLLEGE[i][j], GRADE1[i][j], GRADE2[i][j], GRADE3[i][j]]
        data[ID[i][j]] = target

# One spreadsheet row per dict entry: the key followed by its values.
ldata = [[key] + values for key, values in data.items()]

for i, p in enumerate(ldata):
    # i is the row index, j the column index of each cell.
    for j, q in enumerate(p):
        table.write(i, j, q)
# xlwt produces the legacy binary .xls format, so the file must carry the
# .xls extension; named .xlsx, Excel would reject it as corrupt.
file_excel.save('data.xls')

猜你喜欢

转载自blog.csdn.net/li_jiaqian/article/details/80312103