from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFParser, PDFDocument
import pandas as pd
def Pdf_to_txt(fp):
    """Parse a PDF and dump each page's text boxes as an 8-column TSV file.

    Each page is laid out with pdfminer and the text of every horizontal
    text box is collected in layout order.  The first text box of a page is
    used as the page title; the remaining boxes are dealt round-robin into
    eight columns (``Lin1`` .. ``Lin8``).  Columns are truncated to the
    number of complete rows (the length of ``Lin8``) and written,
    tab-separated and without an index, to
    ``tool/pdf解析/<title>_page<N>.txt`` where N is the 1-based page number.
    The output directory is not created here.

    Pages that contain no horizontal text box are skipped, because there is
    no title to build a file name from.

    Args:
        fp: Binary file object for the PDF to parse.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    """
    # Create the PDF parser and the document object, then link them to each
    # other in both directions.
    praser_pdf = PDFParser(fp)
    document = PDFDocument(praser_pdf)
    praser_pdf.set_document(document)
    document.set_parser(praser_pdf)

    # NOTE(review): the legacy pdfminer API usually expects
    # document.initialize() to be called before is_extractable is read --
    # confirm against the installed pdfminer version.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Resource manager (shared resources), layout-analysis parameters,
    # aggregator device and page interpreter.
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    column_names = ['Lin1', 'Lin2', 'Lin3', 'Lin4',
                    'Lin5', 'Lin6', 'Lin7', 'Lin8']

    # Process one page at a time; the index is needed for the output file
    # name (it was previously referenced as an undefined ``i``).
    for i, page in enumerate(document.get_pages()):
        interpreter.process_page(page)
        # The layout is an LTPage holding the objects parsed from this page
        # (text boxes, figures, images, ...).
        layout = device.get_result()

        title = []
        columns = [[] for _ in column_names]
        num = 0  # 0 -> title slot, 1..8 -> column Lin1..Lin8
        for lin in layout:
            # Only text boxes carry text; figures and images are ignored.
            if not isinstance(lin, LTTextBoxHorizontal):
                continue
            results = lin.get_text().strip('\n')
            print("results: " + results)
            if num == 0:
                title.append(results)
            else:
                columns[num - 1].append(results)
            # After the eighth column wrap back to the first column, not to
            # the title slot: only the first text box of a page is a title.
            num = num + 1 if num < 8 else 1

        if not title:
            # No text on this page, hence no title to name the file after.
            continue

        # Keep only complete rows: Lin8 is the last column to be filled.
        Lin_num = len(columns[-1])
        data = {
            col_name: values[:Lin_num]
            for col_name, values in zip(column_names, columns)
        }
        df = pd.DataFrame(data, columns=column_names)
        file_name = title[0] + '_page' + str(i + 1)
        df.to_csv('tool/pdf解析/%s.txt' % file_name, index=False, sep='\t')
# Script entry point: convert the sample PDF when the file is run directly.
if __name__ == '__main__':
    filename = 'E:/tempFile/Monthly_new_MA_listing_April_2017.pdf'
    # Open in binary mode and make sure the handle is closed afterwards.
    with open(filename, 'rb') as fp:
        Pdf_to_txt(fp)