Pdfminer读取PDF文件内容保存到本地TXT

from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFParser, PDFDocument
import pandas as pd

def Pdf_to_txt(fp):
    """Extract the text boxes of every PDF page into a tab-separated file.

    For each page, the first horizontal text box is taken as the page title.
    Every following horizontal text box is dealt in order into eight columns
    (``Lin1`` .. ``Lin8``), wrapping back to ``Lin1`` after ``Lin8``.  Columns
    are truncated to the length of ``Lin8`` so that only complete rows are
    written.  The result is saved as
    ``tool/pdf解析/<title>_page<page number>.txt`` (page numbers start at 1).

    Pages without any horizontal text box are skipped, since they provide
    neither a title nor data.

    Args:
        fp: File object for the PDF, opened in binary mode.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not permit text
            extraction.
    """
    import os  # local import: only needed to create the output directory

    column_names = ['Lin1', 'Lin2', 'Lin3', 'Lin4',
                    'Lin5', 'Lin6', 'Lin7', 'Lin8']
    output_dir = 'tool/pdf解析'

    # Create the parser and the document, then link them to each other.
    parser_pdf = PDFParser(fp)
    document = PDFDocument(parser_pdf)
    parser_pdf.set_document(document)
    document.set_parser(parser_pdf)
    # NOTE(review): some pdfminer releases require document.initialize()
    # before is_extractable can be read - confirm against the installed version.

    # Refuse to continue when the document forbids text extraction.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Resource manager, layout parameters, aggregating device and page
    # interpreter are shared by all pages.
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # to_csv does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)

    for page_number, page in enumerate(document.get_pages(), start=1):
        interpreter.process_page(page)
        # The layout holds the objects parsed from this page; only
        # horizontal text boxes are used below.
        layout = device.get_result()

        title = None
        columns = [[] for _ in column_names]
        cell_count = 0
        for item in layout:
            if not isinstance(item, LTTextBoxHorizontal):
                continue
            text = item.get_text().strip('\n')
            print("results: " + text)
            if title is None:
                # The first text box on the page names the output file.
                title = text
                continue
            # Deal the remaining boxes round-robin: Lin1, ..., Lin8, Lin1, ...
            columns[cell_count % len(columns)].append(text)
            cell_count += 1

        if title is None:
            # No text on this page: nothing to name the file or to write.
            continue

        # The last column is the shortest or tied, so its length is the
        # number of complete rows.
        row_count = len(columns[-1])
        data = {name: cells[:row_count]
                for name, cells in zip(column_names, columns)}
        df = pd.DataFrame(data, columns=column_names)
        file_name = title + '_page' + str(page_number)
        df.to_csv('%s/%s.txt' % (output_dir, file_name), index=False, sep='\t')

if __name__ == '__main__':
    # Example input path; adjust it to a PDF that exists on your machine.
    filename = 'E:/tempFile/Monthly_new_MA_listing_April_2017.pdf'
    # Open in binary mode; the with-block closes the handle even if
    # parsing raises.
    with open(filename, 'rb') as fp:
        Pdf_to_txt(fp)

猜你喜欢

转载自blog.csdn.net/luzaofa/article/details/80526822
今日推荐