# Convert a PDF into a TXT file

#Author:Alex.Zhang
import pyocr

import importlib

import sys

import time

# Reload the already-imported ``sys`` module.
# NOTE(review): looks like a leftover from the Python 2 idiom that preceded
# ``sys.setdefaultencoding`` — confirm it is still needed on Python 3.
importlib.reload( sys )

# Start timestamp (seconds since the epoch); the ``__main__`` section
# subtracts it from the end time to report the total elapsed time.
time1 = time.time()

# print("initial time:", time1)


import os.path

from pdfminer.pdfparser import PDFParser , PDFDocument

from pdfminer.pdfinterp import PDFResourceManager , PDFPageInterpreter

from pdfminer.converter import PDFPageAggregator

from pdfminer.layout import LTTextBoxHorizontal , LAParams

# Exception raised by parse() when the document reports it is not extractable.
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed

# Path of the PDF that parse() converts (relative to the working directory).
text_path = r'Parameters in Cryo-EM.pdf'


# text_path = r'photo-words.pdf ' 


def parse():
    """Parse the text of the PDF at ``text_path`` and save it to a TXT file.

    Every ``LTTextBoxHorizontal`` found on every page is printed to stdout
    and appended, followed by a newline, to ``2.txt`` (UTF-8, append mode,
    so repeated runs accumulate output). The output file is opened once the
    document has passed the extractability check, so it is created even when
    the PDF yields no text boxes.

    Raises:
        PDFTextExtractionNotAllowed: if the document reports that it is not
            extractable.
    """
    # Keep the input open until all pages are processed; ``with`` closes it
    # even if parsing raises.
    with open(text_path, 'rb') as fp:
        # Create a PDF parser from the file object.
        parser = PDFParser(fp)

        # Create the PDF document object and connect it to the parser
        # (both directions are required by this pdfminer API).
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # Initialise the document; no password is passed here (per the
        # original note, an empty password is used when there is none).
        doc.initialize()

        # Refuse to continue when the document does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager shared between the device and the interpreter.
        rsrcmgr = PDFResourceManager()

        # Layout-analysis device that aggregates each page into an LTPage.
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)

        # Interpreter that renders pages onto the device.
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Open the output once instead of reopening it for every text box.
        with open(r'2.txt', 'a', encoding='utf-8') as f:
            for page in doc.get_pages():
                interpreter.process_page(page)

                # LTPage for the page just processed; it holds the parsed
                # layout objects (LTTextBox, LTFigure, LTImage, ...).
                layout = device.get_result()

                for x in layout:
                    # Only horizontal text boxes carry the text we want.
                    if isinstance(x, LTTextBoxHorizontal):
                        results = x.get_text()
                        print(results)
                        f.write(results + "\n")


if __name__ == '__main__':
    # Run the conversion, then report the wall-clock time elapsed since
    # ``time1`` was recorded at module start.
    parse()

    time2 = time.time()

    print("total elapsed time is:", time2 - time1)

 

Guess you like

# Source: www.cnblogs.com/klausage/p/11621814.html