#Author:Alex.Zhang
"""Extract the text of a PDF with pdfminer and append it to a TXT file."""
import pyocr
import importlib
import sys
import time

# NOTE(review): reloading ``sys`` looks like a Python 2 leftover -- confirm
# whether it can be dropped.
importlib.reload(sys)

# Taken before the pdfminer imports below, so the elapsed time reported at
# the end of the run also covers them.
time1 = time.time()
# print("start time:", time1)

import os.path
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed

# PDF file read by parse().
text_path = r'Parameters in Cryo-EM.pdf'
# text_path = r'photo-words.pdf'


def parse():
    """Parse the PDF at ``text_path`` and append its text to ``2.txt``.

    Every horizontal text box found on every page is printed to stdout and
    appended, followed by a newline, to ``2.txt`` (UTF-8, append mode).

    Raises:
        PDFTextExtractionNotAllowed: if the document reports that it is not
            extractable.
    """
    # The parser reads from the file object while pages are processed, so
    # the PDF must stay open for the whole extraction.
    with open(text_path, 'rb') as fp:
        # Create a parser from the file object and an empty document, then
        # connect the two to each other.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # Initialise the document; no password is supplied.
        doc.initialize()

        # Refuse documents that do not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager shared by the device and the interpreter.
        rsrcmgr = PDFResourceManager()
        # Aggregating device that yields one layout object per page.
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Open the output once for the whole run instead of once per text
        # box; append mode keeps whatever earlier runs already wrote.
        with open(r'2.txt', 'a', encoding='UTF-8') as f:
            for page in doc.get_pages():
                interpreter.process_page(page)
                # Layout of the page that was just processed.
                layout = device.get_result()
                # The layout holds several kinds of objects; only the
                # horizontal text boxes are written out.
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        results = x.get_text()
                        print(results)
                        f.write(results + "\n")


if __name__ == '__main__':
    parse()
    time2 = time.time()
    print("total elapsed time is:", time2 - time1)