Batch processing PDF files with pdfminer

import os

from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTTextLineHorizontal, LTFigure, LTRect, LTLine, LTCurve
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFParser, PDFDocument


class PdfForString:
    """Print the text of every PDF listed in a fixed folder.

    The instance holds one pdfminer resource manager, one layout
    aggregator device and one page interpreter, and reuses them for
    every document it processes.
    """

    def __init__(self):
        """List the PDF folder and build the shared pdfminer objects."""
        # Names of all entries in the PDF folder.
        self.pdf_list = os.listdir(r'E:\StockExchange\PDF')
        # Resource manager that stores shared document resources.
        self.src = PDFResourceManager()
        # Device object that aggregates each processed page into a layout.
        self.device = PDFPageAggregator(self.src, laparams=LAParams())
        # Interpreter object that feeds page content to the device.
        self.inter = PDFPageInterpreter(self.src, self.device)

    def for_string(self):
        """Yield a full path for each name in ``self.pdf_list``.

        Paths are built under the ``PDF`` folder located two directory
        levels above this file.
        """
        # NOTE(review): the names come from E:\StockExchange\PDF while the
        # paths are built relative to this script; the two only agree when
        # the script lives one level below E:\StockExchange -- confirm.
        pdf_dir = os.path.dirname(os.path.dirname(__file__)) + '/PDF'
        for pdf in self.pdf_list:
            yield os.path.join(pdf_dir, pdf)

    def pdf_analysis(self):
        """Yield the result of ``document.get_pages()`` for each PDF path.

        Each file stays open only while the caller works with the value
        just yielded; it is closed as soon as the generator is advanced or
        closed. Consume the pages of one document before requesting the
        next, since the pages may be read from the open file on demand.
        """
        for path in self.for_string():
            # The context manager releases the handle that the previous
            # bare open() call leaked for every processed file.
            with open(path, 'rb') as pd_file:
                # Parser object bound to the opened PDF file.
                parser = PDFParser(pd_file)
                # Document object; parser and document reference each other.
                document = PDFDocument()
                parser.set_document(document)
                document.set_parser(parser)
                # NOTE(review): pdfminer3k examples also call
                # document.initialize() at this point; it is not called
                # here -- confirm whether it is needed (e.g. for
                # password-protected files).
                yield document.get_pages()

    def get_string(self):
        """Print the text of every horizontal text box on every page."""
        for pages in self.pdf_analysis():
            for page in pages:
                self.inter.process_page(page)
                # Layout produced by the aggregator for the page just
                # processed.
                layout = self.device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        print(str(x.get_text()))


# Script entry: build the extractor, then print the text of every PDF.
extractor = PdfForString()
extractor.get_string()

  

Guess you like

Origin www.cnblogs.com/wangtaobiu/p/11947133.html
Recommended