Python: reading the content of a PDF file

import os
import pdb

from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import *
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFParser
from pdfminer.utils import PDFDocEncoding
15 
16 #inputFile = r'D:\用户目录\桌面\340xxxxxxxxxxxxxxxxxx0.pdf'
17 
18 
def decode_text(s):
    """Decode a PDF text string to a Python ``str``.

    Args:
        s: The raw string. ``bytes`` that start with the UTF-16BE byte order
            mark (``b'\\xfe\\xff'``) are decoded as UTF-16BE with the mark
            removed. Any other value is iterated element by element and each
            element (an ``int``, or a one-character ``str`` converted with
            ``ord``) is used as an index into ``PDFDocEncoding``.

    Returns:
        The decoded text.
    """
    if isinstance(s, bytes) and s.startswith(b'\xfe\xff'):
        # Plain bytes.decode replaces the former six.text_type(...) call;
        # 'ignore' drops undecodable or truncated trailing bytes.
        return s[2:].decode('utf-16be', 'ignore')
    # Iterating bytes yields ints, iterating str yields characters; normalise
    # both to integer code points before the table lookup.
    # NOTE(review): PDFDocEncoding is presumably a 256-entry table, so a
    # code point above 255 would raise IndexError - confirm against pdfminer.
    ords = (ord(c) if isinstance(c, str) else c for c in s)
    return ''.join(PDFDocEncoding[o] for o in ords)
29 
30 
31 
def get_msgs(inputFile):
    """Collect the text of every horizontal text box in a PDF file.

    Args:
        inputFile: Path of the PDF file to read.

    Returns:
        A list of strings, one per ``LTTextBoxHorizontal`` layout object, each
        stripped of surrounding whitespace, in the order in which the pages
        and their layout objects are iterated.

    Raises:
        PDFTextExtractionNotAllowed: If ``document.is_extractable`` is false.
    """
    msgs = []
    # The context manager guarantees the file handle is closed, including
    # when parsing or layout analysis raises.
    with open(inputFile, 'rb') as fp:
        # Create a PDF parser bound to the open file.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)
        # Check whether the document allows text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Create a PDF resource manager object.
        rsrcmgr = PDFResourceManager()
        # Layout-analysis parameters (library defaults).
        laparams = LAParams()
        # Create a PDF device object that aggregates each page into a layout.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF interpreter object that renders pages into the device.
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Process each page in turn.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Fetch the layout of the page that was just processed.
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    msgs.append(x.get_text().strip())
    return msgs
69                    
70 
71 
72     #print(msgs[5][5:]+ '\t' + msgs[4][4:])
73     
74 
75 
if __name__ == "__main__":
    # For every '.pdf' file in the current working directory, extract its
    # text boxes and append one tab-separated line to the output text file.
    for name in os.listdir('.'):
        if os.path.splitext(name)[-1] != '.pdf':
            continue
        msg = get_msgs(name)
        if len(msg) < 6:
            # msg[4] and msg[5] are indexed below; skip this file instead of
            # aborting the whole batch with an IndexError.
            print('skipped %s: expected at least 6 text boxes, found %d'
                  % (name, len(msg)))
            continue
        # NOTE(review): the slices presumably drop fixed-width field labels
        # (5 and 4 characters) from the extracted text - confirm against the
        # layout of the input PDFs.
        ms = msg[5][5:] + '\t' + msg[4][4:]
        # Reopened in append mode per file so that lines already written
        # survive a failure on a later PDF.
        with open('学生信息表.txt', 'a') as f:
            f.write(ms + '\n')
86     

 

Guess you like

Origin www.cnblogs.com/chillytao-suiyuan/p/11858433.html