Python读取PDF文件

# readPdf.py

# python读取pdf格式的文档

from urllib.request import urlopen

from pdfminer.pdfinterp import PDFResourceManager, process_pdf

from pdfminer.converter import TextConverter

from pdfminer.layout import LAParams

from io import StringIO

from io import open

def readPDF(pdfFile):

rsrcmgr = PDFResourceManager()

retstr = StringIO()

laparams = LAParams()

device = TextConverter(rsrcmgr, retstr, laparams=laparams)

process_pdf(rsrcmgr, device, pdfFile)

device.close()

content = retstr.getvalue()

retstr.close()

return content

pdfFile = urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf")

outputString = readPDF(pdfFile)

print(outputString)

pdfFile.close()

发布了76 篇原创文章 · 获赞 19 · 访问量 3万+

猜你喜欢

转载自blog.csdn.net/a289237642/article/details/86599859
今日推荐