场景
获取文件中的文本内容(只读不写)
PDF
安装:pip install pdfminer3k
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
def read_pdf(path_pdf):
    """Extract the text content of a PDF file.

    Args:
        path_pdf: Path of the PDF file; it is opened read-only in binary mode.

    Returns:
        str: The text that the converter wrote for the document.
    """
    # Resource manager passed to both the converter and process_pdf.
    rsrcmgr = PDFResourceManager()
    # The with-statement closes the PDF file and the in-memory buffer, and the
    # finally clause closes the converter, even when parsing raises.
    with open(path_pdf, 'rb') as pdf, StringIO() as outfp:
        # Text converter writing into the buffer, using default LAParams.
        device = TextConverter(rsrcmgr, outfp, laparams=LAParams())
        try:
            process_pdf(rsrcmgr, device, pdf)
            # Read the buffer before it is closed on leaving the with-block.
            return outfp.getvalue()
        finally:
            device.close()
if __name__ == '__main__':
    # Demo run: extract the text of a sample PDF and print it.
    pdf_text = read_pdf('P020190716349644060705.pdf')
    print(pdf_text)
WORD
安装:pip install python-docx
读取段落
from docx import Document
# Load the Word document
d = Document('a.docx')
# Print the text of every paragraph, in document order
for para_text in (para.text for para in d.paragraphs):
    print(para_text)
读取表格
from docx import Document
# Load the Word document
d = Document('a.docx')
# Walk every table row by row and print the text of each cell
all_cells = (
    cell
    for tbl in d.tables
    for tbl_row in tbl.rows
    for cell in tbl_row.cells
)
for cell in all_cells:
    print(cell.text)
EXCEL
from pandas import read_excel
def xlsx2df(fname, sheet_name=0, **kwargs):
    """Load a sheet of an Excel file through pandas' read_excel.

    Args:
        fname: Passed to read_excel as its first argument.
        sheet_name: Sheet selector forwarded to read_excel; defaults to 0.
        **kwargs: Extra keyword options forwarded unchanged to read_excel.

    Returns:
        Whatever read_excel returns for the given arguments.
    """
    # Pass sheet_name by keyword so the call does not depend on the
    # positional order of read_excel's parameters.
    return read_excel(fname, sheet_name=sheet_name, **kwargs)
PPT
安装:pip install python-pptx
import pptx
# Open the presentation
p = pptx.Presentation('a.pptx')
# Visit every slide
for slide in p.slides:
    # Visit every shape on the slide
    for shape in slide.shapes:
        # Text: ask the shape whether it carries a text frame instead of
        # testing for the SlidePlaceholder class, so ordinary text boxes and
        # auto shapes are printed as well as placeholders.
        if shape.has_text_frame:
            for paragraph in shape.text_frame.paragraphs:
                print(paragraph.text)
        # Tables: a graphic frame may hold a chart rather than a table, so
        # check has_table before touching shape.table.
        if shape.has_table:
            for cell in shape.table.iter_cells():
                print(cell.text)