Reading text, tables, and images from PDF files with Python
Tip: After the article is written, the table of contents can be automatically generated. For how to generate it, please refer to the help document on the right.
Table of contents
1. Text reading
Based on fitz (PyMuPDF)
import fitz
# Extract the text of every page of a PDF, skipping short blocks and image
# blocks, then print the concatenated result.
pdf_file = "example.pdf"

# Blocks whose bounding box is lower than this (in page coordinate units) are
# skipped. This is only an example heuristic for filtering out table text;
# refine it using the block position and other attributes as needed.
MIN_BLOCK_HEIGHT = 20

# block_type value that get_text("blocks") reports for image blocks.
IMAGE_BLOCK_TYPE = 1

text_parts = []
# The context manager closes the document even if extraction raises.
with fitz.open(pdf_file) as pdf_document:
    for page in pdf_document:
        # Each block is (x0, y0, x1, y1, text, block_no, block_type).
        for block in page.get_text("blocks"):
            _, y0, _, y1 = block[:4]
            text_block = block[4]
            block_type = block[6]
            # Filter out small text blocks by height.
            if y1 - y0 < MIN_BLOCK_HEIGHT:
                continue
            # Skip image blocks by their type field; testing for the substring
            # "image" would also discard real text that mentions the word.
            if block_type == IMAGE_BLOCK_TYPE:
                continue
            text_parts.append(text_block)

text = "".join(text_parts)
print(text)
2. Image reading
Based on fitz (PyMuPDF)
import fitz
# Save every image embedded in the PDF as "page_<page>-image_<n>.png" in the
# current directory, reporting how many images each page contains.
# The context manager closes the document even if saving an image raises.
with fitz.open("example.pdf") as doc:
    for page_index, page in enumerate(doc):  # iterate over pdf pages
        image_list = page.get_images()

        # print the number of images found on the page
        if image_list:
            print(f"Found {len(image_list)} images on page {page_index}")
        else:
            print("No images found on page", page_index)

        for image_index, img in enumerate(image_list, start=1):
            xref = img[0]  # get the XREF of the image
            pix = fitz.Pixmap(doc, xref)  # create a Pixmap
            if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
                pix = fitz.Pixmap(fitz.csRGB, pix)
            # save the image as png
            pix.save(f"page_{page_index}-image_{image_index}.png")
            pix = None  # drop the reference so the pixmap can be freed
3. Table reading
Based on fitz (PyMuPDF)
import fitz
# NOTE(review): this snippet sits under the "Table reading" heading, but it
# only extracts embedded images and does not parse any table structure --
# confirm whether a table-specific example was intended here.
#
# Save every image embedded in the PDF as "page_<page>-image_<n>.png" in the
# current directory, reporting how many images each page contains.
# The context manager closes the document even if saving an image raises.
with fitz.open("example.pdf") as doc:
    for page_index, page in enumerate(doc):  # iterate over pdf pages
        image_list = page.get_images()

        # print the number of images found on the page
        if image_list:
            print(f"Found {len(image_list)} images on page {page_index}")
        else:
            print("No images found on page", page_index)

        for image_index, img in enumerate(image_list, start=1):
            xref = img[0]  # get the XREF of the image
            pix = fitz.Pixmap(doc, xref)  # create a Pixmap
            if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
                pix = fitz.Pixmap(fitz.csRGB, pix)
            # save the image as png
            pix.save(f"page_{page_index}-image_{image_index}.png")
            pix = None  # drop the reference so the pixmap can be freed
Based on fitz (PyMuPDF): extract tabular data as plain text
import fitz
# Write the plain text of every page to output.txt as UTF-8 bytes, with a form
# feed byte after each page as a page delimiter.
# Both resources are managed by `with`, so they are closed even if reading a
# page or writing the output fails part-way through.
with fitz.open("example.pdf") as doc, open("output.txt", "wb") as out:
    for page in doc:  # iterate the document pages
        text = page.get_text().encode("utf8")  # plain text, encoded as UTF-8
        out.write(text)  # write text of page
        out.write(bytes((12,)))  # write page delimiter (form feed 0x0C)
Based on pdfplumber
import pdfplumber
import pandas as pd
# Print each table found on the second page of the PDF as a pandas DataFrame,
# using the table's first row as the column headers.
# Open the PDF file; the context manager closes it when the block finishes.
with pdfplumber.open("example.pdf") as pdf:
    # Access the second page (index 1); raises IndexError on a one-page PDF.
    second_page = pdf.pages[1]
    # Detect tables automatically; returns a list of tables, each a list of
    # rows. The empty table_settings dict keeps pdfplumber's defaults.
    tables = second_page.extract_tables(table_settings={})
    for table in tables:
        if not table:
            # An empty table has no header row to use as column names.
            continue
        header, *rows = table
        frame = pd.DataFrame(rows, columns=header)
        print(frame)