Reading text, tables, and images from PDF files with Python

Tip: After the article is written, the table of contents can be automatically generated. For how to generate it, please refer to the help document on the right.


1. Text reading

Based on fitz (the import name of the PyMuPDF package)

import fitz
pdf_file = "example.pdf"

# Blocks whose bounding box is shorter than this are skipped. This is only an
# example heuristic for filtering out table text; tune it (or filter on other
# block attributes such as position) for your own documents.
MIN_BLOCK_HEIGHT = 20
# Value of the block_type field that marks an image block.
IMAGE_BLOCK_TYPE = 1

text_parts = []
# The context manager closes the document even if extraction raises.
with fitz.open(pdf_file) as pdf_document:
    for page in pdf_document:
        # Each block is a tuple:
        # (x0, y0, x1, y1, text, block_no, block_type)
        for block in page.get_text("blocks"):
            y0, y1 = block[1], block[3]
            text_block = block[4]
            block_type = block[6]
            if y1 - y0 < MIN_BLOCK_HEIGHT:  # drop small text blocks by height
                continue
            # Test the block type rather than searching the text for "image",
            # which would also discard real paragraphs containing that word.
            if block_type == IMAGE_BLOCK_TYPE:
                continue
            text_parts.append(text_block)
# Join once at the end instead of repeatedly concatenating strings.
text = "".join(text_parts)
print(text)

2. Image reading

Based on fitz (the import name of the PyMuPDF package)

import fitz
# Save every image embedded in example.pdf as a PNG file, one file per image.
# The context manager closes the document even if saving an image fails.
with fitz.open("example.pdf") as doc:
    for page_index, page in enumerate(doc):  # page_index is zero-based
        image_list = page.get_images()
        # Report the number of images found on the page.
        if image_list:
            print(f"Found {len(image_list)} images on page {page_index}")
        else:
            print("No images found on page", page_index)
        # Image numbering in the output file names starts at 1.
        for image_index, img in enumerate(image_list, start=1):
            xref = img[0]  # get the XREF of the image
            pix = fitz.Pixmap(doc, xref)  # create a Pixmap
            if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
                pix = fitz.Pixmap(fitz.csRGB, pix)
            pix.save("page_%s-image_%s.png" % (page_index, image_index))
            pix = None  # drop the reference before loading the next image

3. Table reading

Based on fitz (the import name of the PyMuPDF package)

import fitz
# NOTE(review): this snippet saves the images embedded in each page as PNG
# files; it does not parse table structure. Presumably it is meant for tables
# that are stored in the PDF as pictures - confirm. Newer PyMuPDF releases
# also provide page.find_tables() for structured tables; verify that it is
# available in the installed version before relying on it.
# The context manager closes the document even if saving an image fails.
with fitz.open("example.pdf") as doc:
    for page_index, page in enumerate(doc):  # page_index is zero-based
        image_list = page.get_images()
        # Report the number of images found on the page.
        if image_list:
            print(f"Found {len(image_list)} images on page {page_index}")
        else:
            print("No images found on page", page_index)
        # Image numbering in the output file names starts at 1.
        for image_index, img in enumerate(image_list, start=1):
            xref = img[0]  # get the XREF of the image
            pix = fitz.Pixmap(doc, xref)  # create a Pixmap
            if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
                pix = fitz.Pixmap(fitz.csRGB, pix)
            pix.save("page_%s-image_%s.png" % (page_index, image_index))
            pix = None  # drop the reference before loading the next image

Based on fitz (PyMuPDF): write the plain text of every page, which includes the text inside any tables, to a text file

import fitz
# Write the plain text of every page to output.txt, separating pages with a
# form feed. Both the document and the output file are managed by `with`, so
# they are closed even if reading a page or writing fails.
with fitz.open("example.pdf") as doc, open("output.txt", "wb") as out:
    for page in doc:  # iterate the document pages
        # The file is opened in binary mode, so encode the page text to
        # UTF-8 bytes before writing it.
        out.write(page.get_text().encode("utf8"))
        out.write(bytes((12,)))  # page delimiter (form feed 0x0C)

Based on pdfplumber

import pdfplumber
import pandas as pd
# Open the PDF file; the context manager closes it once the tables are read.
with pdfplumber.open("example.pdf") as pdf:
    # Access the second page (index 1 of the pages list).
    second_page = pdf.pages[1]
    # Detect the tables on the page; the result is a list of tables, each a
    # list of rows. The empty table_settings dict passes no overrides.
    tables = second_page.extract_tables(table_settings={})

for table in tables:
    if not table:
        # No header row to build column names from; skip the empty table.
        continue
    # Treat the first row as the header and the remaining rows as data.
    header, *rows = table
    table = pd.DataFrame(rows, columns=header)
    print(table)

Guess you like

Origin blog.csdn.net/Brilliant_liu/article/details/134098051