Problem Description
As shown in the figure below, the PDF has dozens of pages and each page contains nine pictures. The goal is to
extract every picture and name it after the caption text printed below it.
Main issue:
The text can be recognized and the pictures can be extracted, but the pictures come out in an order that is inconsistent with the order of the recognized text, so the two results cannot simply be paired up to meet my needs.
# Anti-crawler watermark -- original post by CSDN author 诡途: https://blog.csdn.net/qq_35866846
Looking through the pdfminer source code, I found a way to save each page of the PDF as a single image. After saving a page, I use Image to crop every picture out of it by pixel coordinates. Because the page layout is relatively fixed, this method works. I have not found a better method yet, and I could not find a more complete solution to this problem on the Internet, so this may well be the first post about it. Friends who know a better method are welcome to discuss it in the comment area.
I also wrote several blogs about Image processing before:
Scaling a picture proportionally by its width, and cutting a long picture into pieces of a fixed pixel length.
A picture cutting and splicing experiment in Python: a creative game with numpy arrays.
Code
# 导入库
import fitz,time,re,os,pdfminer,datetime
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
# Anti-crawler watermark -- original post by CSDN author 诡途: https://blog.csdn.net/qq_35866846
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pandas as pd
import numpy as np
from PIL import Image
# 分页保存成图片
def save_page_pic(pdf_path,page_path):
    """Render every page of a PDF to its own PNG file.

    The output folder is emptied first so that images left over from an
    earlier run are not mixed with the pages of the current PDF.

    Args:
        pdf_path: Path of the PDF file to render.
        page_path: Folder that receives one ``page_<n>.png`` per page, where
            ``<n>`` is the 1-based page number. Created when it is missing.
    """
    # Create the folder on first use instead of failing inside os.listdir().
    os.makedirs(page_path, exist_ok=True)
    # Empty the folder before saving the new page images.
    for wj in os.listdir(page_path):
        os.remove(os.path.join(page_path, wj))

    doc = fitz.open(pdf_path)
    try:
        # enumerate() supplies the 1-based page number directly, so nothing
        # has to be parsed out of the page's string representation.
        for page, d in enumerate(doc, start=1):
            # No leading space in the name: "page_<n>.png".
            page_pic_path = os.path.join(page_path, f"page_{page}.png")
            # Anti-crawler watermark -- original post by CSDN author 诡途:
            # https://blog.csdn.net/qq_35866846
            # PyMuPDF renamed getPixmap -> get_pixmap and writePNG -> save;
            # prefer the current names and fall back to the old ones.
            render = getattr(d, "get_pixmap", None) or d.getPixmap
            pix = render()
            if pix.n >= 5:
                # Convert CMYK to RGB first; it cannot be written as PNG.
                pix = fitz.Pixmap(fitz.csRGB, pix)
            write_png = getattr(pix, "save", None) or pix.writePNG
            write_png(page_pic_path)
            pix = None  # release the pixmap before rendering the next page
    finally:
        # Close the document even when rendering a page fails.
        doc.close()
# 解析pdf 文本信息
def parse_pdf_txt(pdf_path,code_str):
    """Extract the horizontal text boxes of every PDF page into a table.

    Characters that pdfminer cannot map are emitted as ``(cid:<number>)``
    placeholders; each placeholder whose number is a key of ``code_str`` is
    replaced by the mapped text. Placeholders without a mapping are left in
    the text and reported on stdout so the dictionary can be extended.

    Args:
        pdf_path: Path of the PDF file to read.
        code_str: Mapping from a cid number given as a string (e.g. ``"46"``)
            to the text that replaces the ``(cid:46)`` placeholder.

    Returns:
        A DataFrame with one row per page. Column ``页码`` holds the 1-based
        page number; columns ``元素1``, ``元素2``, ... hold the page's text
        boxes in the order the layout yields them.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    result = []
    # The file must stay open while pages are read, so all parsing happens
    # inside the with-block, which also closes the file on errors.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        # Anti-crawler watermark -- original post by CSDN author 诡途:
        # https://blog.csdn.net/qq_35866846
        document = PDFDocument(parser)
        # Abort when the document does not allow text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        # Layout-analysis parameters.
        laparams = LAParams(
            char_margin=10.0,
            line_margin=0.2,
            boxes_flow=0.2,
            all_texts=False,
        )
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page_count, page in enumerate(PDFPage.create_pages(document), start=1):
            # Read the page into a layout object.
            interpreter.process_page(page)
            layout = device.get_result()
            # The page number is the first cell of every row.
            txt_list = [page_count]
            for obj in layout:
                if not isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
                    continue
                txt = _decode_cid_placeholders(obj.get_text(), code_str)
                # Anything still matching is a cid the dictionary lacks.
                cid_list = re.findall(r"cid:\d+", txt)
                if cid_list:
                    print(f"解码字典需补充: {cid_list}")
                txt_list.append(txt)
            result.append(txt_list)

    data = pd.DataFrame(result)
    data.columns = ["页码" if col == 0 else f"元素{col}" for col in data.columns]
    return data


def _decode_cid_placeholders(txt, code_str):
    """Replace each ``(cid:N)`` in *txt* whose ``N`` is a key of *code_str*."""
    def _substitute(match):
        # Keep the placeholder when no mapping exists so the caller can
        # report it instead of crashing with a KeyError.
        return code_str.get(match.group(1), match.group(0))

    return re.sub(r"\(cid:(\d+)\)", _substitute, txt)
def save_product_pic(txt_data,product_path,page_path):
    """Crop the nine product pictures out of every page image and save them.

    Each page image in ``page_path`` is cut into a 3 x 3 grid of fixed pixel
    boxes. The picture in grid position ``i`` (row by row, left to right) is
    saved as ``<name>.png`` in ``product_path``, where ``<name>`` is the text
    of the i-th caption column of that page's row in ``txt_data``. Pages are
    processed in ascending page-number order.

    Args:
        txt_data: DataFrame with a ``页码`` page-number column and ``元素N``
            text columns (the caption columns used are listed in
            ``need_col`` below).
        product_path: Existing folder that receives the cropped pictures.
        page_path: Folder holding the page images, named so that the page
            number sits between the first ``_`` and the following ``.``.

    Returns:
        A DataFrame with a single column ``图片名称`` listing the saved
        picture names (without the ``.png`` suffix) in processing order.
    """
    # Caption columns for grid positions 1..9.
    need_col = ['元素4', '元素5', '元素6', '元素10', '元素11', '元素12', '元素16', '元素17', '元素18']
    # Pixel ranges of the three picture columns and the three picture rows.
    x_list = [[45,183],[237,375],[429,567]]
    y_list = [[38,245],[290,497],[542,749]]
    # Crop boxes in reading order: 1,2,3 / 4,5,6 / 7,8,9.
    boxes = [
        (left, upper, right, lower)
        for upper, lower in y_list
        for left, right in x_list
    ]

    # Collect the page images; files whose name carries no page number are
    # ignored rather than crashing the whole run.
    page_pics = []
    for pic_name in os.listdir(page_path):
        try:
            page = int(pic_name.split('_')[1].split('.')[0])
        except (IndexError, ValueError):
            continue
        page_pics.append((page, pic_name))
    # os.listdir() order is arbitrary; sort so output follows the page order.
    page_pics.sort()
    total_page = len(page_pics)

    # reindex() yields NaN for caption columns the table does not have, which
    # the string check below then skips.
    captions = txt_data.reindex(columns=need_col)

    result = []
    for count, (page, pic_name) in enumerate(page_pics, start=1):
        rows = captions[txt_data["页码"] == page].values.tolist()
        if not rows:
            print(f"第{page}页没有对应的文本信息,已跳过!")
            continue
        product_pic_list = rows[0]
        # The with-block closes the page image once its crops are saved.
        with Image.open(os.path.join(page_path, pic_name)) as im:
            for box, _product_pic_name in zip(boxes, product_pic_list):
                # The last page may hold fewer than nine pictures; missing
                # captions show up as None/NaN or an empty string.
                # Anti-crawler watermark -- original post by CSDN author 诡途:
                # https://blog.csdn.net/qq_35866846
                if not (isinstance(_product_pic_name, str) and _product_pic_name):
                    continue
                name = _product_pic_name.strip('\n')
                result.append(name)
                im.crop(box).save(os.path.join(product_path, name + ".png"))
        print(f"第{count}页图片提取成功,剩余{total_page-count}页!")

    pd_result = pd.DataFrame(result,columns=["图片名称"])
    return pd_result
# Use the first entry of the "pdf" folder as the input document.
pdf_files = os.listdir("pdf")
if not pdf_files:
    raise FileNotFoundError("pdf 文件夹中没有找到任何文件")
pdf_path = os.path.join("pdf",pdf_files[0])
# Output is archived in a folder named after today's date (YYYY-MM-DD).
today = datetime.date.today().isoformat()
fina_path = f"存档//{today}"
product_path = f"存档//{today}//pic"
# Folder for the per-page images.
page_path = "page_pic"
# Custom decoding dictionary: maps cid numbers that text extraction cannot
# resolve to their characters. Extend it whenever new cids are reported.
# Anti-crawler watermark -- original post by CSDN author 诡途:
# https://blog.csdn.net/qq_35866846
code_str = {
    "46":"K","49":"N","25":"6","23":"4","28":"9","57":"V","45":"J","24":"5","56":"U",}
# Create the output folders. makedirs() also creates the missing parent
# folder, and only an existing folder is reported as "already exists";
# any other failure (e.g. permissions) is raised instead of being hidden.
for folder in (fina_path, product_path):
    try:
        os.makedirs(folder)
    except FileExistsError:
        print(f"文件夹 {folder} 已存在")
# Save every page as an image.
save_page_pic(pdf_path,page_path)
# Extract the text information.
txt_data = parse_pdf_txt(pdf_path,code_str)
# Optionally save the raw extracted text locally:
# txt_data.to_excel(os.path.join(fina_path,"pdf文字信息.xlsx"),index=False)
pic_name = save_product_pic(txt_data,product_path,page_path)
# Save the picture names locally as a single column.
pic_name.to_excel(os.path.join(fina_path,"pdf文字信息.xlsx"),index=False)