1. PDF to picture
import os
import numpy as np
import cv2
from PIL import Image
Image.MAX_IMAGE_PIXELS=None
import tempfile
import time
import sys
from pdf2image import convert_from_bytes
# # 预处理程序
# sys.path.append("./data_prepare")
# from data_prepare.batch_pdf2jpg import pdf2jpg, pdf2pil_imgs
# from data_prepare.batch_rectify import batch_rectify2, rectify_single_img_fast
# sys.path.append("./tools")
# from tools.img_tools import cv_resize_long_edge, pil_resize_long_edge
def cv_resize_long_edge(cv_img, long_edge_length):
    """Rescale an OpenCV image uniformly based on its longer side.

    Args:
        cv_img: array-like image whose ``shape[:2]`` is ``(height, width)``.
        long_edge_length: desired length of the longer side, in pixels.

    Returns:
        ``cv_img`` itself when its longer side already equals
        ``long_edge_length``; otherwise the result of ``cv2.resize`` with
        both axes scaled by ``long_edge_length / longer_side`` using
        bilinear interpolation.
    """
    rows, cols = cv_img.shape[:2]
    longest = max(cols, rows)
    if longest != long_edge_length:
        # One common factor for both axes keeps the aspect ratio.
        scale = long_edge_length / longest
        return cv2.resize(cv_img, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)
    # Already the requested size: hand back the very same object.
    return cv_img
def pil_resize_long_edge(pil_img, long_edge_length):
    """Resize a PIL image so that its longer side becomes ``long_edge_length``.

    Args:
        pil_img: image object exposing ``size`` as ``(width, height)`` and a
            PIL-style ``resize(size, resample)`` method.
        long_edge_length: desired length of the longer side, in pixels.

    Returns:
        ``pil_img`` itself when its longer side already equals
        ``long_edge_length``; otherwise the new image returned by ``resize``.
    """
    width, height = pil_img.size
    max_len = max(width, height)
    if max_len == long_edge_length:
        return pil_img
    ratio = max_len / long_edge_length
    # Clamp to 1 px so a very elongated image never rounds a side down to 0,
    # which ``resize`` would reject.
    new_size = (max(1, round(width / ratio)), max(1, round(height / ratio)))
    # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same
    # filter under its current name.
    return pil_img.resize(new_size, Image.LANCZOS)
class PDF:
    """A PDF document rendered to one OpenCV image per page.

    ``self.pages`` is a list with one dict per page:
        "ocv":      the page image as a numpy array after the RGB->BGR swap
                    (and after rectification when a model is supplied)
        "img_path": path of the JPEG written for the page; present only when
                    ``save_img_dir`` is not None

    ``len(pdf)`` and ``pdf[i]`` delegate to ``self.pages``.
    """

    def __init__(self, pdf_bytes, model, dpi=300, n_threads=4,
                 save_img_dir="./tmp_ocr_dir", small_size=1280, max_size=5000):
        """Render ``pdf_bytes`` and build the page list.

        Args:
            pdf_bytes: raw bytes of the PDF file.
            model: orientation model handed to ``rectify_single_img_fast``;
                ``None`` skips rectification.
            dpi: rendering resolution passed to ``convert_from_bytes``.
            n_threads: ``thread_count`` passed to ``convert_from_bytes``.
            save_img_dir: directory in which each page is saved as
                ``<index>.jpg``; ``None`` disables saving.
            small_size: currently unused; kept so existing callers that pass
                it keep working.
            max_size: pages whose longer side exceeds this many pixels are
                shrunk to it before any further processing.
        """
        st = time.time()
        self.pages = []
        # Look the rectifier up once, and only when it is actually needed.
        rectify = self._resolve_rectifier() if model is not None else None
        with tempfile.TemporaryDirectory() as tmp_out:
            pil_imgs = convert_from_bytes(pdf_bytes, output_folder=tmp_out, dpi=dpi, fmt="jpg", thread_count=n_threads)
            print("pdf拆分用时: 共 %d 页用时: %.3fs" % (len(pil_imgs), time.time() - st))
            start = time.time()
            if save_img_dir is not None:
                os.makedirs(save_img_dir, exist_ok=True)
            # The rendered pages are backed by files inside tmp_out, so they
            # are consumed while the temporary directory still exists.
            for i, pil_img in enumerate(pil_imgs):
                page = {}
                cv_img = np.array(pil_img)  # PIL image -> numpy array
                # Pixels are copied into cv_img; release the file handle so
                # the temporary directory can be removed cleanly.
                pil_img.close()
                # PIL delivers RGB while OpenCV expects BGR.
                # NOTE(review): assumes 3-channel pages - confirm for
                # grayscale output.
                cv_img = cv2.cvtColor(cv_img, cv2.COLOR_RGB2BGR)
                # Cap the maximum size.
                if max(cv_img.shape[:2]) > max_size:
                    cv_img = cv_resize_long_edge(cv_img, max_size)
                if rectify is not None:
                    # Use the orientation classification model.
                    cv_img_org = rectify(model, cv_img)
                else:
                    cv_img_org = cv_img
                page["ocv"] = cv_img_org  # original-size OpenCV image
                self.pages.append(page)
                if save_img_dir is not None:
                    img_path = os.path.join(save_img_dir, "%d.jpg" % i)
                    page["img_path"] = img_path
                    cv2.imwrite(img_path, page["ocv"])
        print("pdf初始化及矫正: 共 %d 页用时: %.3fs" % (len(self.pages), time.time() - start))

    @staticmethod
    def _resolve_rectifier():
        """Return the function used to rectify a page when a model is given."""
        rectify = globals().get("rectify_single_img_fast")
        if rectify is None:
            # The module-level import of this helper is commented out in this
            # file, so fall back to the location that import names.
            # NOTE(review): requires the project's data_prepare package to be
            # importable - confirm.
            from data_prepare.batch_rectify import rectify_single_img_fast as rectify
        return rectify

    def __getitem__(self, i):
        """Return the page dict at index ``i``."""
        return self.pages[i]

    def __len__(self):
        """Return the number of pages."""
        return len(self.pages)
if __name__ == '__main__':
    # Render every PDF found in ./me_pdf and write each page to ./test_img as
    # <pdf name>_<page index>.jpg.
    tmp_dir = "./test_img"
    os.makedirs(tmp_dir, exist_ok=True)
    path = './me_pdf'
    # Sorted so that runs process the files in a reproducible order.
    for file_name in sorted(os.listdir(path)):
        pdf_file_path = os.path.join(path, file_name)
        if not os.path.isfile(pdf_file_path):
            # Sub-directories cannot be read as a PDF.
            continue
        # Context manager closes the handle even if reading fails.
        with open(pdf_file_path, 'rb') as pdf_file:
            pdf_bin = pdf_file.read()
        pdf = PDF(pdf_bin, model=None, save_img_dir=None, small_size=2000)
        # splitext keeps dots inside the stem ("a.v2.pdf" -> "a.v2"), so
        # differently named PDFs no longer overwrite each other's pages, and
        # it does not depend on '/' being the path separator.
        name = os.path.splitext(file_name)[0]
        # A distinct index name avoids clobbering any outer loop variable.
        for page_idx, page in enumerate(pdf):
            img = page['ocv']
            cv2.imwrite(os.path.join(tmp_dir, name + '_' + str(page_idx) + '.jpg'), img)
2. Image to PDF
from reportlab.lib.pagesizes import A4, portrait, landscape
from reportlab.pdfgen import canvas
import os
import cv2
# imgs_path = './需要转换成pdf图片'
# imgs_list_path = [os.path.join(imgs_path,i) for i in os.listdir(imgs_path)]
# imgs_list_path = sorted(imgs_list_path)
# for i, img_list_path in enumerate(imgs_list_path):
# if i<1:
# print('img_list_path:', img_list_path)
def convert_images_to_pdf(imgs_path, pdf_path):
    """Combine the files of a directory into a single A4 portrait PDF.

    Every regular file directly inside ``imgs_path`` is drawn, in sorted
    name order, onto its own page with the full page width and height as
    the drawing size.

    Args:
        imgs_path: directory containing the images.
        pdf_path: path of the PDF file to write.

    Returns:
        The number of pages written.
    """
    page_size = portrait(A4)
    (w, h) = page_size
    c = canvas.Canvas(pdf_path, pagesize=page_size)
    pages = 0
    for file_name in sorted(os.listdir(imgs_path)):
        img_file = os.path.join(imgs_path, file_name)
        if not os.path.isfile(img_file):
            # Sub-directories cannot be drawn as an image.
            continue
        c.drawImage(img_file, 0, 0, w, h)
        c.showPage()  # finish this page; the next image starts a new one
        pages += 1
    c.save()
    return pages
if __name__ == '__main__':
    # Build one PDF per sub-directory of `path`, written to the current
    # working directory as 1.pdf, 2.pdf, ...
    path = './红头文件/身份证pdf'
    candidates = [os.path.join(path, entry) for entry in os.listdir(path)]
    # Sorted so the same directory always gets the same number; stray files
    # are skipped because the converter expects a directory of images.
    dirs_list_path = sorted(p for p in candidates if os.path.isdir(p))
    for i, dir_list_path in enumerate(dirs_list_path):
        pdf_path = str(i+1)+'.pdf'
        convert_images_to_pdf(dir_list_path, pdf_path)