Converting between PDF files and images

1. PDF to images

import os
import numpy as np
import cv2
from PIL import Image
Image.MAX_IMAGE_PIXELS=None
import tempfile
import time
import sys
from pdf2image import convert_from_bytes

# # Preprocessing helpers
# sys.path.append("./data_prepare")
# from data_prepare.batch_pdf2jpg import pdf2jpg, pdf2pil_imgs
# from data_prepare.batch_rectify import batch_rectify2, rectify_single_img_fast
# sys.path.append("./tools")
# from tools.img_tools import cv_resize_long_edge, pil_resize_long_edge

def cv_resize_long_edge(cv_img, long_edge_length):
    """Scale an OpenCV image so its longer side equals ``long_edge_length``.

    Both axes are scaled by the same factor, so the aspect ratio is kept.

    Args:
        cv_img: Image array whose first two dimensions are (height, width).
        long_edge_length: Target length, in pixels, of the longer side.

    Returns:
        The image resized with bilinear interpolation, or ``cv_img`` itself
        (the same object, not a copy) when its longer side already has the
        requested length.
    """
    rows, cols = cv_img.shape[:2]
    longest = max(cols, rows)
    if longest != long_edge_length:
        scale = long_edge_length / longest
        # dsize=None lets OpenCV derive the output size from fx / fy.
        return cv2.resize(cv_img, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)
    return cv_img

def pil_resize_long_edge(pil_img, long_edge_length):
    """Scale a PIL image so its longer side equals ``long_edge_length``.

    Both axes are scaled by the same factor, so the aspect ratio is kept.

    Args:
        pil_img: Image object exposing ``size`` as (width, height) and
            ``resize``.
        long_edge_length: Target length, in pixels, of the longer side.

    Returns:
        A resized image, or ``pil_img`` itself (the same object, not a copy)
        when its longer side already has the requested length.
    """
    width, height = pil_img.size
    max_len = max(width, height)
    if max_len == long_edge_length:
        return pil_img
    ratio = max_len / long_edge_length
    # Clamp to 1 px so an extreme aspect ratio cannot round a side down to 0.
    new_size = (max(1, round(width / ratio)), max(1, round(height / ratio)))
    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
    # under the name that works on both old and new releases.
    return pil_img.resize(new_size, Image.LANCZOS)

class PDF:
    """Sequence-like container of page images rendered from a PDF document.

    Each entry of ``pages`` is a dict with:
      * ``"ocv"``      - the page as an OpenCV-style (BGR) array;
      * ``"img_path"`` - path of the JPEG written for the page (present only
        when ``save_img_dir`` is not None).

    Indexing and ``len()`` are forwarded to ``pages``.
    """

    def __init__(self, pdf_bytes, model, dpi=300, n_threads=4, save_img_dir="./tmp_ocr_dir", small_size=1280):
        """Render every page of a PDF and store it as an OpenCV image.

        Args:
            pdf_bytes: Raw content of the PDF file.
            model: Orientation model handed to ``rectify_single_img_fast``;
                pass None to skip the rectification step.
            dpi: Rendering resolution passed to ``convert_from_bytes``.
            n_threads: ``thread_count`` passed to ``convert_from_bytes``.
            save_img_dir: Directory that receives ``<page index>.jpg`` for
                every page (created if missing); None disables saving.
            small_size: Currently unused; kept so existing callers that pass
                it keep working.
        """
        # Longest side allowed for a page image; larger pages are downscaled.
        max_size = 5000
        self.pages = []

        st = time.time()
        with tempfile.TemporaryDirectory() as tmp_out:
            pil_imgs = convert_from_bytes(pdf_bytes, output_folder=tmp_out, dpi=dpi, fmt="jpg", thread_count=n_threads)
            print("pdf拆分用时: 共 %d 页用时: %.3fs" % (len(pil_imgs), time.time() - st))

            start = time.time()
            if save_img_dir is not None:
                os.makedirs(save_img_dir, exist_ok=True)

            # Pages are decoded while the temporary folder still exists: with
            # an explicit output_folder the returned PIL images may still be
            # backed by the files inside it, so they must not outlive it.
            for i, pil_img in enumerate(pil_imgs):
                page = {}
                cv_img = np.array(pil_img)  # forces the pixel data to be read
                # Release the backing file so the folder can be removed
                # cleanly when the ``with`` block exits.
                pil_img.close()
                # PIL yields RGB while OpenCV expects BGR. RGB2BGR performs
                # the same channel swap as BGR2RGB but names the actual
                # direction of the conversion.
                cv_img = cv2.cvtColor(cv_img, cv2.COLOR_RGB2BGR)
                if max(cv_img.shape[:2]) > max_size:
                    cv_img = cv_resize_long_edge(cv_img, max_size)
                if model is not None:
                    # Orientation correction with the classification model.
                    # NOTE(review): rectify_single_img_fast is not defined or
                    # imported in the visible part of this file (its import is
                    # commented out) - confirm it is in scope before passing
                    # a model.
                    cv_img_org = rectify_single_img_fast(model, cv_img)
                else:
                    cv_img_org = cv_img
                page["ocv"] = cv_img_org
                self.pages.append(page)
                if save_img_dir is not None:
                    img_path = os.path.join(save_img_dir, "%d.jpg" % i)
                    page["img_path"] = img_path
                    cv2.imwrite(img_path, page["ocv"])

            print("pdf初始化及矫正: 共 %d 页用时: %.3fs" % (len(self.pages), time.time() - start))

    def __getitem__(self, i):
        """Return the page dict at index ``i``."""
        return self.pages[i]

    def __len__(self):
        """Return the number of pages."""
        return len(self.pages)

if __name__ == '__main__':
    # Demo: render every PDF in ./me_pdf and write each page to ./test_img as
    # "<pdf name>_<page index>.jpg".
    tmp_dir = "./test_img"
    os.makedirs(tmp_dir, exist_ok=True)

    path = './me_pdf'
    for file_name in sorted(os.listdir(path)):
        pdf_file_path = os.path.join(path, file_name)
        if not os.path.isfile(pdf_file_path):
            continue  # sub-directories cannot be read as PDF bytes
        # Context manager so the handle is closed right after reading.
        with open(pdf_file_path, 'rb') as pdf_file:
            pdf_bin = pdf_file.read()
        pdf = PDF(pdf_bin, model=None, save_img_dir=None, small_size=2000)
        # splitext keeps dots inside the name ("a.v2.pdf" -> "a.v2"), so two
        # PDFs that differ only after the first dot no longer overwrite each
        # other's pages.
        name = os.path.splitext(file_name)[0]
        for page_idx, page in enumerate(pdf):
            img = page['ocv']
            cv2.imwrite(os.path.join(tmp_dir, name + '_' + str(page_idx) + '.jpg'), img)

2. Images to PDF

from reportlab.lib.pagesizes import A4, portrait, landscape
from reportlab.pdfgen import canvas
import os
import cv2
# imgs_path = './需要转换成pdf图片'
# imgs_list_path = [os.path.join(imgs_path,i) for i in os.listdir(imgs_path)]
# imgs_list_path = sorted(imgs_list_path)
# for i, img_list_path in enumerate(imgs_list_path):
#     if i<1:
#         print('img_list_path:', img_list_path)

def _natural_sort_key(name):
    """Return a sort key that orders embedded numbers by value ("2" < "10")."""
    import re  # local import keeps this block self-contained

    # Splitting on a capturing group alternates text and digit runs and
    # always starts with a (possibly empty) text chunk, so equal positions of
    # two keys hold the same type and compare safely.
    chunks = re.split(r"(\d+)", name)
    parsed = [int(chunk) if idx % 2 else chunk for idx, chunk in enumerate(chunks)]
    # The raw name breaks ties such as "01.jpg" vs "1.jpg" deterministically.
    return (parsed, name)


def convert_images_to_pdf(imgs_path, pdf_path, preserve_aspect_ratio=False):
    """Write every image file in a directory to one PDF, one image per page.

    Images are placed in natural file-name order (``2.jpg`` before
    ``10.jpg``) on portrait A4 pages. Sub-directories are skipped.

    Args:
        imgs_path: Directory containing the image files.
        pdf_path: Path of the PDF file to create.
        preserve_aspect_ratio: When False (default) each image is stretched
            to fill the whole page; when True it is scaled to fit the page
            without distortion.
    """
    img_names = sorted(
        (entry for entry in os.listdir(imgs_path)
         if os.path.isfile(os.path.join(imgs_path, entry))),
        key=_natural_sort_key,
    )
    (w, h) = portrait(A4)
    c = canvas.Canvas(pdf_path, pagesize=portrait(A4))
    for img_name in img_names:
        c.drawImage(os.path.join(imgs_path, img_name), 0, 0, w, h,
                    preserveAspectRatio=preserve_aspect_ratio)
        c.showPage()  # finish the current page and start a new one
    c.save()

if __name__ == '__main__':
    # Demo: build one PDF per sub-directory of `path`; the outputs are named
    # 1.pdf, 2.pdf, ... and written to the current working directory.
    path = './红头文件/身份证pdf'
    # os.listdir returns entries in arbitrary order, so sort them to keep the
    # directory -> PDF number mapping stable between runs, and skip plain
    # files, which cannot be listed as image folders.
    sub_dirs = sorted(
        entry for entry in os.listdir(path)
        if os.path.isdir(os.path.join(path, entry))
    )
    for i, sub_dir in enumerate(sub_dirs):
        pdf_path = str(i+1)+'.pdf'
        convert_images_to_pdf(os.path.join(path, sub_dir), pdf_path)

 

Guess you like

Source: blog.csdn.net/fanzonghao/article/details/104835642