Python操作PDF

一.简介

PDF(Portable Document Format),中文名称为“便携式文档格式”,是我们经常会接触到的一种文件格式,文献、文档等很多都是PDF格式。它具有格式稳定的优势,使得文件在打印、分享、传输过程中能够最大程度地保持原有的色彩和格式。

二.PyPDF2

PyPDF2是一个第三方的python PDF库,它能够对PDF文件进行分割、合并、裁剪和转换页面。

另外,它还可以对PDF文件添加自定义数据、水印、密码,也可以从PDF文件中检索出文本和元数据。

1.安装

使用pip直接安装:

pip install PyPDF2

2.删除PDF页

注意:用这种方式删除页面后,生成的PDF不再保留原文件的目录(书签)。

from PyPDF2 import PdfFileWriter, PdfFileReader
import os

def delete_pdf(index):
    """Copy every page of ``input1`` except the listed ones into a new PDF.

    Reads the module-level ``input1`` reader, appends the kept pages to the
    module-level ``output`` writer, and writes the result to
    ``PyPDF2-output.pdf`` in the current working directory.

    :param index: iterable of 1-based page numbers to leave out.
    :return: None
    """
    # Build the lookup once so each membership test is O(1).
    skipped = set(index)

    for i in range(input1.getNumPages()):
        # ``index`` is 1-based while getPage() is 0-based.
        if i + 1 in skipped:
            continue
        output.addPage(input1.getPage(i))

    # The context manager flushes and closes the output file even if
    # writing fails; the handle was previously left open.
    with open("PyPDF2-output.pdf", "wb") as output_stream:
        output.write(output_stream)

# Switch to a hard-coded, machine-specific working directory
# (presumably where the sample PDFs live -- adjust for your machine).
os.chdir(r'F:\file\pyfile\1\Code\python-code\自动化\PDF')
# Module-level writer and reader that delete_pdf() reads as globals.
output = PdfFileWriter() 
# NOTE(review): the source file handle opened here is never closed.
input1 = PdfFileReader(open("数学建模技能图谱.pdf", "rb")) 
# Leave out pages 2, 3 and 4 (1-based) and write PyPDF2-output.pdf.
delete_pdf([2, 3, 4])

3.合并PDF

from PyPDF2 import PdfFileWriter, PdfFileReader

# Module-level writer and the two readers that merge_pdf() reads as globals.
output = PdfFileWriter()
# input1 supplies the base pages; input2 supplies the pages to insert.
# NOTE(review): both file handles opened here are never closed.
input1 = PdfFileReader(open("example.pdf", "rb"))
input2 = PdfFileReader(open("simple2.pdf", "rb")) 

def merge_pdf(add_index, origin_index):
    """Insert pages of ``input2`` in front of selected pages of ``input1``.

    Walks the pages of the module-level ``input1`` reader in order. Before
    each page whose 1-based number is in ``add_index``, the next page listed
    in ``origin_index`` is taken from the module-level ``input2`` reader and
    appended to the module-level ``output`` writer. Every ``input1`` page is
    then appended as well. The result is written to ``PyPDF2-output.pdf`` in
    the current working directory.

    :param add_index: iterable of 1-based page numbers of ``input1`` before
        which a page is inserted.
    :param origin_index: sequence of 0-based page indexes of ``input2``,
        consumed in order, one per insertion.
    :return: None
    :raises IndexError: if ``origin_index`` has fewer entries than there are
        insertion points.
    """
    # Build the lookup once so each membership test is O(1).
    insert_before = set(add_index)
    k = 0  # position of the next entry of origin_index to consume

    for i in range(input1.getNumPages()):
        if i + 1 in insert_before:
            output.addPage(input2.getPage(origin_index[k]))
            k += 1
        # Must stay inside the loop: every input1 page is copied, not only
        # the last one.
        output.addPage(input1.getPage(i))

    # The context manager flushes and closes the output file even if
    # writing fails.
    with open("PyPDF2-output.pdf", "wb") as output_stream:
        output.write(output_stream)

# Intended to insert page 0 of input2 before pages 2, 3 and 4 of input1.
merge_pdf([2,3,4], [0, 0, 0])
(注:以下步骤描述的是使用PdfFileMerger合并多个PDF的另一种思路,与上面基于PdfFileWriter的示例代码并不一一对应。)
  1. 导入PyPDF2合并模块PdfFileMerger;
  2. 读取需要处理和合并的PDF文档;
  3. 从第一个PDF文档中取出需要合并的前3页;
  4. 把第二个PDF文档的第一页插入到文档中;
  5. 把第三个PDF文档附到输出文档末尾;

除了上述介绍的2项主要功能,PyPDF2也有一些其他小功能:

4.旋转

# Rotate the page at index 1 (0-based, i.e. the second page) by 90 degrees clockwise.
input1.getPage(1).rotateClockwise(90)

使索引为1的页面(即第2页,页码索引从0开始)顺时针旋转90度。

5.添加与去除水印

from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.pdf import ContentStream
from PyPDF2.generic import TextStringObject, NameObject
from PyPDF2.utils import b_
import os


def remove_watermark(input_file, output_file):
    """Blank out text drawn with the ``Tj`` operator on every page of a PDF.

    The watermark is assumed to be plain text drawn with ``Tj``. Every
    ``Tj`` operand is replaced with an empty string, so any other text drawn
    the same way is blanked too.

    :param input_file: path of the PDF to read.
    :param output_file: path the processed PDF is written to.
    :return: None
    """
    with open(input_file, "rb") as f:
        # PdfFileReader's second positional parameter is ``strict``, not a
        # file mode, so the stray "rb" that used to be passed here is
        # dropped. It was truthy and ``strict`` defaults to True, so
        # behaviour is unchanged.
        source = PdfFileReader(f)
        output = PdfFileWriter()

        for page_number in range(source.getNumPages()):
            page = source.getPage(page_number)

            # getContents() returns None for a page without a content
            # stream; copy such pages through untouched.
            content_object = page.getContents()
            if content_object is None:
                output.addPage(page)
                continue

            # Parse the raw content into a list of (operands, operator).
            content = ContentStream(content_object, source)
            for operands, operator in content.operations:
                # "Tj" is the show-text operator; empty its string operand.
                if operator == b_("Tj"):
                    operands[0] = TextStringObject('')

            # Replace the page's content with the edited stream.
            page[NameObject('/Contents')] = content
            output.addPage(page)

        # Write while the input file is still open: the writer pulls page
        # data from the reader at write time.
        with open(output_file, "wb") as output_stream:
            output.write(output_stream)

def create_watermark(input_pdf, output, watermark):
    """Merge the first page of ``watermark`` onto every page of ``input_pdf``.

    :param input_pdf: source PDF, handed to ``PdfFileReader``.
    :param output: path the watermarked PDF is written to.
    :param watermark: PDF whose page 0 is merged onto each page,
        handed to ``PdfFileReader``.
    :return: None
    """
    stamp_reader = PdfFileReader(watermark)
    stamp = stamp_reader.getPage(0)

    reader = PdfFileReader(input_pdf)
    writer = PdfFileWriter()

    # map() is lazy, so each page is fetched, stamped and queued one at a
    # time, in document order.
    for current in map(reader.getPage, range(reader.getNumPages())):
        current.mergePage(stamp)
        writer.addPage(current)

    with open(output, 'wb') as target:
        writer.write(target)
 



# Switch to a hard-coded, machine-specific working directory
# (presumably where the sample PDFs live -- adjust for your machine).
os.chdir(r'F:\file\pyfile\1\Code\python-code\自动化\PDF')
# Disabled example call that adds a watermark; uncomment to run it.
# create_watermark(
#         input_pdf='数学建模技能图谱.pdf',
#           output='new_数学建模技能图谱.pdf',
#         watermark='水印.pdf')
# Active example: blank the Tj-drawn text of the input and write a new file.
remove_watermark('有水印.pdf', '无水印.pdf')

6.加密

from PyPDF2 import PdfFileWriter, PdfFileReader
import os

def encrypt_pdf(input_path="数学建模技能图谱.pdf",
                output_path="PyPDF2-output.pdf",
                password="secret"):
    """Copy a PDF page by page and write a password-protected version.

    The defaults reproduce the original hard-coded example, so calling
    ``encrypt_pdf()`` with no arguments behaves as before.

    :param input_path: path of the PDF to read.
    :param output_path: path the encrypted PDF is written to.
    :param password: password passed to ``PdfFileWriter.encrypt``.
    :return: None
    """
    # Both files are managed by ``with`` so the handles are closed even on
    # error; they were previously left open.
    with open(input_path, "rb") as source:
        reader = PdfFileReader(source)
        writer = PdfFileWriter()

        for i in range(reader.getNumPages()):
            writer.addPage(reader.getPage(i))

        # Encryption must be set up before the document is written.
        writer.encrypt(password)

        # Write while the input file is still open: the writer pulls page
        # data from the reader at write time.
        with open(output_path, "wb") as target:
            writer.write(target)


# Switch to a hard-coded, machine-specific working directory
# (presumably where the sample PDFs live -- adjust for your machine).
os.chdir(r'F:\file\pyfile\1\Code\python-code\自动化\PDF')
# Run the encryption example.
encrypt_pdf()

7.解密

from PyPDF2 import PdfFileWriter, PdfFileReader
import os

def decrypt_pdf(input_path="PyPDF2-output.pdf",
                output_path="new-PyPDF2-output.pdf",
                password="secret"):
    """Open a password-protected PDF and write an unprotected copy.

    The defaults reproduce the original hard-coded example, so calling
    ``decrypt_pdf()`` with no arguments behaves as before.

    :param input_path: path of the encrypted PDF to read.
    :param output_path: path the decrypted copy is written to.
    :param password: password passed to ``PdfFileReader.decrypt``.
    :return: None
    """
    # Both files are managed by ``with`` so the handles are closed even on
    # error; they were previously left open.
    with open(input_path, "rb") as source:
        reader = PdfFileReader(source)
        # Decrypt before touching any page.
        reader.decrypt(password)

        writer = PdfFileWriter()
        for i in range(reader.getNumPages()):
            writer.addPage(reader.getPage(i))

        # Write while the input file is still open: the writer pulls page
        # data from the reader at write time.
        with open(output_path, "wb") as target:
            writer.write(target)

# Switch to a hard-coded, machine-specific working directory
# (presumably where the sample PDFs live -- adjust for your machine).
os.chdir(r'F:\file\pyfile\1\Code\python-code\自动化\PDF')
# Run the decryption example.
decrypt_pdf()

猜你喜欢

转载自blog.csdn.net/weixin_44179485/article/details/108268750