使用Python将PDF批量转为word或者html,亲测好用

主要利用了Adobe Acrobat DC软件的能力,python环境的配置略过…
第一步:
安装pywin32(win32com模块包含在pywin32包中,PyPI上没有名为win32com的包)

pip install pywin32

第二步:
下载Adobe Acrobat DC,软件本身是收费的,但是有万能的度娘,附个链接吧
提取码:8888

第三步:
执行下列代码(示例将PDF转为html;如需转为word,把SaveAs的格式参数改为"com.adobe.acrobat.docx",并将输出文件后缀改为.docx)

from win32com.client.dynamic import Dispatch, ERRORS_BAD_CONTEXT

import os
import winerror
from time import sleep

ERRORS_BAD_CONTEXT.append(winerror.E_NOTIMPL)


def pdf2word(f_path, d_path):
    """Convert one PDF to HTML by driving Adobe Acrobat through COM.

    Despite its name, this function saves with the
    ``com.adobe.acrobat.html`` conversion ID, i.e. it writes HTML.

    Args:
        f_path: Path of the source PDF passed to ``AVDoc.Open``.
        d_path: Destination path passed to the JavaScript ``SaveAs`` call.

    Conversion errors are caught and printed rather than raised, so a
    failing file does not stop a batch run.
    """
    # Bind both handles before the try block so the cleanup code below can
    # tell which objects were actually created; otherwise a failure in
    # Dispatch/Open would raise UnboundLocalError from ``finally`` and hide
    # the real error.
    AvDoc = None
    pdDoc = None
    try:
        AvDoc = Dispatch("AcroExch.AVDoc")
        AvDoc.Open(f_path, "")
        pdDoc = AvDoc.GetPDDoc()
        jsObject = pdDoc.GetJSObject()
        jsObject.SaveAs(d_path, "com.adobe.acrobat.html")
        print('ok')
    except Exception as e:
        print('error')
        print(e)
    finally:
        # Nested try/finally: the AVDoc must still be closed even when
        # closing the PDDoc itself fails.
        try:
            if pdDoc is not None:
                pdDoc.Close()
        finally:
            if AvDoc is not None:
                AvDoc.Close(True)


# Batch driver: walk the source tree and convert every PDF that has not been
# converted yet, writing the results into a single output directory.
output_dir = 'D:\\AAA_DWP\\files\\output'
# The destination directory has to exist before anything is saved into it.
os.makedirs(output_dir, exist_ok=True)

paths = os.walk(r'D:\AAA_DWP\files')
for path, dir_lst, file_lst in paths:
    for file_name in file_lst:
        # Skip "~$"-prefixed temporary/lock files.
        if file_name.startswith("~$"):
            continue
        if not file_name.endswith(".pdf"):
            continue
        print('>>>>>>>>>>>>>>>>>>>> start:', file_name)
        full_name = os.path.join(path, file_name)
        # Swap only the extension; str.replace('.pdf', ...) would also
        # rewrite any other '.pdf' occurrence in the name or directory path.
        html_name = os.path.splitext(file_name)[0] + '.html'
        sibling_html = os.path.join(path, html_name)
        d_path = os.path.join(output_dir, html_name)
        # Skip files that already have a result. The sibling check keeps the
        # earlier behavior; the d_path check is what actually detects output
        # produced by a previous run of this script.
        if os.path.exists(sibling_html) or os.path.exists(d_path):
            continue
        pdf2word(full_name, d_path)
        print('>>>>>>>>>>>>>>>>>>>> finish')
        # Pause between files before starting the next conversion.
        sleep(10)

猜你喜欢

转载自blog.csdn.net/qq_27574367/article/details/131435043