神奇的Python脚本:PDF转Word、doc转docx、Word转HTML,各种格式都有

前言

对于PDF转换成Word文档,我想很多人都了解过:通常需要付费,而且很贵。但是如果你会Python,这些问题都不再是问题。

老规矩,需要打包好的软件关注小编,QQ群:721195303领取。

pdf文件转换为word文件

# Standard library.
import os
from concurrent.futures import ProcessPoolExecutor
from configparser import ConfigParser
from io import StringIO
from io import open

# Third-party: python-docx for writing .docx files, and a pdfminer
# distribution that still exposes the legacy ``process_pdf`` helper
# (e.g. pdfminer3k) for extracting text from PDFs.
from docx import Document
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import process_pdf
def read_from_pdf(file_path):
    """Extract the text of a PDF file.

    :param file_path: path of the PDF file to read.
    :return: the extracted text as a single string.
    """
    resource_manager = PDFResourceManager()
    lap_params = LAParams()

    # Both the PDF file and the in-memory text buffer are closed on exit,
    # even when parsing fails part-way through.
    with open(file_path, 'rb') as file, StringIO() as return_str:
        device = TextConverter(
            resource_manager, return_str, laparams=lap_params)
        try:
            process_pdf(resource_manager, device, file)
        finally:
            device.close()

        # Read the buffer after the converter is closed, before the
        # ``with`` block closes the buffer itself.
        return return_str.getvalue()


def save_text_to_word(content, file_path):
    """Write plain text to a .docx file, one paragraph per input line.

    :param content: text to write; it is split on ``'\\n'``.
    :param file_path: destination path of the .docx file.
    """
    doc = Document()
    for line in content.split('\n'):
        paragraph = doc.add_paragraph()
        # Control characters are stripped from each line before it is
        # added to the document.
        paragraph.add_run(remove_control_characters(line))
    # Save once, after every paragraph has been added.
    doc.save(file_path)


# Translation table mapping code points 0-31 to None (i.e. "delete").
# Built once at import time instead of on every call.
_CONTROL_CHAR_TABLE = dict.fromkeys(range(32))


def remove_control_characters(content):
    """Return *content* with all characters below U+0020 removed.

    This removes tab, carriage return and line feed as well as the other
    C0 control characters; the space character (U+0020) is kept.

    :param content: the string to clean.
    :return: a new string without control characters.
    """
    return content.translate(_CONTROL_CHAR_TABLE)


def pdf_to_word(pdf_file_path, word_file_path):
    """Convert one PDF file into a .docx file containing its text.

    :param pdf_file_path: path of the source PDF file.
    :param word_file_path: path of the .docx file to create.
    """
    content = read_from_pdf(pdf_file_path)
    save_text_to_word(content, word_file_path)

def main():
    """Convert every PDF in the configured folder to .docx in parallel.

    Reads ``config.cfg`` from the current directory and uses these keys of
    its ``default`` section:

    * ``pdf_folder``  - folder scanned for PDF files,
    * ``word_folder`` - folder the .docx files are written to,
    * ``max_worker``  - number of worker processes.
    """
    config_parser = ConfigParser()
    config_parser.read('config.cfg')
    config = config_parser['default']
    pdf_folder = config['pdf_folder']
    word_folder = config['word_folder']

    tasks = []
    with ProcessPoolExecutor(max_workers=int(config['max_worker'])) as executor:
        for file in os.listdir(pdf_folder):
            file_name, extension_name = os.path.splitext(file)
            # Compare case-insensitively so "X.PDF" is converted too.
            if extension_name.lower() != '.pdf':
                continue
            # os.path.join works whether or not the configured folders
            # end with a path separator.
            pdf_file = os.path.join(pdf_folder, file)
            word_file = os.path.join(word_folder, file_name + '.docx')
            print('正在处理:', file)
            tasks.append((file, executor.submit(pdf_to_word, pdf_file, word_file)))

    # Leaving the ``with`` block waits for all submitted tasks, so every
    # future is finished here and no polling loop is needed.  Report the
    # failures instead of dropping them silently.
    for file, task in tasks:
        error = task.exception()
        if error is not None:
            print('处理失败:', file, error)
    print('完成')
# The guard is required for ProcessPoolExecutor on platforms that spawn
# worker processes (e.g. Windows): workers re-import this module and must
# not run main() again.
if __name__ == '__main__':
    main()

Word文件转换为pdf文件

# -*- encoding: utf-8 -*-
import  os
from win32com import client
# pip install pywin32  (the win32com package is provided by pywin32)
def doc2pdf(doc_name, pdf_name):
    """Convert a Word document to PDF through Word's COM automation.

    Requires Microsoft Word on Windows. An existing file at *pdf_name*
    is deleted before the conversion.

    :param doc_name: path of the Word file to convert
    :param pdf_name: path of the PDF file to create
    :return: *pdf_name* on success, ``1`` on failure (the error is logged)
    """
    import logging

    log = logging.getLogger(__name__)
    word = None
    try:
        # DispatchEx starts a dedicated Word instance for this conversion.
        word = client.DispatchEx("Word.Application")
        if os.path.exists(pdf_name):
            os.remove(pdf_name)
        worddoc = word.Documents.Open(doc_name, ReadOnly=1)
        try:
            # FileFormat 17 is wdFormatPDF.
            worddoc.SaveAs(pdf_name, FileFormat=17)
        finally:
            worddoc.Close()
        return pdf_name
    except Exception:
        # Keep the historical "return 1 on failure" contract, but record
        # what went wrong instead of hiding it.
        log.exception("Failed to convert %s to %s", doc_name, pdf_name)
        return 1
    finally:
        # Always shut down the Word instance started above; otherwise a
        # WINWORD process is left running after every call.
        if word is not None:
            try:
                word.Quit()
            except Exception:
                log.exception("Failed to quit Word")
if "__main__" == __name__:
    # Demo run: convert a sample document to a PDF next to it.
    source_doc = "f:/test.doc"
    target_pdf = "f:/test.pdf"
    doc2pdf(source_doc, target_pdf)

doc转docx

from win32com import client
def doc2docx(doc_name, docx_name):
    """Convert a .doc file to .docx through Word's COM automation.

    Requires Microsoft Word on Windows. Failures are logged rather than
    raised, so the function stays best-effort and always returns ``None``.

    :param doc_name: path of the .doc file to convert
    :param docx_name: path of the .docx file to create
    """
    import logging

    log = logging.getLogger(__name__)
    word = None
    try:
        word = client.Dispatch("Word.Application")
        doc = word.Documents.Open(doc_name)
        try:
            # FileFormat 16 is wdFormatDocumentDefault, i.e. .docx.
            doc.SaveAs(docx_name, 16)
        finally:
            # Close the document even when saving fails.
            doc.Close()
    except Exception:
        log.exception("Failed to convert %s to %s", doc_name, docx_name)
    finally:
        # Quit Word on failure as well as on success, so no WINWORD
        # process is left behind.
        if word is not None:
            try:
                word.Quit()
            except Exception:
                log.exception("Failed to quit Word")
if __name__ == '__main__':
    # Demo run: convert a sample .doc file to .docx.
    doc2docx('f:/test.doc', 'f:/test.docx')

 

docx转html

#coding:utf-8
import docx
from docx2html import convert
import HTMLParser
def docx2html(docx_name, new_name):
    """Convert a .docx file to an HTML string.

    :param docx_name: path of a .docx file; it is opened with python-docx
        first, so an unreadable file fails early
    :param new_name: path of the .docx file handed to ``docx2html.convert``
    :return: the HTML with character references unescaped, or ``None``
        when the conversion fails (the error is logged)
    """
    import logging

    try:
        # Open the Word file; ``Document`` accepts a single path.
        docx.Document(docx_name)
        # NOTE(review): the conversion runs on new_name, not docx_name,
        # exactly as before -- confirm which file is meant to be converted.
        html = convert(new_name)
        # docx2html escapes non-ASCII (e.g. Chinese) text, so turn the
        # character references back into plain characters.
        # NOTE: ``HTMLParser`` is the Python 2 module name; on Python 3
        # the equivalent is ``html.unescape``.
        html_parser = HTMLParser.HTMLParser()
        return html_parser.unescape(html)
    except Exception:
        logging.getLogger(__name__).exception(
            "Failed to convert %s to HTML", new_name)
        return None
if "__main__" == __name__:
    # Demo run; the returned HTML string is discarded here.
    docx2html("f:/test.docx", "f:/test1.docx")

在这里还是要推荐下我自己建的Python学习群:721195303,群里都是学Python的,如果你想学或者正在学习Python ,欢迎你加入,大家都是软件开发党,不定期分享干货(只有Python软件开发相关的),包括我自己整理的一份2021最新的Python进阶资料和零基础教学,欢迎进阶中和对Python感兴趣的小伙伴加入!
 

猜你喜欢

转载自blog.csdn.net/aaahtml/article/details/112918438