Handy Python scripts for converting documents: PDF to Word, Word to PDF, doc to docx, and docx to HTML

Preface

Many people assume that converting a PDF to a Word document requires paid, and often very expensive, software. If you know Python, however, this is no longer a problem: a short script can do the job.

As usual, if you need the packaged software, follow the editor or join QQ group 721195303 to receive it.

Convert pdf file to word file

import os
from concurrent.futures import ProcessPoolExecutor
from configparser import ConfigParser
from io import StringIO
from io import open

from docx import Document
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import process_pdf

def read_from_pdf(file_path):
    """Extract the text of a PDF file.

    :param file_path: path of the PDF file to read
    :return: the text produced by pdfminer's TextConverter, as one string
    """
    # Both the file and the in-memory buffer are released on exit, even if
    # pdfminer raises while processing the document.
    with open(file_path, 'rb') as file, StringIO() as return_str:
        resource_manager = PDFResourceManager()
        lap_params = LAParams()
        device = TextConverter(
            resource_manager, return_str, laparams=lap_params)
        try:
            process_pdf(resource_manager, device, file)
        finally:
            device.close()
        # Read the buffer before the ``with`` block closes it.
        return return_str.getvalue()


def save_text_to_word(content, file_path):
    """Write text to a .docx file, one paragraph per input line.

    :param content: text to store; it is split on newline characters
    :param file_path: path of the Word document to create
    """
    doc = Document()
    for line in content.split('\n'):
        paragraph = doc.add_paragraph()
        # Control characters are stripped before the text is added to the run.
        paragraph.add_run(remove_control_characters(line))
    # Save once, after every paragraph has been added.
    doc.save(file_path)


def remove_control_characters(content):
    """Return *content* with every character below code point 32 removed.

    :param content: the string to clean
    :return: a copy of the string without code points 0-31 (this includes
        tab, carriage return and newline)
    """
    # A translate table that maps a code point to None deletes that character.
    mpa = dict.fromkeys(range(32))
    return content.translate(mpa)


def pdf_to_word(pdf_file_path, word_file_path):
    """Convert one PDF file to a Word document.

    :param pdf_file_path: path of the source PDF
    :param word_file_path: path of the .docx file to write
    """
    content = read_from_pdf(pdf_file_path)
    save_text_to_word(content, word_file_path)

def main():
    """Convert every .pdf in the configured folder to .docx in parallel.

    Settings come from the ``default`` section of ``config.cfg`` in the
    current directory: ``max_worker`` (number of worker processes),
    ``pdf_folder`` (input directory) and ``word_folder`` (output directory).
    The process exits with status 0 once all conversions have finished.
    """
    config_parser = ConfigParser()
    config_parser.read('config.cfg')
    config = config_parser['default']

    tasks = []
    with ProcessPoolExecutor(max_workers=int(config['max_worker'])) as executor:
        for file in os.listdir(config['pdf_folder']):
            file_name, extension_name = os.path.splitext(file)
            if extension_name != '.pdf':
                continue
            # os.path.join works whether or not the configured folder ends
            # with a path separator.
            pdf_file = os.path.join(config['pdf_folder'], file)
            word_file = os.path.join(config['word_folder'], file_name + '.docx')
            print('正在处理: ', file)
            result = executor.submit(pdf_to_word, pdf_file, word_file)
            tasks.append((file, result))

    # Leaving the ``with`` block waits for every submitted task, so no
    # polling loop is needed; report conversions that raised in a worker.
    for file, task in tasks:
        error = task.exception()
        if error is not None:
            print('处理失败: ', file, error)

    print('完成')
    raise SystemExit(0)


if __name__ == '__main__':
    main()

 

Convert Word file to pdf file

# -*- encoding: utf-8 -*-
import  os
from win32com import client
# pip install pywin32  (provides the win32com package)
def doc2pdf(doc_name, pdf_name):
    """
    Convert a Word file to a PDF file through Word COM automation.

    :param doc_name: path of the Word file to open
    :param pdf_name: path of the PDF file to write; an existing file at
        this path is removed first
    :return: pdf_name on success, 1 if any step fails
    """
    word = None
    try:
        # DispatchEx starts a dedicated Word instance for this conversion.
        word = client.DispatchEx("Word.Application")
        if os.path.exists(pdf_name):
            os.remove(pdf_name)
        worddoc = word.Documents.Open(doc_name, ReadOnly=1)
        # FileFormat 17 selects PDF output (Word's wdFormatPDF).
        worddoc.SaveAs(pdf_name, FileFormat=17)
        worddoc.Close()
        return pdf_name
    except Exception:
        # Callers rely on the sentinel value 1 to detect a failed conversion.
        return 1
    finally:
        # Always shut down the Word instance so no WINWORD process is leaked.
        if word is not None:
            try:
                word.Quit()
            except Exception:
                pass  # best-effort cleanup; the result is already decided
if __name__ == "__main__":
    # Example run: convert a sample Word file to a PDF next to it.
    doc2pdf("f:/test.doc", "f:/test.pdf")

 

doc to docx

from win32com import client
def doc2docx(doc_name, docx_name):
    """
    Convert a .doc file to a .docx file through Word COM automation.

    Failures are reported on standard output instead of being raised, so
    the call stays best-effort.

    :param doc_name: path of the .doc file to open
    :param docx_name: path of the .docx file to write
    """
    word = None
    try:
        word = client.Dispatch("Word.Application")
        doc = word.Documents.Open(doc_name)
        # File format 16 tells Word to save the document as docx.
        doc.SaveAs(docx_name, 16)
        doc.Close()
    except Exception as error:
        print('doc2docx failed: %s' % error)
    finally:
        # Quit Word on the error path too, not only after a successful save.
        if word is not None:
            try:
                word.Quit()
            except Exception:
                pass  # best-effort cleanup
if __name__ == '__main__':
    # Example run: convert a sample .doc file to .docx.
    doc2docx('f:/test.doc', 'f:/test.docx')

 

docx to html

#coding:utf-8
import docx
from docx2html import convert
import HTMLParser
def docx2html(docx_name, new_name):
    """
    Convert a docx file to an HTML string.

    :param docx_name: path of the source docx file
    :param new_name: path the document is re-saved to; this copy is the
        file handed to the docx2html converter
    :return: the unescaped HTML string, or None if the conversion fails
    """
    try:
        # Document() takes a single path; passing two arguments raised a
        # TypeError that the old bare ``except`` silently swallowed.
        doc = docx.Document(docx_name)
        # NOTE(review): the original converted new_name without ever writing
        # it; saving the loaded document there is the presumed intent -- confirm.
        doc.save(new_name)
        # Use the docx2html module to turn the docx file into an HTML string.
        html = convert(new_name)
        # docx2html escapes Chinese characters, so unescape the result.
        html_parser = HTMLParser.HTMLParser()
        return html_parser.unescape(html)
    except Exception as error:
        print('docx2html failed: %s' % error)
        return None
if __name__ == "__main__":
    # Example run: convert a sample docx file, using test1.docx as the second path.
    docx2html("f:/test.docx", "f:/test1.docx")

I would also like to recommend the Python learning group I set up myself: 721195303. Everyone in it is learning Python, so if you want to learn or are already learning Python, you are welcome to join. The members are all software developers who share useful material from time to time (related to Python software development only), including a copy of the latest advanced Python materials and a beginner course that I compiled myself in 2021. Friends who are advancing in Python or simply interested in it are welcome to join!
 

Guess you like

Origin blog.csdn.net/aaahtml/article/details/112918438