Processing Word documents with Python while retaining the original style

The document's file format, lines, pictures, headers and footers all stay the same.

# -*- coding: utf-8 -*-
# @time: 2019/5/6 11:46
# @author:
"""
# Replace the content of an article with python-docx

# Install python-docx with pip (pip install python-docx)
# The format, lines, pictures, headers and footers all stay the same
# python-docx has a problem dealing with hyperlinks; see the linked issue for how to modify the library source
https://github.com/python-openxml/python-docx/issues/85

# The specific changes are as follows, in the file
site-packages/docx/oxml/__init__.py

# Code that needs to be added
def remove_hyperlink_tags(xml):
    import re
    text = xml.decode('utf-8')
    text = text.replace("</w:hyperlink>", "")
    text = re.sub('<w:hyperlink[^>]*>', "", text)
    return text.encode('utf-8')

# Source code that needs to be modified
def parse_xml(xml):
    root_element = etree.fromstring(remove_hyperlink_tags(xml), oxml_parser)
    return root_element
"""

import os

from docx import Document
from win32com import client

# Self-written package that translates text sentence by sentence
import doc_scan


def pre_document(filename):
    """
    Convert a legacy ``.doc`` file to ``.docx`` next to the original.

    python-docx can only read .docx files, so a .doc file is opened through
    Word's COM automation interface, saved under the same name with a
    ``.docx`` extension, and the original .doc is then deleted. A file with
    any other extension is left untouched.

    :param filename: absolute path of the file
    :return: None
    """
    root, ext = os.path.splitext(filename)
    if ext != '.doc':
        return

    word = client.Dispatch('Word.Application')
    try:
        doc = word.Documents.Open(filename)  # path of the source file
        try:
            # 16 is Word's wdFormatDocumentDefault save format (.docx)
            doc.SaveAs(root + '.docx', 16)  # path of the converted file
        finally:
            doc.Close()
    finally:
        # always shut Word down, even if opening or saving failed
        word.Quit()

    # delete the source file; only reached when the conversion succeeded
    os.remove(filename)


def read_document():
    """
    Translate every document in the ``doc`` folder beside this script.

    Each file is first passed through ``pre_document`` (which turns a .doc
    into a .docx). The text of every non-empty paragraph is then handed to
    ``doc_scan.Scan`` (the sentence-by-sentence translation helper imported
    by this file) and the result replaces the original text. The new text is
    written into the paragraph's first run and all other runs are emptied, so
    the runs themselves are kept rather than recreated. The document is saved
    over the file it was loaded from.

    :return: None
    """
    # traverse all files in the doc folder
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'doc')
    for name in os.listdir(path):
        file_path = os.path.join(path, name)

        # pre-process the source file (.doc -> .docx)
        pre_document(file_path)
        root, ext = os.path.splitext(file_path)
        if ext == '.doc':
            # the .doc was converted and deleted; continue with the .docx
            file_path = root + '.docx'

        document = Document(file_path)
        for paragraph in document.paragraphs:
            # get the text of each paragraph
            old_text = paragraph.text.strip()
            if not old_text:
                continue
            runs = paragraph.runs
            if not runs:
                continue

            # empty the original content of every run
            for run in runs:
                run.text = ''

            # put the translated text into the first run
            runs[0].text = doc_scan.Scan(old_text)

        # save the file, overwriting the loaded document
        document.save(file_path)


# Download the pictures in the document to the local disk
# document = Document(file)
# for shape in document.inline_shapes:
#     contentID = shape._inline.graphic.graphicData.pic.blipFill.blip.embed
#     contentType = document.part.related_parts[contentID].content_type
#     if not contentType.startswith('image'):
#         continue
#     imgName = basename(document.part.related_parts[contentID].partname)
#     imgData = document.part.related_parts[contentID]._blob
#     with open(imgName,'wb') as fp:
#         fp.write(imgData)

# Run the translation over the doc folder only when executed as a script.
if __name__ == '__main__':
    read_document()

----------------
Disclaimer: This article is an original article by CSDN blogger "Cocktail_py" and follows the CC 4.0 BY-SA copyright agreement. When reproducing it, please attach the original source link and this statement.
Original link: https://blog.csdn.net/Cocktail_py/article/details/101149901

Guess you like

Origin www.cnblogs.com/ein-key5205/p/12305994.html