Table of contents
1. Goal 1: Decoding + stripping tags
2. Goal 2: Extract the content in the tag
3. Goal 3: Insert the processed data into the original location
4. Goal 4: Insert the specified content into the specified location
5. Goal 5: Format contextual fonts
6. Goal 6: Insert different strings into multiple different positions
7. Goal 7: Insert different strings into multiple different positions (one call per insertion)
8. Goal 8: Write images and text after the specified string
1. Goal 1: Decoding + stripping tags
Functions used: html.unescape() to decode HTML entities, then str.replace() to remove the tags
import html
# Sample payload; the \u003c and \u003e escapes are the "<" and ">" characters.
data = '\u003cp\u003e(此处忽略一万个字)'

# Decode HTML entities first, then remove the paragraph markup one step at a time.
unescaped = html.unescape(data)
with_line_breaks = unescaped.replace('<p><br></p>', '\n')  # empty paragraph -> newline
without_open_tags = with_line_breaks.replace('<p>', '')
decoded_data = without_open_tags.replace('</p>', '')

# Print the cleaned-up text.
print(decoded_data)
2. Goal 2: Extract the content in the tag
Idea: this is just regular-expression matching.
Each img tag is replaced by its URL followed by a line break, so only the URL remains.
Code:
import re
# Matches an <img> tag whose only attribute is src; group 1 captures the URL.
IMG_TAG_RE = re.compile(r'<img\s+src="([^"]+)"\s*>')


def replace_img_tags_with_urls(markup):
    """Replace each ``<img src="...">`` tag in *markup* with its URL and a newline.

    The substitution uses a callable so the URL is inserted verbatim. Passing
    the URL to ``re.sub`` as a replacement template would make backslashes and
    group references inside the URL be interpreted, which raises ``re.error``
    or corrupts the output.

    Args:
        markup: Text that may contain ``<img src="...">`` tags.

    Returns:
        The text with every matching tag replaced by the URL plus a newline.
    """
    return IMG_TAG_RE.sub(lambda match: match.group(1) + '\n', markup)


text = '<img src="URL">…………(此处省略一万字)'
# Collect every image URL, in the order the tags appear.
urls = IMG_TAG_RE.findall(text)
# Swap all <img> tags for their URLs in a single pass over the text.
text = replace_img_tags_with_urls(text)
print(text)
3. Goal 3: Insert the processed data into the original location
The following code downloads each image URL found in the text and inserts the image into the document at the position where its URL appeared
import requests
from docx import Document
from docx.shared import Inches
# 创建一个新的Word文档
# Create a new, empty Word document.
doc = Document()
text = '''
图片:
https://xxxxx.png
'''
# Split on newlines so each image lands where its URL appeared in the text.
lines = text.split('\n')
for line in lines:
    if line.startswith('https://'):
        # Download the image. The timeout keeps a stalled server from hanging
        # the script, and raise_for_status() stops an HTTP error page from
        # being saved and embedded as if it were the image.
        response = requests.get(line, timeout=30)
        response.raise_for_status()
        # Save the image locally under the last path segment of the URL.
        image_path = line.split('/')[-1]
        with open(image_path, 'wb') as f:
            f.write(response.content)
        # Embed the downloaded image; adjust the width as needed.
        doc.add_picture(image_path, width=Inches(4))
    else:
        # Every other line becomes a text paragraph.
        doc.add_paragraph(line)
# Save the Word document.
doc.save("output.docx")
4. Goal 4: Insert the specified content into the specified location
Use Python to open a Word document and insert content after the paragraph whose text matches the specified string
from docx import Document
# 打开Word文档
# Open the Word document.
doc = Document('example.docx')
# Keep the paragraph objects and their texts side by side.
body_paragraphs = doc.paragraphs
paragraphs = [p.text for p in body_paragraphs]
# Text of the paragraph after which the new content goes.
target_string = '指定字符串'
# +1 because the content is inserted after the target paragraph.
# index() raises ValueError when no paragraph's text equals target_string.
insert_index = paragraphs.index(target_string) + 1
# Content to insert.
new_content = '要插入的内容'
if insert_index < len(body_paragraphs):
    # Insert in front of the paragraph that follows the target.
    body_paragraphs[insert_index].insert_paragraph_before(new_content)
else:
    # The target is the last paragraph, so there is nothing to insert before;
    # indexing past the end would raise IndexError. Append instead.
    doc.add_paragraph(new_content)
# Save the modified document under a new name.
doc.save('example_modified.docx')
5. Goal 5: Format contextual fonts
The font size of the written text is the same as the previous line
from docx import Document
from docx.shared import Pt
# 打开Word文档
# Open the Word document.
doc = Document('example.docx')
# Font size of the last run of the last paragraph. Stays None when the
# document has no paragraphs or the last paragraph has no runs, instead of
# raising IndexError on the [-1] lookups.
previous_font_size = None
if doc.paragraphs:
    previous_paragraph = doc.paragraphs[-1]
    if previous_paragraph.runs:
        previous_run = previous_paragraph.runs[-1]
        previous_font_size = previous_run.font.size
# Text to write.
new_text = '新的文本'
# Write the text into a new paragraph at the end of the document.
new_paragraph = doc.add_paragraph()
new_run = new_paragraph.add_run(new_text)
# Give the new run the same size as the previous line.
# NOTE(review): per the python-docx docs a size of None means "inherit from
# the style hierarchy" - confirm this is the desired fallback.
new_font = new_run.font
new_font.size = previous_font_size
# Save the modified document under a new name.
doc.save('example_modified.docx')
Insert text with the same font size as the previous line
from docx import Document
from docx.shared import Pt
def word_info_w():
    """Insert '测试title' after the '附件:' paragraph of test.docx.

    The inserted text gets the font size of the last run of the '附件:'
    paragraph, and the document is saved back to test.docx.

    Raises:
        ValueError: If no paragraph's text is exactly '附件:'.
    """
    # Open the Word document.
    doc = Document('test.docx')
    # Keep the paragraph objects and their texts side by side.
    body_paragraphs = doc.paragraphs
    paragraphs = [p.text for p in body_paragraphs]
    # Text of the paragraph after which the new content goes.
    target_string = '附件:'
    # +1 because the content is inserted after the target paragraph.
    insert_index = paragraphs.index(target_string) + 1
    # Font size of the target paragraph's last run (None if it has no runs).
    previous_paragraph = body_paragraphs[insert_index - 1]
    previous_font_size = (
        previous_paragraph.runs[-1].font.size if previous_paragraph.runs else None
    )
    # Content to insert.
    new_content = '测试title'
    if insert_index < len(body_paragraphs):
        # Insert in front of the paragraph that follows the target.
        new_paragraph = body_paragraphs[insert_index].insert_paragraph_before(new_content)
    else:
        # The target is the last paragraph: indexing past the end would raise
        # IndexError, so append the new paragraph instead.
        new_paragraph = doc.add_paragraph(new_content)
    # Match the font size of the line above.
    new_paragraph.runs[0].font.size = previous_font_size
    # Save the modified document in place.
    doc.save('test.docx')


if __name__ == '__main__':
    word_info_w()
6. Goal 6: Insert different strings into multiple different positions
Insert different strings into multiple different positions
(may be inserted in the same position)
from docx import Document
def insert_content(doc, insert_dict):
    """Insert each value of *insert_dict* after the paragraph matching its key.

    For every ``target -> content`` pair, the first paragraph whose text equals
    ``target`` is located and ``content`` is inserted as a new paragraph right
    after it, using the font size of the target paragraph's last run. Targets
    that match no paragraph are skipped. The document is then saved to
    'test.docx'.

    Args:
        doc: Document object exposing ``paragraphs``, ``add_paragraph`` and
            ``save`` (a python-docx ``Document`` in the calling script).
        insert_dict: Mapping of target paragraph text to the text to insert.
    """
    for target_string, new_content in insert_dict.items():
        # Re-read the paragraphs on every pass: each insertion shifts the
        # positions of the paragraphs behind it, so a snapshot taken before
        # the loop would point at the wrong paragraphs.
        body_paragraphs = doc.paragraphs
        position = next(
            (i for i, p in enumerate(body_paragraphs) if p.text == target_string),
            None,
        )
        if position is None:
            continue
        anchor = body_paragraphs[position]
        if position + 1 < len(body_paragraphs):
            # Insert in front of the paragraph that follows the target.
            new_paragraph = body_paragraphs[position + 1].insert_paragraph_before(new_content)
        else:
            # The target is the last paragraph: append instead of indexing
            # past the end of the list.
            new_paragraph = doc.add_paragraph(new_content)
        # Copy the font size of the line above; skipped when either paragraph
        # has no runs (e.g. empty inserted text).
        if anchor.runs and new_paragraph.runs:
            new_paragraph.runs[0].font.size = anchor.runs[-1].font.size
    # Save the modified Word document.
    doc.save('test.docx')
if __name__ == '__main__':
    # Load the Word document that will be edited.
    doc = Document('test.docx')
    # Anchor paragraph text -> text to insert right after it.
    insert_dict = dict([
        ('附件:', '测试title1'),
        ('目录:', '测试title2'),
    ])
    # Apply every insertion and save.
    insert_content(doc, insert_dict)
7. Goal 7: Insert different strings into multiple different positions (one call per insertion)
from docx import Document
def insert_content(doc, target_string, new_content):
    """Insert *new_content* as a new paragraph after the matching paragraph.

    The first paragraph whose text equals *target_string* is used as the
    anchor. If no paragraph matches, nothing is inserted. The document is then
    saved to 'test.docx'.

    Args:
        doc: Document object exposing ``paragraphs``, ``add_paragraph`` and
            ``save`` (a python-docx ``Document`` in the calling script).
        target_string: Exact text of the paragraph to insert after.
        new_content: Text of the paragraph to insert.
    """
    body_paragraphs = doc.paragraphs
    for position, paragraph in enumerate(body_paragraphs):
        if paragraph.text != target_string:
            continue
        if position + 1 < len(body_paragraphs):
            # Insert in front of the paragraph that follows the target.
            body_paragraphs[position + 1].insert_paragraph_before(new_content)
        else:
            # The target is the last paragraph: append the content rather
            # than silently dropping it.
            doc.add_paragraph(new_content)
        break
    # Save the modified Word document.
    doc.save('test.docx')
if __name__ == '__main__':
    # Load the Word document that will be edited.
    doc = Document('test.docx')
    # Anchor paragraph text -> text to insert right after it.
    insert_dict = {
        '指定字符1位置': '插入内容1',
        '指定字符2位置': '插入内容2',
        '指定字符3位置': '插入内容3',
    }
    # One insert_content call per (anchor, content) pair.
    for pair in insert_dict.items():
        insert_content(doc, *pair)
Variant: specify the font size of the inserted text
from docx import Document
from docx.shared import Pt
def insert_content(doc, target_string, new_content, font_size=None):
    """Insert *new_content* after the matching paragraph with a set font size.

    The first paragraph whose text equals *target_string* is used as the
    anchor. If no paragraph matches, nothing is inserted. The document is then
    saved to 'test.docx'.

    Args:
        doc: Document object exposing ``paragraphs``, ``add_paragraph`` and
            ``save`` (a python-docx ``Document`` in the calling script).
        target_string: Exact text of the paragraph to insert after.
        new_content: Text of the paragraph to insert.
        font_size: Size applied to the inserted text. Defaults to ``Pt(12)``
            when omitted.
    """
    body_paragraphs = doc.paragraphs
    for position, paragraph in enumerate(body_paragraphs):
        if paragraph.text != target_string:
            continue
        if position + 1 < len(body_paragraphs):
            # Insert in front of the paragraph that follows the target.
            new_paragraph = body_paragraphs[position + 1].insert_paragraph_before(new_content)
        else:
            # The target is the last paragraph: append the content rather
            # than silently dropping it.
            new_paragraph = doc.add_paragraph(new_content)
        # Only style the text when there is a run to style (e.g. not for
        # empty inserted text).
        if new_paragraph.runs:
            # NOTE(review): the default is 12 pt. The earlier comment called
            # this "size 3", which in Chinese font sizing is normally 16 pt -
            # confirm which size is intended.
            new_paragraph.runs[0].font.size = Pt(12) if font_size is None else font_size
        break
    # Save the modified Word document.
    doc.save('test.docx')
if __name__ == '__main__':
    # Load the Word document that will be edited.
    doc = Document('test.docx')
    # Anchor paragraph text -> text to insert right after it.
    insert_dict = {
        '指定字符1位置': '插入内容1',
        '指定字符2位置': '插入内容2',
        '指定字符3位置': '插入内容3',
    }
    # Insert the entries one at a time, in dictionary order.
    for target_string in insert_dict:
        insert_content(doc, target_string, insert_dict[target_string])
8. Goal 8: Write images and text after the specified string
from docx import Document
from docx.shared import Pt
from docx.shared import Inches
import requests
def word_img_text_w(word, target_string):
    """Write the lines of *word* into test.docx after the target paragraph.

    Each line of *word* goes into one paragraph, starting with the paragraph
    that follows the first paragraph whose text equals *target_string*. Lines
    starting with 'https://' are downloaded and added as 4-inch-wide pictures;
    every other line is added as 16 pt text. Content is appended to the
    paragraphs already present after the target; once those run out, new
    paragraphs are appended to the document. If no paragraph matches
    *target_string*, nothing is written. The document is saved back to
    test.docx in every case.

    Args:
        word: Newline-separated text and image URLs to write.
        target_string: Exact text of the paragraph to write after.

    Raises:
        requests.HTTPError: If an image download returns an error status.
    """
    # Open the Word document.
    doc = Document('test.docx')
    # Texts of all paragraphs, used to locate the target.
    paragraphs = [p.text for p in doc.paragraphs]
    if target_string in paragraphs:
        # +1 because writing starts after the target paragraph.
        insert_index = paragraphs.index(target_string) + 1
        for line in word.split('\n'):
            body_paragraphs = doc.paragraphs
            if insert_index < len(body_paragraphs):
                paragraph = body_paragraphs[insert_index]
            else:
                # No paragraph left after the target: append one instead of
                # raising IndexError.
                paragraph = doc.add_paragraph()
            if line.startswith('https://'):
                # Download the image. The timeout keeps a stalled server from
                # hanging the call, and raise_for_status() stops an HTTP error
                # page from being saved and embedded as the image.
                response = requests.get(line, timeout=30)
                response.raise_for_status()
                # Save the image locally under the last URL path segment.
                image_path = line.split('/')[-1]
                with open(image_path, 'wb') as f:
                    f.write(response.content)
                # Add the picture; adjust the width as needed.
                paragraph.add_run().add_picture(image_path, width=Inches(4))
            else:
                # Add the text at 16 pt.
                run = paragraph.add_run(line)
                run.font.size = Pt(16)
            insert_index += 1
    # Save the Word document.
    doc.save("test.docx")
if __name__ == '__main__':
    # Content to write: one line of text followed by two image URLs.
    content = '''测试
https://xx.png
https://xxxx.png'''
    # Text of the paragraph after which the content is written.
    target_string = '指定目标字符1'
    # Write the content into the Word document.
    word_img_text_w(word=content, target_string=target_string)