自动化将 word 转为 pdf,再将pdf转为图片!

参考:

https://blog.csdn.net/ynyn2013/article/details/49120731

https://www.jianshu.com/p/f57cc64b9f5e

一、将 doc 转为 pdf

1、install 依赖

pip install pywin32

2、直接调用win32com接口打开文件,另存为pdf。SaveAs中的参数17代表存为pdf格式(即下表中的 wdFormatPDF),完了关闭文件,关闭word。

 1     def doc2pdf(self):
 2         try:
 3             w = Dispatch("Word.Application")
 4             doc = w.Documents.Open(self.docPath, ReadOnly=1)
 5             doc.SaveAs(self.pdfPath, 17)
 6         except:
 7             traceback.print_exc()
 8         finally:
 9             doc.Close()
10             w.Quit()
11         self.checkFile(self.pdfPath, 'pdf')

以下为文件格式对应表

wdFormatDocument = 0
wdFormatDocument97 = 0
wdFormatDocumentDefault = 16
wdFormatDOSText = 4
wdFormatDOSTextLineBreaks = 5
wdFormatEncodedText = 7
wdFormatFilteredHTML = 10
wdFormatFlatXML = 19
wdFormatFlatXMLMacroEnabled = 20
wdFormatFlatXMLTemplate = 21
wdFormatFlatXMLTemplateMacroEnabled = 22
wdFormatHTML = 8
wdFormatPDF = 17
wdFormatRTF = 6
wdFormatTemplate = 1
wdFormatTemplate97 = 1
wdFormatText = 2
wdFormatTextLineBreaks = 3
wdFormatUnicodeText = 7
wdFormatWebArchive = 9
wdFormatXML = 11
wdFormatXMLDocument = 12
wdFormatXMLDocumentMacroEnabled = 13
wdFormatXMLTemplate = 14
wdFormatXMLTemplateMacroEnabled = 15
wdFormatXPS = 18

  

二、将pdf转为图片

1、install 依赖

1.1、pip install pdf2image

1.2、Windows安装配置poppler
Windows用户必须为Windows安装 poppler ( http://blog.alivate.com.au/poppler-windows/ ),然后将bin/文件夹添加到PATH (开始>输入env>编辑系统环境变量>环境变量...>系统变量>Path)
安装完poppler需重启系统后生效。
 
2、将pdf转为图片
 1     def pdf2image(self):
 2         # 建立图片文件夹
 3         self.imgFold = os.path.join(self.fileFold, self.fileName)
 4         if not os.path.exists(self.imgFold):
 5             os.mkdir(self.imgFold)
 6 
 7         # 转存图片
 8         pages = convert_from_path(self.pdfPath)
 9         for i, page in enumerate(pages):
10             imgPath = os.path.join(self.imgFold, str(i)+'.jpg')
11             page.save(imgPath, 'JPEG')
12         self.checkFile(imgPath, 'last img')

三、直接将word转为图片

方法:结合1,2

代码如下:

 1 import os
 2 import traceback
 3 from win32com.client import Dispatch
 4 from pdf2image import convert_from_path
 5 
 6 class Word2Pdf2Img():
 7     def __init__(self, docPath):
 8         # 初始化路径
 9         self.docPath = docPath
10         self.fileName = os.path.basename(self.docPath).split('.')[0]
11         self.fileFold = os.path.dirname(self.docPath)
12         self.pdfPath = os.path.join(self.fileFold, self.fileName + '.pdf')
13 
14     @staticmethod
15     def checkFile(filePath, fileType=''):
16         if os.path.isfile(filePath):
17             print ('file {} existed!'.format(fileType))
18         else:
19             print ('file {} not existed!'.format(fileType))
20 
21     def doc2pdf(self):
22         try:
23             w = Dispatch("Word.Application")
24             doc = w.Documents.Open(self.docPath, ReadOnly=1)
25             doc.SaveAs(self.pdfPath, 17)
26         except:
27             traceback.print_exc()
28         finally:
29             doc.Close()
30             w.Quit()
31         self.checkFile(self.pdfPath, 'pdf')
32 
33     def pdf2image(self):
34         # 建立图片文件夹
35         self.imgFold = os.path.join(self.fileFold, self.fileName)
36         if not os.path.exists(self.imgFold):
37             os.mkdir(self.imgFold)
38 
39         # 转存图片
40         pages = convert_from_path(self.pdfPath)
41         for i, page in enumerate(pages):
42             imgPath = os.path.join(self.imgFold, str(i)+'.jpg')
43             page.save(imgPath, 'JPEG')
44         self.checkFile(imgPath, 'last img')
45         
46     def doc2image(self):
47         self.doc2pdf()
48         self.pdf2image()

猜你喜欢

转载自www.cnblogs.com/Fosen/p/11835737.html