使用Python将PDF转换为WORD

1、安装pdfminer

PDFMiner是一个从PDF文档中提取信息的工具。pdfminer3k是pdfminer的Python 3移植版。

pip install pdfminer3k

2、读取PDF文件内容

from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter,process_pdf
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
from docx import Document
document = Document()
import warnings
warnings.filterwarnings("ignore")
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from urllib.request import urlopen
import pandas as pd

def readPDF(pdfFile):
    """Extract the text of a PDF and return it as a single string.

    Args:
        pdfFile: An open binary file-like object holding the PDF data. It is
            passed to pdfminer's ``process_pdf`` as-is and is not closed here.

    Returns:
        The text that ``TextConverter`` wrote into the in-memory buffer.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # The buffer is a context manager, so it is released even when pdfminer
    # raises while parsing a malformed PDF.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        try:
            process_pdf(rsrcmgr, device, pdfFile)
        finally:
            # Release the converter on the error path too, not only on success.
            device.close()
        # Read the buffer before the ``with`` block closes it.
        return retstr.getvalue()
def save_to_file(file_name, contents, encoding='utf-8'):
    """Write ``contents`` to ``file_name``, replacing any existing file.

    Args:
        file_name: Path of the file to create or truncate.
        contents: Text to write.
        encoding: Text encoding of the output file. Defaults to UTF-8 so that
            text extracted from a PDF can always be encoded, regardless of
            the platform's default code page.
    """
    # ``with`` closes the handle even if the write fails part-way.
    with open(file_name, 'w', encoding=encoding) as fh:
        fh.write(contents)

# Usage example: executing this line writes the literal text below to
# 'mobiles.txt' (a relative path, so it lands in the working directory).
save_to_file('mobiles.txt', 'your contents str')


def main():
    """Download the sample PDF, extract its text and save it to ``c.csv``."""
    # The HTTP response is a context manager; leaving the block closes the
    # connection even if text extraction raises.
    with urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf") as pdfFile:
        outputString = readPDF(pdfFile)
    # NOTE: the output is plain extracted text despite the .csv extension.
    save_to_file('c.csv', outputString)


if __name__ == '__main__':
    main()

3、安装Python DocX

Python DocX目前是Python OpenXML的一部分，你可以用它打开Word 2007及以后版本的文档，而用它保存的文档可以在Microsoft Office 2007/2010、Microsoft Mac Office 2008、Google Docs、OpenOffice.org 3和Apple iWork 08中打开。

pip install python_docx

安装时经常报错，例如：
ERROR: Exception:
Traceback (most recent call last):
File "c:\users\l\appdata\local\programs\python\python37\lib\site-packages\pip\_vendor\resolvelib\resolvers.py", line 171, in _merge_into_criterion
crit = self.state.criteria[name]
KeyError: 'python-docx'
During handling of the above exception, another exception occurred:

解决办法：

直接下载python-docx安装包

https://pypi.org/project/python-docx/#files

pip install ./downloads/python-docx-0.8.10.tar.gz

其中 ./downloads/python-docx-0.8.10.tar.gz表示的是下载的python-docx-0.8.10.tar.gz文件的真实路径。

例如，下载的python-docx-0.8.10.tar.gz文件位于C盘根目录下，则应使用命令：

pip install C:\python-docx-0.8.10.tar.gz

python-docx-0.8.10 需要 lxml>=2.3.2，因此如果已安装的 lxml 版本过低，还需要先升级 lxml。

4、使用DocX保存Word

from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
from docx import Document
document = Document()
import warnings
warnings.filterwarnings("ignore")
import os
# Path of the PDF to convert. Kept as a plain path string: os.open() would
# return an integer descriptor, open the file at import time and require
# write access, although the file is only ever read.
file_name = 'a.pdf'

def main():
    """Convert the PDF referenced by ``file_name`` into ``a.docx``.

    Each layout object that exposes ``get_text`` is added to the module-level
    ``document`` as one bulleted paragraph. The document is written to disk
    once, after every page has been processed.
    """
    # ``with`` guarantees the PDF handle is closed, even if parsing fails.
    with open(file_name, 'rb') as fn:
        parser = PDFParser(fn)
        doc = PDFDocument()
        # Parser and document are linked in both directions before reading.
        parser.set_document(doc)
        doc.set_parser(parser)
        # NOTE(review): pdfminer usage examples also call doc.initialize()
        # here (needed for password-protected files) — confirm if required.
        resource = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(resource, laparams=laparams)
        interpreter = PDFPageInterpreter(resource, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for out in layout:
                # Only text-bearing layout objects provide get_text().
                if hasattr(out, "get_text"):
                    # Replace non-breaking spaces with ordinary spaces.
                    content = out.get_text().replace(u'\xa0', u' ')
                    # Use the style *name*; looking a style up by its id
                    # ('ListBullet') is deprecated in python-docx 0.8+.
                    document.add_paragraph(content, style='List Bullet')
    # Save once at the end instead of rewriting the file after every page.
    document.save('a.docx')
    print('处理完成')


if __name__ == '__main__':
    main()