Convert PDF files to Word and CSV using Python

Step 1: Install the required libraries

1. pdfminer — install it with the command: pip install pdfminer3k

pdfminer3k is a Python 3 port of pdfminer. PDFMiner is a tool for extracting information from PDF documents. Unlike other PDF-related tools, it focuses entirely on obtaining and analyzing text data. PDFMiner lets you obtain the exact position of text on a page, as well as other information such as fonts or lines. It includes a PDF converter that can transform PDF files into other text formats such as HTML, and it has an extensible PDF parser that can be used for purposes other than text analysis.

2. python-docx — install it with the command: pip install python-docx

Python DocX (python-docx) is currently part of the Python OpenXML project. You can use it to open Word 2007 and later documents, and documents saved with it can be opened in Microsoft Office 2007/2010, Microsoft Office 2008 for Mac, Google Docs, OpenOffice.org 3, and Apple iWork '08.

from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter,process_pdf
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
from docx import Document
document = Document()
import warnings
warnings.filterwarnings("ignore")
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from urllib.request import urlopen
import pandas as pd

def readPDF(pdfFile):
    """Extract the text of a PDF and return it as one string.

    Args:
        pdfFile: File-like object holding the PDF data. It is handed
            unchanged to ``process_pdf`` and is not closed here.

    Returns:
        The text that ``TextConverter`` wrote into the in-memory buffer.
    """
    rsrcmgr = PDFResourceManager()
    # The buffer is released by the ``with`` block even if parsing fails.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
        try:
            process_pdf(rsrcmgr, device, pdfFile)
        finally:
            # Close the converter on both the success and the error path.
            device.close()
        # Read the buffer before leaving the ``with`` block closes it.
        return retstr.getvalue()
def save_to_file(file_name, contents, encoding=None):
    """Write ``contents`` to ``file_name``, replacing any existing file.

    Args:
        file_name: Path of the file to create or overwrite.
        contents: Text to write.
        encoding: Optional text encoding passed to ``open``. The default
            ``None`` keeps the platform's default encoding, exactly as
            before this parameter existed.
    """
    # ``with`` guarantees the handle is closed even if the write raises.
    with open(file_name, 'w', encoding=encoding) as fh:
        fh.write(contents)

# Example call executed at module level: writes a placeholder string to
# 'mobiles.txt' in the current working directory.
save_to_file(file_name='mobiles.txt', contents='your contents str')


def main():
    """Download the sample PDF, extract its text and write it to ``c.csv``."""
    # ``with`` closes the HTTP response once the text has been extracted.
    with urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf") as pdfFile:
        outputString = readPDF(pdfFile)
    # The source carried a stray ``#c.word`` marker here -- presumably an
    # alternative output name; TODO confirm.
    # NOTE(review): the text is written verbatim; nothing here arranges it
    # into CSV rows or columns despite the .csv extension.
    save_to_file('c.csv', outputString)


if __name__ == '__main__':
    main()

Saving as a Word document using python-docx

from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
from docx import Document
document = Document()
import warnings
warnings.filterwarnings("ignore")
import os
# Path of the PDF to convert. Stored as a plain path string rather than an
# os.open() descriptor, so nothing is opened at import time and no write
# permission is required; the built-in open(file_name, 'rb') accepts it.
file_name = '/Users/dudu/Desktop/test1/a.pdf'

def main():
    """Convert the PDF referenced by ``file_name`` into the Word file ``a.docx``.

    Each layout object that exposes ``get_text`` becomes one bulleted
    paragraph in the module-level ``document``. The document is written to
    disk once, after all pages have been processed.
    """
    # ``with`` closes the PDF handle even if parsing fails part-way through.
    with open(file_name, 'rb') as fn:
        parser = PDFParser(fn)
        doc = PDFDocument()
        # Link parser and document to each other.
        parser.set_document(doc)
        doc.set_parser(parser)
        # NOTE(review): pdfminer examples usually call doc.initialize() at
        # this point (needed for encrypted files) -- confirm whether it
        # should be added.
        resource = PDFResourceManager()
        device = PDFPageAggregator(resource, laparams=LAParams())
        interpreter = PDFPageInterpreter(resource, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            for out in device.get_result():
                if hasattr(out, "get_text"):
                    # Replace non-breaking spaces with ordinary spaces.
                    content = out.get_text().replace(u'\xa0', u' ')
                    # NOTE(review): recent python-docx prefers the style
                    # name 'List Bullet'; the id form is kept as written.
                    document.add_paragraph(content, style='ListBullet')
    # Save once at the end instead of rewriting the file for every layout
    # object. The name no longer carries the padding spaces that left the
    # file without a usable .docx extension.
    document.save('a.docx')
    print(' processing completed ')


if __name__ == '__main__':
    main()

Follow the public account below; I regularly post related material there.

You may also like

Origin http://43.154.161.224:23101/article/api/json?id=325081995&siteId=291194637