Using Python to split a PDF and insert the pages into a Word document

Technology used

  1. Python programming fundamentals

  2. Using pyPdf

  3. Operating Word from Python

  4. Using regular expressions

  5. Windows batch (.bat) scripting

 

The following is an example of using the pyPdf library:

    from pyPdf import PdfFileWriter, PdfFileReader

    output = PdfFileWriter()
    input1 = PdfFileReader(file("document1.pdf", "rb"))

    # add page 1 from input1 to output document, unchanged
    output.addPage(input1.getPage(0))

    # add page 2 from input1, but rotated clockwise 90 degrees
    output.addPage(input1.getPage(1).rotateClockwise(90))

    # add page 3 from input1, rotated the other way:
    output.addPage(input1.getPage(2).rotateCounterClockwise(90))
    # alt: output.addPage(input1.getPage(2).rotateClockwise(270))

    # add page 4 from input1, but first add a watermark from another pdf:
    page4 = input1.getPage(3)
    watermark = PdfFileReader(file("watermark.pdf", "rb"))
    page4.mergePage(watermark.getPage(0))

    # add page 5 from input1, but crop it to half size:
    page5 = input1.getPage(4)
    page5.mediaBox.upperRight = (
        page5.mediaBox.getUpperRight_x () / 2 ,
        page5.mediaBox.getUpperRight_y () / 2
    )
    output.addPage(page5)

    # print how many pages input1 has:
    print "document1.pdf has %s pages." % input1.getNumPages())

    # finally, write "output" to document-output.pdf
    outputStream = file("document-output.pdf", "wb")
    output.write(outputStream)

With this library, you can easily split an existing PDF.

Because I need to extract a keyword from each PDF page and use it as the file name, I use pyPdf's ability to extract all the text on a page:

inputfile.getPage(0).extractText()

This returns a unicode object, which needs to be converted to str:

inputfile.getPage(0).extractText().encode("utf-8")

Then extract the keyword from each page by adding the following function:

# Captures, non-greedily, the text between the "Blattname: " label and the
# next occurrence of "project" in a page's extracted text.
p_sheetName = re.compile('Blattname: (.+?)project')


def getSheetName(str):
    """Return the sheet name found in the page text, or None if absent.

    str -- the text extracted from one PDF page.
    """
    found = p_sheetName.search(str)
    if found is None:
        return None
    return found.group(1)

The final code is as follows:

from pyPdf import PdfFileWriter, PdfFileReader
import re,os

# Pattern for the sheet name: whatever sits between "Blattname: " and the
# first following "project".
p_sheetName = re.compile('Blattname: (.+?)project')


def getSheetName(str):
    """Extract the sheet name from one page's text.

    Returns the captured name, or None when the page text does not contain
    the "Blattname: ...project" marker.
    """
    match = p_sheetName.search(str)
    return None if match is None else match.group(1)

def splitpdf(srcFile):
        input1 = file(srcFile,"rb")
        inputfile = PdfFileReader(input1)
        numofpages = inputfile.getNumPages()
        print "pages: %d" % numofpages
        #new directory
        folderName,ext_ = os.path.splitext(srcFile)
        if not os.path.isdir(folderName):
            os.makedirs(folderName)
        for page_index in range(1,numofpages+1):
            output = PdfFileWriter()
            output.addPage(inputfile.getPage(page_index-1))
            
            sheetName = getSheetName(inputfile.getPage(page_index-1).extractText().encode("utf-8"))
            #save file
            saveFileName = os.path.join(folderName,"%d %s.pdf" % (page_index,sheetName))
            print saveFileName
            outputFile = file(saveFileName,"wb")
            output.write(outputFile)
            outputFile.close()
        input1.close()


# Example run: split the PDF at this hard-coded path into one file per page.
splitpdf("E:\\test.pdf")

Next, the PDF file name is passed in as a command-line parameter:

from pyPdf import PdfFileWriter, PdfFileReader
import re,sys,os,string

def translator(frm='', to='', delete='', keep=None):
    if len(to) == 1 :
        to = to * len(frm)
    trans = string.maketrans(frm,to)
    if keep is not None:
        allchars = string.maketrans('','')
        delete = allchars.translate(allchars,keep.translate(allchars,delete))
    def translate(s):
        return s.translate(trans,delete)
    return translate

# Filter built with translator(): removes the characters / : \ ? * > < |
# from a string (used on sheet names before they become part of a file name).
delete_some_speicl = translator(delete="/:\\?*><|")

# Captures, non-greedily, the text between the "Blattname: " label and the
# next occurrence of "project" in a page's extracted text.
p_sheetName = re.compile('Blattname: (.+?)project')


def getSheetName(str):
    """Return the sheet name of a page with file-name-unsafe characters removed.

    str -- the text extracted from one PDF page.

    Returns None when the text does not contain the "Blattname: ...project"
    marker, instead of failing with an AttributeError on the missing match.
    """
    m = p_sheetName.search(str)
    if m is None:
        # No label on this page: report "no name" rather than crash.
        return None
    return delete_some_speicl(m.group(1))

def splitpdf(srcFile):
    try:
        folderName,ext_ = os.path.splitext(srcFile)
        if ext_ != '.pdf':
            raise Exception(os.path.basename(srcFile) + " is not pdf!")
        input1 = file(srcFile,"rb")
        inputfile = PdfFileReader(input1)
        numofpages = inputfile.getNumPages()
        print "pages: %d" % numofpages
        #new directory
        if not os.path.isdir(folderName):
            os.makedirs(folderName)
        for page_index in range(1,numofpages+1):
            output = PdfFileWriter()
            output.addPage(inputfile.getPage(page_index-1))
            
            sheetName = getSheetName(inputfile.getPage(page_index-1).extractText().encode("utf-8"))
            #save file
            saveFileName = os.path.join(folderName,"%d %s.pdf" % (page_index,sheetName))
            print saveFileName
            outputFile = file(saveFileName,"wb")
            output.write(outputFile)
            outputFile.close()
        input1.close()
        print "Split success!"
        print "please find them at " + folderName
    except Exception,e:
        print e

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print 'usage: %s filename' % os.path.basename(sys.argv[0])
        exit(0)
    #print sys.argv[1]
    splitpdf(sys.argv[1])   

Here the translator function is used to filter special characters out of the keyword, because they may cause errors when the new file is created.

 

In fact, after splitting the PDF some manual work is still required, or the pages have to be imported into Word with VBA. I would like to do all of this directly in Python; operating Word from Python requires win32com.

The following is an example of operating Word with win32com:

import win32com
from win32com.client import Dispatch, constants

# NOTE: filenamein, filenameout, OldStr and NewStr are placeholders that the
# reader has to define before running this snippet.

w = win32com.client.Dispatch('Word.Application')
# or use the following to start Word in a separate process:
# w = win32com.client.DispatchEx('Word.Application')

# Run in the background: no window, no alerts
w.Visible = 0
w.DisplayAlerts = 0

# Open an existing file
doc = w.Documents.Open(FileName=filenamein)
# doc = w.Documents.Add()  # create a new document instead

# Insert text
myRange = doc.Range(0, 0)
myRange.InsertBefore('Hello from Python!')

# Apply a style: Select() returns nothing, so style the application's
# current selection after selecting the range
myRange.Select()
w.Selection.Style = constants.wdStyleHeading1

# Replace text in the body
w.Selection.Find.ClearFormatting()
w.Selection.Find.Replacement.ClearFormatting()
w.Selection.Find.Execute(OldStr, False, False, False, False, False, True, 1, True, NewStr, 2)

# Replace text in the header
w.ActiveDocument.Sections[0].Headers[0].Range.Find.ClearFormatting()
w.ActiveDocument.Sections[0].Headers[0].Range.Find.Replacement.ClearFormatting()
w.ActiveDocument.Sections[0].Headers[0].Range.Find.Execute(OldStr, False, False, False, False, False, True, 1, False, NewStr, 2)

# Table operations
doc.Tables[0].Rows[0].Cells[0].Range.Text = ' 123123 '
doc.Tables[0].Rows.Add()  # add a row

# Convert to HTML
wc = win32com.client.constants
w.ActiveDocument.WebOptions.RelyOnCSS = 1
w.ActiveDocument.WebOptions.OptimizeForBrowser = 1
w.ActiveDocument.WebOptions.BrowserLevel = 0  # constants.wdBrowserLevelV4
w.ActiveDocument.WebOptions.OrganizeInFolder = 0
w.ActiveDocument.WebOptions.UseLongFileNames = 1
w.ActiveDocument.WebOptions.RelyOnVML = 0
w.ActiveDocument.WebOptions.AllowPNG = 1
w.ActiveDocument.SaveAs(FileName=filenameout, FileFormat=wc.wdFormatHTML)

# Print
doc.PrintOut()

# Close
# doc.Close()
w.Documents.Close(wc.wdDoNotSaveChanges)
w.Quit()

Following this example, the preceding code is modified as follows:

from pyPdf import PdfFileWriter, PdfFileReader
import re,sys,os,string,win32com
from win32com.client import Dispatch, constants
# NOTE(review): presumably called so that the generated wrappers for Word are
# available and `constants` (wdStyleHeading1, wdPageBreak, ...) resolves
# below; confirm against the pywin32 documentation.
win32com.client.gencache.EnsureDispatch('Word.Application')


def translator(frm='', to='', delete='', keep=None):
    if len(to) == 1 :
        to = to * len(frm)
    trans = string.maketrans(frm,to)
    if keep is not None:
        allchars = string.maketrans('','')
        delete = allchars.translate(allchars,keep.translate(allchars,delete))
    def translate(s):
        return s.translate(trans,delete)
    return translate

# Filter built with translator(): removes the characters / : \ ? * > < |
# from a string; applied to the sheet name before it is used in a file name.
delete_some_speicl = translator(delete="/:\\?*><|")

# Pattern for the sheet name: the text between "Blattname: " and the first
# following "project" (non-greedy).
p_sheetName = re.compile('Blattname: (.+?)project')


def getSheetName(str):
    """Return the raw sheet name captured from one page's text.

    str -- the text extracted from one PDF page.

    The text must contain the "Blattname: ...project" marker; without a
    match the attribute access on the missing match object raises
    AttributeError.
    """
    return p_sheetName.search(str).group(1)

def splitPdfToWord(srcFile):
    try:
        folderName,ext_ = os.path.splitext(srcFile)
        if ext_ != '.pdf':
            raise Exception(os.path.basename(srcFile) + " is not pdf!")
        input1 = file(srcFile,"rb")
        inputfile = PdfFileReader(input1)
        numofpages = inputfile.getNumPages()
        print "Total Pages: %d" % numofpages
        wordApp = win32com.client.Dispatch('Word.Application')
        wordApp.Visible = False
        wordApp.DisplayAlerts = 0
        doc = wordApp.Documents.Add()
        sel = wordApp.Selection
        #new directory
        if not os.path.isdir(folderName):
            os.makedirs(folderName)
        for page_index in range(1,numofpages+1):
            output = PdfFileWriter()
            output.addPage(inputfile.getPage(page_index-1))
            
            sheetName = getSheetName(inputfile.getPage(page_index-1).extractText().encode("utf-8"))
            sel.Style = constants.wdStyleHeading1
            sel.TypeText("Page%d %s" % (page_index,sheetName))
            sheetName = delete_some_speicl(sheetName)
            #save file
            saveFileName = os.path.join(folderName,"%d %s.pdf" % (page_index,sheetName))
            print "Add Page %d" % page_index
            #print saveFileName
            outputFile = file(saveFileName,"wb")
            output.write(outputFile)
            outputFile.close()
            sel.TypeParagraph()
            sel.Style = constants.wdStyleBodyText
            sel.InlineShapes.AddOLEObject(ClassType="AcroExch.Document.11",FileName=saveFileName)
            sel.InsertBreak (Type = constants.wdPageBreak)
        input1.close()
        doc.SaveAs(folderName+".doc")
        print "Split success!"
        print "please find them at " + folderName
        print "create word document success!"
        print "Location:" + folderName + ".doc"
    except Exception,e:
        print e
    finally:
        wordApp.Quit()

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print 'usage: %s filename' % os.path.basename(sys.argv[0])
        sys.exit(1)
    splitPdfToWord(sys.argv[1])   

 

Reproduced from: https://www.cnblogs.com/zhangyonghugo/p/3501065.html

Guess you like

Origin blog.csdn.net/weixin_34258078/article/details/93946736
Recommended