Technologies used
1. Python programming fundamentals
2. The pyPdf library
3. Automating Word from Python
4. Regular expressions
5. Windows batch (.bat) scripting
The following is an example of using the pyPdf library:
# Example: build document-output.pdf from selected, transformed pages of
# document1.pdf (which must have at least five pages).
from pyPdf import PdfFileWriter, PdfFileReader

output = PdfFileWriter()

# The readers keep using their streams until output.write() has run, so the
# input files are only closed after the output document has been written.
with open("document1.pdf", "rb") as input_stream:
    with open("watermark.pdf", "rb") as watermark_stream:
        input1 = PdfFileReader(input_stream)

        # add page 1 from input1 to output document, unchanged
        output.addPage(input1.getPage(0))

        # add page 2 from input1, but rotated clockwise 90 degrees
        output.addPage(input1.getPage(1).rotateClockwise(90))

        # add page 3 from input1, rotated the other way:
        output.addPage(input1.getPage(2).rotateCounterClockwise(90))
        # alt: output.addPage(input1.getPage(2).rotateClockwise(270))

        # add page 4 from input1, but first add a watermark from another pdf:
        page4 = input1.getPage(3)
        watermark = PdfFileReader(watermark_stream)
        page4.mergePage(watermark.getPage(0))
        output.addPage(page4)

        # add page 5 from input1, but crop it to half size:
        page5 = input1.getPage(4)
        page5.mediaBox.upperRight = (
            page5.mediaBox.getUpperRight_x() / 2,
            page5.mediaBox.getUpperRight_y() / 2,
        )
        output.addPage(page5)

        # print how many pages input1 has:
        print("document1.pdf has %s pages." % input1.getNumPages())

        # finally, write "output" to document-output.pdf
        with open("document-output.pdf", "wb") as outputStream:
            output.write(outputStream)
With this library, you can easily split an existing PDF.
Because I want to use a keyword extracted from each PDF page as the output file name, I first need the text of the page. pyPdf can extract all the text of a page:
inputfile.getPage(0).extractText()
This returns a unicode object, which needs to be converted to str:
inputfile.getPage(0).extractText().encode("utf-8")
Then extract the keyword from each page by adding the following function:
# Captures the text between "Blattname: " and the next "project"
# (non-greedy, so the first following "project" ends the keyword).
p_sheetName = re.compile(r'Blattname: (.+?)project')


def getSheetName(str):
    """Return the sheet-name keyword found in the page text, or None.

    ``str`` is the text extracted from one PDF page. The parameter name
    shadows the builtin; it is kept for compatibility with existing callers.
    The captured text is returned unmodified (no whitespace stripping).
    """
    m = p_sheetName.search(str)
    if m is None:
        return None
    return m.group(1)
The final code is as follows:
from pyPdf import PdfFileWriter, PdfFileReader
import os
import re

# Captures the text between "Blattname: " and the next "project".
p_sheetName = re.compile(r'Blattname: (.+?)project')


def getSheetName(str):
    """Return the sheet-name keyword found in the page text, or None."""
    m = p_sheetName.search(str)
    if m is None:
        return None
    return m.group(1)


def splitpdf(srcFile):
    """Split ``srcFile`` into one single-page PDF per page.

    The pages are written to a directory named like ``srcFile`` without its
    extension (created if missing). Each file is called
    "<page number> <sheet name>.pdf"; when no sheet name is found on a page
    the file is called "<page number>.pdf".
    """
    folderName = os.path.splitext(srcFile)[0]

    # The context manager closes the source file even if a page fails.
    with open(srcFile, "rb") as input1:
        inputfile = PdfFileReader(input1)
        numofpages = inputfile.getNumPages()
        print("pages: %d" % numofpages)

        # new directory
        if not os.path.isdir(folderName):
            os.makedirs(folderName)

        for page_index in range(1, numofpages + 1):
            page = inputfile.getPage(page_index - 1)
            output = PdfFileWriter()
            output.addPage(page)
            sheetName = getSheetName(page.extractText().encode("utf-8"))

            # save file; avoid a literal "None" in the name when the
            # keyword is missing from the page
            if sheetName is None:
                pageFileName = "%d.pdf" % page_index
            else:
                pageFileName = "%d %s.pdf" % (page_index, sheetName)
            saveFileName = os.path.join(folderName, pageFileName)
            print(saveFileName)
            with open(saveFileName, "wb") as outputFile:
                output.write(outputFile)


splitpdf("E:\\test.pdf")
Next, parameterize the script so that the PDF file is passed on the command line:
from pyPdf import PdfFileWriter, PdfFileReader
import os
import re
import string
import sys


def translator(frm='', to='', delete='', keep=None):
    """Build and return a function that translates a byte string.

    Characters of ``frm`` are mapped to the characters of ``to`` (a
    one-character ``to`` is repeated to the length of ``frm``) and the
    characters in ``delete`` are removed. When ``keep`` is given, every
    character that is not in ``keep`` is removed as well.

    Python 2 only: relies on ``string.maketrans`` and the two-argument form
    of ``str.translate``.
    """
    if len(to) == 1:
        to = to * len(frm)
    trans = string.maketrans(frm, to)
    if keep is not None:
        allchars = string.maketrans('', '')
        # delete everything except the characters of keep that are not
        # themselves listed in delete
        delete = allchars.translate(allchars, keep.translate(allchars, delete))

    def translate(s):
        return s.translate(trans, delete)
    return translate


# Removes the characters that are not allowed in Windows file names.
delete_some_speicl = translator(delete="/:\\?*><|")

# Captures the text between "Blattname: " and the next "project".
p_sheetName = re.compile(r'Blattname: (.+?)project')


def getSheetName(str):
    """Return the file-name-safe sheet name found in the page text, or None.

    Returns None when the page does not contain the keyword, instead of
    failing on the missing match.
    """
    m = p_sheetName.search(str)
    if m is None:
        return None
    return delete_some_speicl(m.group(1))


def _page_file_name(page_index, sheetName):
    """Return the output file name for one page; the sheet name is optional."""
    if not sheetName:
        return "%d.pdf" % page_index
    return "%d %s.pdf" % (page_index, sheetName)


def splitpdf(srcFile):
    """Split ``srcFile`` into one single-page PDF per page.

    The pages are written to a directory named like ``srcFile`` without its
    extension (created if missing). Any error is reported on stdout rather
    than raised, because this is the top level of a command-line tool.
    """
    try:
        folderName, ext_ = os.path.splitext(srcFile)
        # compare case-insensitively so that "FILE.PDF" is accepted too
        if ext_.lower() != '.pdf':
            raise Exception(os.path.basename(srcFile) + " is not pdf!")

        # The context manager closes the source file even if a page fails.
        with open(srcFile, "rb") as input1:
            inputfile = PdfFileReader(input1)
            numofpages = inputfile.getNumPages()
            print("pages: %d" % numofpages)

            # new directory
            if not os.path.isdir(folderName):
                os.makedirs(folderName)

            for page_index in range(1, numofpages + 1):
                page = inputfile.getPage(page_index - 1)
                output = PdfFileWriter()
                output.addPage(page)
                sheetName = getSheetName(page.extractText().encode("utf-8"))

                # save file
                saveFileName = os.path.join(
                    folderName, _page_file_name(page_index, sheetName))
                print(saveFileName)
                with open(saveFileName, "wb") as outputFile:
                    output.write(outputFile)

        print("Split success!")
        print("please find them at " + folderName)
    except Exception as e:
        print(e)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('usage: %s filename' % os.path.basename(sys.argv[0]))
        # a missing argument is a usage error, so exit with a non-zero status
        sys.exit(1)
    splitpdf(sys.argv[1])
Here the translator function is used to filter special characters out of the keyword, because they may cause errors when the new file is created.
In fact, after splitting the PDF some manual work is still needed, or the pages have to be imported into Word using VBA. I would like to do all of this directly from Python; to operate Word from Python, win32com is used.
The following is an example of operating Word:
# Example: driving Microsoft Word through COM with pywin32.
# filenamein, filenameout, OldStr and NewStr are not defined in this snippet;
# they have to be supplied by the surrounding code.
import win32com
from win32com.client import Dispatch, constants

# NOTE(review): win32com.client.constants is only populated when makepy
# support for Word has been generated (e.g. through
# win32com.client.gencache.EnsureDispatch) - confirm for your setup.
w = win32com.client.Dispatch('Word.Application')
# or start a separate process instead:
# w = win32com.client.DispatchEx('Word.Application')

# Run in the background: no window, no alerts
w.Visible = 0
w.DisplayAlerts = 0

# Open the document
doc = w.Documents.Open(FileName=filenamein)
# doc = w.Documents.Add()  # Create a new document

# Insert text
myRange = doc.Range(0, 0)
myRange.InsertBefore('Hello from Python!')

# Use a style: select the range, then style the application's selection
myRange.Select()
w.Selection.Style = constants.wdStyleHeading1

# Body text replacement
# Find.Execute arguments: FindText, MatchCase, MatchWholeWord, MatchWildcards,
# MatchSoundsLike, MatchAllWordForms, Forward, Wrap, Format, ReplaceWith,
# Replace
w.Selection.Find.ClearFormatting()
w.Selection.Find.Replacement.ClearFormatting()
w.Selection.Find.Execute(OldStr, False, False, False, False, False, True, 1, True, NewStr, 2)

# Header text replacement
w.ActiveDocument.Sections[0].Headers[0].Range.Find.ClearFormatting()
w.ActiveDocument.Sections[0].Headers[0].Range.Find.Replacement.ClearFormatting()
w.ActiveDocument.Sections[0].Headers[0].Range.Find.Execute(OldStr, False, False, False, False, False, True, 1, False, NewStr, 2)

# Table operations
doc.Tables[0].Rows[0].Cells[0].Range.Text = ' 123123 '
doc.Tables[0].Rows.Add()  # add a row

# Convert to HTML
wc = win32com.client.constants
w.ActiveDocument.WebOptions.RelyOnCSS = 1
w.ActiveDocument.WebOptions.OptimizeForBrowser = 1
w.ActiveDocument.WebOptions.BrowserLevel = 0  # constants.wdBrowserLevelV4
w.ActiveDocument.WebOptions.OrganizeInFolder = 0
w.ActiveDocument.WebOptions.UseLongFileNames = 1
w.ActiveDocument.WebOptions.RelyOnVML = 0
w.ActiveDocument.WebOptions.AllowPNG = 1
w.ActiveDocument.SaveAs(FileName=filenameout, FileFormat=wc.wdFormatHTML)

# Print
doc.PrintOut()

# Close
# doc.Close()
w.Documents.Close(wc.wdDoNotSaveChanges)
w.Quit()
Modeled on this example, the preceding code is modified as follows:
from pyPdf import PdfFileWriter, PdfFileReader
import os
import re
import string
import sys
import win32com
from win32com.client import Dispatch, constants

# Make sure the generated (early-bound) Word wrappers exist so that the
# wdXxx values on win32com.client.constants are available.
win32com.client.gencache.EnsureDispatch('Word.Application')


def translator(frm='', to='', delete='', keep=None):
    """Build and return a function that translates a byte string.

    Characters of ``frm`` are mapped to the characters of ``to`` (a
    one-character ``to`` is repeated to the length of ``frm``) and the
    characters in ``delete`` are removed. When ``keep`` is given, every
    character that is not in ``keep`` is removed as well.

    Python 2 only: relies on ``string.maketrans`` and the two-argument form
    of ``str.translate``.
    """
    if len(to) == 1:
        to = to * len(frm)
    trans = string.maketrans(frm, to)
    if keep is not None:
        allchars = string.maketrans('', '')
        # delete everything except the characters of keep that are not
        # themselves listed in delete
        delete = allchars.translate(allchars, keep.translate(allchars, delete))

    def translate(s):
        return s.translate(trans, delete)
    return translate


# Removes the characters that are not allowed in Windows file names.
delete_some_speicl = translator(delete="/:\\?*><|")

# Captures the text between "Blattname: " and the next "project".
p_sheetName = re.compile(r'Blattname: (.+?)project')


def getSheetName(str):
    """Return the raw sheet name found in the page text, or None.

    The result is not sanitized; it is used as-is for the Word heading.
    Returns None when the page does not contain the keyword, instead of
    failing on the missing match.
    """
    m = p_sheetName.search(str)
    if m is None:
        return None
    return m.group(1)


def _page_file_name(page_index, sheetName):
    """Return the output file name for one page; the sheet name is optional."""
    if not sheetName:
        return "%d.pdf" % page_index
    return "%d %s.pdf" % (page_index, sheetName)


def splitPdfToWord(srcFile):
    """Split ``srcFile`` into per-page PDFs and collect them in a Word file.

    The pages are written to a directory named like ``srcFile`` without its
    extension. A Word document "<that name>.doc" is created next to it with
    one heading and one embedded PDF object per page. Any error is reported
    on stdout rather than raised, because this is the top level of a
    command-line tool.
    """
    # Bound before the try block so that the finally clause can tell whether
    # Word was actually started.
    wordApp = None
    try:
        # Word runs as a separate process, so relative paths would not be
        # resolved against this script's working directory.
        srcFile = os.path.abspath(srcFile)
        folderName, ext_ = os.path.splitext(srcFile)
        # compare case-insensitively so that "FILE.PDF" is accepted too
        if ext_.lower() != '.pdf':
            raise Exception(os.path.basename(srcFile) + " is not pdf!")

        # The context manager closes the source file even if a page fails.
        with open(srcFile, "rb") as input1:
            inputfile = PdfFileReader(input1)
            numofpages = inputfile.getNumPages()
            print("Total Pages: %d" % numofpages)

            wordApp = win32com.client.Dispatch('Word.Application')
            wordApp.Visible = False
            wordApp.DisplayAlerts = 0
            doc = wordApp.Documents.Add()
            sel = wordApp.Selection

            # new directory
            if not os.path.isdir(folderName):
                os.makedirs(folderName)

            for page_index in range(1, numofpages + 1):
                page = inputfile.getPage(page_index - 1)
                output = PdfFileWriter()
                output.addPage(page)
                sheetName = getSheetName(page.extractText().encode("utf-8"))

                # heading for this page (raw sheet name, if any)
                sel.Style = constants.wdStyleHeading1
                if sheetName:
                    sel.TypeText("Page%d %s" % (page_index, sheetName))
                    sheetName = delete_some_speicl(sheetName)
                else:
                    sel.TypeText("Page%d" % page_index)

                # save file
                saveFileName = os.path.join(
                    folderName, _page_file_name(page_index, sheetName))
                print("Add Page %d" % page_index)
                with open(saveFileName, "wb") as outputFile:
                    output.write(outputFile)

                # embed the single-page PDF below the heading
                sel.TypeParagraph()
                sel.Style = constants.wdStyleBodyText
                # NOTE(review): the ClassType names a specific Acrobat
                # version - presumably it must match the installed one.
                sel.InlineShapes.AddOLEObject(ClassType="AcroExch.Document.11",
                                              FileName=saveFileName)
                sel.InsertBreak(Type=constants.wdPageBreak)

        doc.SaveAs(folderName + ".doc")
        print("Split success!")
        print("please find them at " + folderName)
        print("create word document success!")
        print("Location:" + folderName + ".doc")
    except Exception as e:
        print(e)
    finally:
        # Only quit Word if it was started; otherwise an early failure
        # would be masked by a NameError here.
        if wordApp is not None:
            wordApp.Quit()


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('usage: %s filename' % os.path.basename(sys.argv[0]))
        sys.exit(1)
    splitPdfToWord(sys.argv[1])
Reproduced from: https://www.cnblogs.com/zhangyonghugo/p/3501065.html