Reading paragraphs, tables, and pictures from a Word document with Python

How to read the text, pictures, and tables of a Word (.docx) document and extract information from them.

import os
import shutil
import zipfile


def gettxt():
    """Print the paragraph count and the longer paragraphs of ``gao.docx``.

    A paragraph is printed, prefixed with its index, only when it holds more
    than four characters once spaces are removed.
    """
    # python-docx is imported lazily so the zip-based helpers below stay
    # usable on machines where the package is not installed.
    import docx

    document = docx.Document("gao.docx")
    paragraphs = document.paragraphs
    # The article's sample file reports 13 paragraphs; every line break in
    # the document starts a new paragraph.
    print("the number of paragraphs: " + str(len(paragraphs)))

    for index, paragraph in enumerate(paragraphs):
        text = paragraph.text
        if len(text.replace(" ", "")) > 4:
            print("first " + str(index) + " of content is: " + text)


def gettable():
    """Print every cell of every table found in ``word.docx``."""
    import docx  # lazy import, see gettxt()

    document = docx.Document("word.docx")
    for table in document.tables:
        print("---- the table- -----")
        for row in table.rows:
            # To print one row per line instead, use:
            #     print("\t".join(cell.text for cell in row.cells))
            for cell in row.cells:
                print(cell.text, "\t")


# Capitalised spelling kept as an alias so either name can be called.
GetTable = gettable


def getinfo(wordfile):
    """Unpack a .docx file into the current directory and print its members.

    A .docx file is an ordinary zip archive, so ``zipfile`` can open it.

    :param wordfile: path of the .docx file to unpack
    """
    with zipfile.ZipFile(wordfile, "r") as archive:
        for member in archive.namelist():
            archive.extract(member)
            print(member)


# Members printed by getinfo() for the article's sample document:
#
#   _rels/
#   _rels/.rels
#   customXml/
#   customXml/_rels/
#   customXml/_rels/item1.xml.rels
#   customXml/_rels/item2.xml.rels
#   customXml/item1.xml
#   customXml/item2.xml
#   customXml/itemProps1.xml
#   customXml/itemProps2.xml
#   docProps/
#   docProps/app.xml
#   docProps/core.xml
#   docProps/custom.xml
#   docProps/thumbnail.wmf
#   word/
#   word/_rels/
#   word/_rels/document.xml.rels
#   word/document.xml
#   word/fontTable.xml
#   word/media/
#   word/media/image1.jpeg
#   word/numbering.xml
#   word/settings.xml
#   word/styles.xml
#   word/theme/
#   word/theme/theme1.xml


def getpic(path, zip_path, tmp_path, store_path):
    """Copy the pictures embedded in a .docx file into ``store_path``.

    The document is temporarily renamed to ``zip_path``, unpacked into
    ``tmp_path``, and renamed back. Everything under ``word/media`` is then
    copied to ``store_path`` under a name derived from the document path,
    and the unpacked sub-folders of ``tmp_path`` are removed again.

    :param path: source .docx file
    :param zip_path: temporary name the .docx carries while it is unpacked
    :param tmp_path: scratch folder the archive is unpacked into
    :param store_path: folder that receives the pictures (created if missing)
    :return: list with the paths of the copied pictures (empty when the
        document contains none)
    """
    # Rename docx -> zip, unpack, and always restore the original name,
    # even when the file turns out not to be a valid archive.
    os.rename(path, zip_path)
    try:
        with zipfile.ZipFile(zip_path, "r") as archive:
            archive.extractall(tmp_path)
    finally:
        os.rename(zip_path, path)

    media_dir = os.path.join(tmp_path, "word", "media")
    saved = []
    try:
        # A document without pictures has no word/media folder at all.
        if os.path.isdir(media_dir):
            os.makedirs(store_path, exist_ok=True)
            # Flatten the document path into a file-name prefix. Both kinds
            # of separator are replaced: a leftover "/" would make
            # os.path.join() below discard store_path.
            prefix = path.replace("\\", "_").replace("/", "_").replace(":", "")
            for picture in sorted(os.listdir(media_dir)):
                target = os.path.join(store_path, prefix + "_" + picture)
                shutil.copy(os.path.join(media_dir, picture), target)
                saved.append(target)
    finally:
        # Remove the unpacked folders; plain files in tmp_path are left alone.
        if os.path.isdir(tmp_path):
            for entry in os.listdir(tmp_path):
                entry_path = os.path.join(tmp_path, entry)
                if os.path.isdir(entry_path):
                    shutil.rmtree(entry_path)
    return saved


if __name__ == "__main__":
    # source file
    path = r"E:\dogcat\extract pictures\log.docx"
    # name the docx is renamed to while it is unpacked
    zip_path = r"E:\dogcat\extract pictures\log.zip"
    # scratch folder for the unpacked archive
    tmp_path = r"E:\dogcat\extract pictures\tmp"
    # folder that receives the extracted pictures
    store_path = r"E:\dogcat\extract pictures\test"
    m = getpic(path, zip_path, tmp_path, store_path)

 

 

 

As for legacy .doc files, simply convert them to .docx first and then process them as shown above:

def docTTTTTdocx(doc_name, docx_name):
    """Convert a legacy ``.doc`` file to ``.docx`` by automating Microsoft Word.

    The conversion is best-effort: a failure is reported on standard output
    and signalled through the return value instead of raising.

    NOTE(review): needs Windows, an installed Microsoft Word and the pywin32
    package (``win32com``) — confirm on the target machine.

    :param doc_name: path of the existing .doc file
    :param docx_name: path the converted .docx file is written to
    :return: True when the document was saved, False otherwise
    """
    import os

    word = None
    try:
        from win32com import client

        word = client.Dispatch("Word.Application")
        # Absolute paths are passed because Word presumably resolves relative
        # ones against its own working directory, not the script's.
        doc = word.Documents.Open(os.path.abspath(doc_name))
        try:
            # File format 16 tells Word to save the document as .docx.
            doc.SaveAs(os.path.abspath(docx_name), 16)
        finally:
            doc.Close()
    except Exception as exc:
        print("failed to convert " + repr(doc_name) + ": " + str(exc))
        return False
    finally:
        # Always shut the Word process down, even after a failure.
        if word is not None:
            word.Quit()
    return True

 

 

References:

https://blog.csdn.net/qq_40925239/article/details/83279957

https://blog.csdn.net/qq_15969343/article/details/81673970

Guess you like

Origin www.cnblogs.com/51python/p/11033002.html
Recommended