Parsing .docx documents in Java to extract text and pictures

<dependency>
      <groupId>fr.opensagres.xdocreport</groupId>
      <artifactId>org.apache.poi.xwpf.converter.pdf</artifactId>
      <version>1.0.6</version>
  </dependency>
package docxAnalyze;

import java.io.*;
import java.util.List;

import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFPictureData;

/**
 * Extracts the plain text and the embedded pictures of a .docx document using Apache POI.
 *
 * <p>The text is printed to standard output and written to {@link #textPath}; every embedded
 * picture larger than {@code MIN_PICTURE_BYTES} bytes is written to a file whose path is
 * {@link #imagePath} concatenated with the picture's own file name.
 */
public class Word {

    /** Path of the text file that receives the extracted document text. */
    public static String textPath = "src/main/resources/test.txt";

    /** Path of the .docx document to parse. */
    public static String docPath = "src/main/resources/test.docx";

    /**
     * Prefix that is concatenated with each picture's file name to form the output path.
     *
     * <p>NOTE(review): this value is identical to {@link #docPath}, so pictures are saved as
     * "test.docx" followed directly by the picture name. Presumably a directory such as
     * "src/main/resources/" was intended - confirm before changing.
     */
    public static String imagePath = "src/main/resources/test.docx";

    /**
     * Size threshold in bytes; only pictures strictly larger than this are saved, which filters
     * out the tiny stray images that Word documents sometimes contain.
     */
    private static final int MIN_PICTURE_BYTES = 300;

    /**
     * Runs the extraction on the configured {@link #docPath}.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        readDocxTextAndImage();
    }

    /**
     * Reads the document at {@link #docPath}, prints its text, stores the text in
     * {@link #textPath}, and saves its larger embedded pictures.
     *
     * @return the extracted text, or {@code null} if an {@link IOException} occurred (the stack
     *     trace is printed in that case)
     */
    public static String readDocxTextAndImage() {
        File file = new File(docPath);
        // try-with-resources guarantees the input stream is released on every exit path.
        try (FileInputStream fis = new FileInputStream(file)) {
            // Use XWPFWordExtractor to obtain the text.
            // NOTE(review): document and extractor are not closed here; whether they are
            // Closeable depends on the POI version on the classpath - confirm.
            XWPFDocument document = new XWPFDocument(fis);
            XWPFWordExtractor extractor = new XWPFWordExtractor(document);
            String text = extractor.getText();
            System.out.println(text);

            // Store the extracted text in the configured txt file.
            try (PrintStream ps = new PrintStream(textPath)) {
                ps.println(text);
            }

            // Use XWPFDocument.getAllPictures() to obtain every embedded picture.
            List<XWPFPictureData> picList = document.getAllPictures();
            for (XWPFPictureData pic : picList) {
                byte[] data = pic.getData();
                // Skip very small pictures (see MIN_PICTURE_BYTES).
                if (data.length > MIN_PICTURE_BYTES) {
                    try (FileOutputStream fos =
                            new FileOutputStream(imagePath + pic.getFileName())) {
                        fos.write(data);
                    }
                }
            }
            return text;
        } catch (IOException e) {
            // Best-effort tool: report the failure and signal it to the caller with null.
            e.printStackTrace();
        }
        return null;
    }
}

insert image description here
insert image description here
insert image description here
insert image description here

You can see that the `bodyElements` field of the document object stores the content of the entire .docx document. The other fields are per-type collections of the individual objects in the document.

Word document content
insert image description here

Parsing result
insert image description here

Guess you like

Origin blog.csdn.net/qq_43961619/article/details/109697486