Poi读取word(doc)文档的文本或图片

package org.jimmy.studyproject.util;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;

import com.google.common.base.CharMatcher;
import com.google.common.collect.Lists;

/**
 * Helpers that pull text and embedded pictures out of the Word documents found in
 * {@link #DIR_PATH} using Apache POI.
 *
 * <p>Text is returned one character per list element, either unfiltered
 * ({@link #readSourceWordFile()}) or restricted to the whitelist in {@link #unicodeReg}
 * ({@link #readWordFile()}).
 */
@SuppressWarnings("unused")
public class Utils {

    /** Directory scanned for source Word documents. */
    public static final String DIR_PATH = "D:/Resume/TopicSolutions/SourceFile/";

    /** Directory into which extracted pictures are written. */
    public static final String PICTURE_DIR_PATH = "D:/Resume/TopicSolutions/Picture/";

    /**
     * Comma-separated list of regex escapes for the punctuation and symbol characters that
     * are accepted in addition to ASCII letters, digits and CJK ideographs.
     */
    public static final String PUNCTUATION = "\\u3001,\\u03c1,\\u3002,\\uff08,\\u03c8,\\uff09,\\u300a,\\u300b,\\uff0b,\\uff0c,\\uff0d,\\uff0e,\\u33d1,\\u2014,\\u00d7,\\u2019,\\uff1a,\\u005b,\\uff1c,\\u005d,\\uff1d,\\u221e,\\uff1e,\\uff1f,\\u0060,\\u2264,\\u0028,\\u0029,\\u002b,\\u222b,\\u002d,\\u002e,\\u002f,\\u00b1,\\u03b8,\\u007b,\\u043b,\\u007c,\\u003c,\\u003d,\\u007d,\\u003e";

    /**
     * Source text of the single-character whitelist regex. Built once at class load so it no
     * longer grows every time {@link #main(String[])} runs. Left non-final for compatibility
     * with code that assigns it.
     */
    public static String unicodeRegStr = buildUnicodeRegStr();

    /**
     * Compiled form of {@link #unicodeRegStr}; matches exactly one whitelisted character.
     * Initialised at class load so {@link #readWordFile()} works without calling
     * {@link #main(String[])} first. Left non-final for compatibility.
     */
    public static Pattern unicodeReg = Pattern.compile(unicodeRegStr);

    /**
     * Prints the whitelist regex, then every character of every document in
     * {@link #DIR_PATH} without filtering.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            System.out.println(unicodeRegStr);
            // The unfiltered variant is what gets printed; readWordFile() is the
            // whitelist-filtered alternative.
            List<String> contextList = readSourceWordFile();
            writeWordFile(contextList);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints the given strings to standard output back to back, with no separator.
     *
     * @param contextList strings to print; {@code null} prints nothing
     */
    public static void writeWordFile(List<String> contextList) {
        if (contextList == null) {
            return;
        }
        contextList.forEach(System.out::print);
    }

    /**
     * Walks every character of each {@code .doc} file in {@link #DIR_PATH} and saves any
     * picture attached to a character into {@link #PICTURE_DIR_PATH}.
     *
     * @return always the empty string; no text or style information is collected
     * @throws Exception if a document cannot be opened or a picture cannot be written
     */
    public static String getWordAndStyle() throws Exception {
        for (File file : listSourceFiles()) {
            // Only binary .doc files can be opened by HWPFDocument; skip everything else
            // (sub-directories, .docx, unrelated files) instead of failing on them.
            if (!file.isFile() || !file.getAbsolutePath().endsWith(".doc")) {
                continue;
            }
            try (FileInputStream in = new FileInputStream(file);
                    HWPFDocument doc = new HWPFDocument(in)) {
                PicturesTable picturesTable = doc.getPicturesTable();
                int length = doc.characterLength();
                for (int i = 0; i < length; i++) {
                    // One-character range so each character run is inspected on its own.
                    CharacterRun run = new Range(i, i + 1, doc).getCharacterRun(0);
                    if (picturesTable.hasPicture(run)) {
                        readPicture(picturesTable, run);
                    }
                }
            }
        }
        return "";
    }

    /**
     * Extracts the picture attached to a character run and writes its image content to a
     * file in {@link #PICTURE_DIR_PATH}, named as suggested by the picture itself.
     *
     * <p>Callers are expected to have checked {@code pTable.hasPicture(cr)} first, as
     * {@link #getWordAndStyle()} does.
     *
     * @param pTable pictures table of the document that owns {@code cr}
     * @param cr character run carrying the picture
     * @return path of the file that was written
     * @throws Exception if the file cannot be created or written
     */
    public static String readPicture(PicturesTable pTable, CharacterRun cr) throws Exception {
        Picture pic = pTable.extractPicture(cr, false);
        String pictureFilePath = PICTURE_DIR_PATH + pic.suggestFullFileName();
        // Close the stream so the image is flushed and the file handle released.
        try (OutputStream out = new FileOutputStream(new File(pictureFilePath))) {
            pic.writeImageContent(out);
        }
        return pictureFilePath;
    }

    /**
     * Detail: reads the source files and filters out garbled characters, keeping only the
     * characters matched by {@link #unicodeReg}.
     * Author: ラピスラズリ(Dawn)
     * Date: 2020-04-22 17:25:17
     *
     * @return the whitelisted characters of all documents, one per element, in reading order
     * @throws Exception if a document cannot be opened or parsed
     */
    public static List<String> readWordFile() throws Exception {
        Pattern whitelist = unicodeReg;
        List<String> kept = new ArrayList<String>();
        for (String character : splitIntoCharacters(readParagraphs())) {
            if (whitelist.matcher(character).matches()) {
                kept.add(character);
            }
        }
        return kept;
    }

    /**
     * Detail: reads the source files without filtering, so garbled characters are kept.
     * Author: ラピスラズリ(Dawn)
     * Date: 2020-04-22 17:24:51
     *
     * @return every character of all documents, one per element, in reading order
     * @throws Exception if a document cannot be opened or parsed
     */
    public static List<String> readSourceWordFile() throws Exception {
        return splitIntoCharacters(readParagraphs());
    }

    /** Assembles the whitelist character class from the fixed prefix and {@link #PUNCTUATION}. */
    private static String buildUnicodeRegStr() {
        return "[a-zA-Z0-9\\u4e00-\\u9fa5" + String.join("", PUNCTUATION.split(",")) + "]";
    }

    /** Lists the entries of {@link #DIR_PATH}; empty when it is missing, not a directory or unreadable. */
    private static File[] listSourceFiles() {
        File[] files = new File(DIR_PATH).listFiles();
        return files != null ? files : new File[0];
    }

    /** Collects the cleaned paragraph text of every .doc and .docx file in {@link #DIR_PATH}. */
    private static List<String> readParagraphs() throws Exception {
        List<String> paragraphs = new ArrayList<String>();
        for (File file : listSourceFiles()) {
            if (!file.isFile()) {
                continue;
            }
            String path = file.getAbsolutePath();
            if (path.endsWith(".doc")) {
                try (InputStream stream = new FileInputStream(file);
                        HWPFDocument document = new HWPFDocument(stream);
                        WordExtractor extractor = new WordExtractor(document)) {
                    for (String paragraph : extractor.getParagraphText()) {
                        paragraphs.add(cleanParagraph(paragraph));
                    }
                }
            } else if (path.endsWith(".docx")) {
                try (InputStream stream = new FileInputStream(file);
                        XWPFDocument document = new XWPFDocument(stream)) {
                    for (XWPFParagraph paragraph : document.getParagraphs()) {
                        paragraphs.add(cleanParagraph(paragraph.getParagraphText()));
                    }
                }
            }
        }
        return paragraphs;
    }

    /** Strips all whitespace, then removes the literal equation field marker left in the text. */
    private static String cleanParagraph(String paragraph) {
        // String.replace is literal; replaceAll would treat the '.' as "any character".
        return CharMatcher.whitespace().removeFrom(paragraph).replace("EMBEDEquation.3", "");
    }

    /** Splits each string into one-element-per-code-point strings, keeping surrogate pairs intact. */
    private static List<String> splitIntoCharacters(List<String> paragraphs) {
        List<String> characters = new ArrayList<String>();
        for (String paragraph : paragraphs) {
            int index = 0;
            while (index < paragraph.length()) {
                int width = Character.charCount(paragraph.codePointAt(index));
                characters.add(paragraph.substring(index, index + width));
                index += width;
            }
        }
        return characters;
    }

}

猜你喜欢

转载自www.cnblogs.com/JimmySeraph/p/12753631.html