【Java】使用 POI + PDFBox 提取 Office 与 PDF 文件的文本内容

引入 Maven 依赖

<!-- Apache POI (core, OOXML and scratchpad modules), all at version 3.16 -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.16</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.16</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.16</version>
</dependency>

<!-- Apache PDFBox, used for PDF text extraction -->
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.4</version>
</dependency>

提取内容

/**
 * Extracts the plain-text content of a file, choosing the parser from the file-name extension.
 *
 * <p>Supported extensions (matched case-insensitively): {@code .txt}, {@code .doc}/{@code .docx},
 * {@code .xls}/{@code .xlsx}, {@code .ppt}/{@code .pptx} and {@code .pdf}. For Office files the
 * binary (OLE2) parser is tried first and the OOXML parser is used as a fallback, so a file whose
 * extension does not match its real container format is still read.
 *
 * <p>This method is best-effort: I/O and parse failures are reported on standard error and the
 * text gathered so far (possibly empty) is returned.
 *
 * @param file the file to read
 * @return the extracted text; an empty string when the extension is unsupported or parsing failed
 */
private static String read(File file) {
    StringBuilder builder = new StringBuilder();
    // Compare extensions case-insensitively so "REPORT.PDF" is handled like "report.pdf".
    String name = file.getName().toLowerCase(java.util.Locale.ROOT);

    if (name.endsWith(".txt")) {
        try {
            // Decode the whole file in one step: decoding fixed-size chunks independently corrupts
            // any multi-byte character that straddles a chunk boundary. The platform default
            // charset is kept so existing callers see the same decoding as before.
            byte[] bytes = java.nio.file.Files.readAllBytes(file.toPath());
            builder.append(new String(bytes, java.nio.charset.Charset.defaultCharset()));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    POITextExtractor extractor = null;
    if (name.endsWith(".doc") || name.endsWith(".docx")) {
        extractor = openWordExtractor(file);
    }
    if (name.endsWith(".xls") || name.endsWith(".xlsx")) {
        extractor = openExcelExtractor(file);
    }
    if (name.endsWith(".ppt") || name.endsWith(".pptx")) {
        extractor = openSlideExtractor(file);
    }
    if (extractor != null) {
        // try-with-resources closes the extractor even when getText() throws.
        try (POITextExtractor open = extractor) {
            builder.append(open.getText());
        } catch (IOException ignored) {
            // Only close() can raise IOException here; the text has already been appended.
        }
    }

    if (name.endsWith(".pdf")) {
        // The document is closed even when text stripping fails.
        try (PDDocument document = PDDocument.load(file)) {
            builder.append(new PDFTextStripper().getText(document));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return builder.toString();
}

/** Opens a Word extractor (binary format first, then OOXML); returns null if both parsers fail. */
private static POITextExtractor openWordExtractor(File file) {
    // The document constructors consume the stream, so it can be closed as soon as they return.
    try (FileInputStream in = new FileInputStream(file)) {
        return new WordExtractor(new HWPFDocument(in));
    } catch (Exception binaryFailure) {
        try (FileInputStream in = new FileInputStream(file)) {
            return new XWPFWordExtractor(new XWPFDocument(in));
        } catch (Exception xmlFailure) {
            xmlFailure.addSuppressed(binaryFailure);
            xmlFailure.printStackTrace();
            return null;
        }
    }
}

/** Opens an Excel extractor (binary format first, then OOXML); returns null if both parsers fail. */
private static POITextExtractor openExcelExtractor(File file) {
    POIFSFileSystem fs = null;
    try {
        fs = new POIFSFileSystem(file);
        return new ExcelExtractor(new HSSFWorkbook(fs));
    } catch (Exception binaryFailure) {
        if (fs != null) {
            // The file system was opened but the workbook could not be built: release the file.
            try {
                fs.close();
            } catch (IOException closeFailure) {
                binaryFailure.addSuppressed(closeFailure);
            }
        }
        try {
            return new XSSFExcelExtractor(new XSSFWorkbook(file));
        } catch (Exception xmlFailure) {
            xmlFailure.addSuppressed(binaryFailure);
            xmlFailure.printStackTrace();
            return null;
        }
    }
}

/** Opens a PowerPoint extractor (binary format first, then OOXML); returns null if both fail. */
private static POITextExtractor openSlideExtractor(File file) {
    try (FileInputStream in = new FileInputStream(file)) {
        return new PowerPointExtractor(in);
    } catch (Exception binaryFailure) {
        OPCPackage pkg = null;
        try {
            pkg = OPCPackage.open(file);
            return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
        } catch (Exception xmlFailure) {
            if (pkg != null) {
                // Close the package without writing anything back to the file.
                pkg.revert();
            }
            xmlFailure.addSuppressed(binaryFailure);
            xmlFailure.printStackTrace();
            return null;
        }
    }
}

猜你喜欢

转载自blog.csdn.net/T_amo/article/details/89493489
今日推荐