Java 操作 Office 套件工具类（持续更新中）

package com.qyw.utils;

import com.aspose.words.HtmlSaveOptions;
import fr.opensagres.poi.xwpf.converter.xhtml.Base64EmbedImgManager;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.StringBuilderWriter;
import org.apache.commons.lang3.StringUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.poi.hwpf.HWPFDocumentCore;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.converter.WordToHtmlUtils;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.fit.pdfdom.PDFDomTree;
import org.icepdf.core.pobjects.Page;
import org.icepdf.core.pobjects.graphics.text.PageText;
import org.icepdf.core.util.GraphicsRenderingHints;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.zwobble.mammoth.DocumentConverter;
import org.zwobble.mammoth.Result;

import javax.imageio.ImageIO;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

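/**
 * Utility methods for working with office documents: Word (doc/docx) text
 * extraction and HTML conversion, plus PDF text extraction, HTML conversion
 * and page rendering.
 *
 * @version 1.0
 */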
public class OfficeUtils {

    // Class-wide SLF4J logger.
    private static final Logger log = LoggerFactory.getLogger(OfficeUtils.class);

    // Private constructor: the class cannot be instantiated from outside.
    private OfficeUtils(){}

    // File extensions (lower case, without the dot) treated as Word documents.
    // NOTE(review): this is a public, mutable HashSet; callers can modify it.
    public static final Set<String> WORD_FILE_EXTENSION  = new HashSet<>(Arrays.asList("doc","docx"));

    // File extension (lower case, without the dot) treated as a PDF document.
    public static final String PDF_FILE_EXTENSION  = "pdf";

    /**
     * Checks whether the stream can be opened as a legacy Word (doc) document.
     * <p>
     * The check works by constructing a POI {@link WordExtractor} over the
     * stream; any exception raised while doing so is treated as "not a doc
     * file". The extractor is closed before returning so it does not leak.
     * The stream is read during the check, so callers should not expect to
     * re-read the same stream instance afterwards.
     *
     * @param inputStream stream holding the candidate document
     * @return {@code true} if a {@link WordExtractor} could be created,
     *         {@code false} if creating it threw any exception
     */
    public static boolean isDocFile(InputStream inputStream){
        WordExtractor extractor = null;
        try {
            extractor = new WordExtractor(inputStream);
            return true;
        } catch (Exception ignored) {
            // Deliberately ignored: failure to open is the negative answer.
            return false;
        } finally {
            IOUtils.closeQuietly(extractor);
        }
    }

    /**
     * Extracts the raw text of a legacy Word (doc) document.
     * <p>
     * Both the extractor and the supplied stream are closed quietly before
     * this method returns, whether or not extraction succeeded.
     *
     * @param inputStream stream holding the doc document
     * @return the text reported by the POI {@link WordExtractor}
     * @throws Exception if the extractor cannot be created or the text cannot be read
     */
    public static String extractDocRawText(InputStream inputStream) throws Exception{
        WordExtractor extractor = null;
        try {
            extractor = new WordExtractor(inputStream);
            return extractor.getText();
        } finally {
            IOUtils.closeQuietly(extractor);
            IOUtils.closeQuietly(inputStream);
        }
    }

    /**
     * Extracts the raw text of a docx document using the mammoth converter.
     * <p>
     * The supplied stream is closed quietly before this method returns,
     * whether or not extraction succeeded.
     *
     * @param inputStream stream holding the docx document
     * @return the value of mammoth's raw-text extraction result
     * @throws IOException if mammoth fails while reading the document
     */
    public static String extractDocxRawText(InputStream inputStream) throws IOException{
        try {
            Result<String> extraction = new DocumentConverter().extractRawText(inputStream);
            return extraction.getValue();
        } finally {
            IOUtils.closeQuietly(inputStream);
        }
    }

    /**
     * Extracts the raw text of a docx document using Apache POI.
     * <p>
     * The stream is opened as an {@link OPCPackage} and read through an
     * {@link XWPFWordExtractor}. Both the extractor and the supplied stream
     * are closed quietly before this method returns.
     *
     * @param inputStream stream holding the docx document
     * @return the text reported by the POI extractor
     * @throws Exception if the package cannot be opened or the text cannot be read
     */
    public static String extractDocxRawTextByPoi(InputStream inputStream) throws Exception{
        XWPFWordExtractor extractor = null;
        try {
            OPCPackage docxPackage = OPCPackage.open(inputStream);
            extractor = new XWPFWordExtractor(docxPackage);
            return extractor.getText();
        } finally {
            IOUtils.closeQuietly(extractor);
            IOUtils.closeQuietly(inputStream);
        }
    }

    /**
     * Converts a legacy Word (doc) document to an HTML string.
     * <p>
     * The document is loaded with POI, converted to a DOM tree by
     * {@link WordToHtmlConverter}, and serialized with a JAXP transformer
     * configured for UTF-8, indented HTML output. The serialized bytes are
     * decoded as UTF-8 so the result does not depend on the JVM's default
     * charset. The loaded POI document is closed quietly before returning.
     *
     * @param inputStream stream holding the doc document
     * @return the serialized HTML
     * @throws Exception if loading, conversion or serialization fails
     */
    public static  String convertDocToHtml(InputStream inputStream) throws Exception {
        HWPFDocumentCore wordDocument = null;
        try {
            wordDocument = WordToHtmlUtils.loadDoc(inputStream);
            WordToHtmlConverter converter = new WordToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
            converter.processDocument(wordDocument);
            Document htmlDocument = converter.getDocument();

            ByteArrayOutputStream out = new ByteArrayOutputStream();
            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "HTML");
            serializer.transform(new DOMSource(htmlDocument), new StreamResult(out));

            // Decode with the same charset the serializer was told to write.
            return new String(out.toByteArray(), StandardCharsets.UTF_8);
        } finally {
            IOUtils.closeQuietly(wordDocument);
        }
    }

    /**
     * Converts a docx document to an XHTML fragment string.
     * <p>
     * Conversion options: unused styles are ignored, output is a fragment,
     * header/footer pages are omitted, and images are embedded as Base64.
     * The {@link XWPFDocument} is closed when conversion finishes or fails.
     *
     * @param inputStream stream holding the docx document
     * @return the converted XHTML
     * @throws IOException if the document cannot be read or converted
     */
    public static String convertDocxToHtml(InputStream inputStream) throws IOException {
        try (XWPFDocument docxDocument = new XWPFDocument(inputStream)) {
            XHTMLOptions options = XHTMLOptions.create();
            options.setIgnoreStylesIfUnused(true);
            options.setFragment(true);
            options.setOmitHeaderFooterPages(true);
            // Embed images inline as Base64 instead of writing them to disk.
            options.setImageManager(new Base64EmbedImgManager());

            ByteArrayOutputStream htmlStream = new ByteArrayOutputStream();
            XHTMLConverter.getInstance().convert(docxDocument, htmlStream, options);
            // NOTE(review): decoded with the platform default charset, as before;
            // confirm which charset the converter uses when writing to the stream.
            return htmlStream.toString();
        }
    }

    /**
     * Converts a Word document (doc or docx) to HTML using Aspose.Words.
     * <p>
     * This is a best-effort conversion: any failure is logged and an empty
     * string is returned. Images referenced by the generated HTML are written
     * to the JVM temp directory ({@code java.io.tmpdir}).
     * <p>
     * The method is static because this class has a private constructor, so
     * an instance method could never be called from outside it.
     *
     * @param inputByte raw bytes of the Word document; {@code null} yields an empty string
     * @return the generated HTML, or an empty string if conversion failed
     */
    public static String word2Html(byte[] inputByte) {
        if (inputByte == null) {
            return "";
        }
        String html = "";
        try {
            com.aspose.words.Document document =
                    new com.aspose.words.Document(new ByteArrayInputStream(inputByte));
            ByteArrayOutputStream outStream = new ByteArrayOutputStream();
            HtmlSaveOptions htmlSaveOptions = new HtmlSaveOptions();
            htmlSaveOptions.setImagesFolder(System.getProperty("java.io.tmpdir"));
            document.save(outStream, htmlSaveOptions);
            // NOTE(review): assumes HtmlSaveOptions keeps its documented default
            // encoding (UTF-8); decode explicitly rather than with the platform charset.
            html = new String(outStream.toByteArray(), StandardCharsets.UTF_8);
        } catch (Exception e) {
            log.error("aspose Word2Html error", e);
        }
        return html;
    }

    /**
     * Converts a PDF document to an HTML string.
     * <p>
     * The PDF is loaded with PDFBox and written through a {@link PDFDomTree}
     * into an in-memory writer. The loaded {@link PDDocument} and the writer
     * are closed when the method returns; the supplied stream is not closed
     * explicitly by this method.
     *
     * @param inputStream stream holding the PDF document
     * @return the HTML produced by {@link PDFDomTree}
     * @throws Exception if the PDF cannot be loaded or converted
     */
    public static String convertPdftohtml(InputStream inputStream) throws Exception {
        try (PDDocument pdf = PDDocument.load(inputStream);
             StringBuilderWriter htmlWriter = new StringBuilderWriter(500)) {
            new PDFDomTree().writeText(pdf, htmlWriter);
            return htmlWriter.toString();
        }
    }

    /**
     * Extracts the text of every page of a PDF document using ICEpdf.
     * <p>
     * Page texts are trimmed and concatenated in page order with no separator.
     * Pages without extractable text lines are skipped. The ICEpdf document is
     * disposed and the supplied stream is closed quietly before returning,
     * whether or not extraction succeeded.
     *
     * @param inputStream stream holding the PDF document
     * @return the concatenated page text, empty if no page has text
     * @throws Exception if the PDF cannot be opened or a page cannot be read
     */
    public static String extractPdfText(InputStream inputStream) throws Exception{
        StringBuilder sb = new StringBuilder(500);
        org.icepdf.core.pobjects.Document document = new org.icepdf.core.pobjects.Document();
        try {
            document.setInputStream(inputStream, null);
            int numberOfPages = document.getNumberOfPages();

            for (int i = 0; i < numberOfPages; i++) {
                PageText pageText = document.getPageText(i);
                if (pageText != null && pageText.getPageLines() != null) {
                    sb.append(pageText.toString().trim());
                }
            }
        } finally {
            try {
                // Release the resources ICEpdf holds for this document.
                document.dispose();
            } finally {
                IOUtils.closeQuietly(inputStream);
            }
        }
        return sb.toString();
    }

 /**
     * Renders the pages of the given PDF files to PNG images using ICEpdf.
     * <p>
     * Files whose name does not end with {@link #PDF_FILE_EXTENSION}
     * (case-insensitive) are skipped. The target directory is created if it
     * does not exist. Pages are rendered at a fixed scale of 2.5 with no
     * rotation.
     * <p>
     * Output naming: a single-page PDF produces {@code <baseName>.png}; a
     * multi-page PDF produces {@code <baseName>_<pageNumber>.png} (1-based),
     * so that later pages no longer overwrite earlier ones.
     * <p>
     * A failure to write one image is logged and that page is skipped; other
     * failures propagate to the caller. The method is static because this
     * class has a private constructor, so an instance method could never be
     * called from outside it.
     *
     * @param pdfFiles    PDF files to render
     * @param toDirectory directory the PNG files are written to
     * @return the image files that were written, in processing order
     * @throws Exception if a PDF cannot be opened or a page cannot be rendered
     */
    public static List<File> pdfToImg(List<File> pdfFiles,String toDirectory) throws Exception{
        List<File> imageFiles = new ArrayList<>();
        File targetDir = new File(toDirectory);
        if (!targetDir.exists()) {
            targetDir.mkdirs();
        }
        final float scale = 2.5f;    // zoom factor applied when rendering a page
        final float rotation = 0f;   // rotation angle applied when rendering a page
        for (File pdfFile : pdfFiles) {
            if (!pdfFile.getName().toLowerCase().endsWith(PDF_FILE_EXTENSION)) {
                continue;
            }
            String baseName = FilenameUtils.getBaseName(pdfFile.getName());
            // Fully qualified: the unqualified name is org.w3c.dom.Document in this file.
            org.icepdf.core.pobjects.Document document = new org.icepdf.core.pobjects.Document();
            try (InputStream pdfStream = new FileInputStream(pdfFile)) {
                try {
                    document.setInputStream(pdfStream, null);
                    int pageCount = document.getNumberOfPages();
                    for (int i = 0; i < pageCount; i++) {
                        BufferedImage image = (BufferedImage) document.getPageImage(
                                i, GraphicsRenderingHints.SCREEN, Page.BOUNDARY_CROPBOX, rotation, scale);
                        // Keep the historical name for single-page PDFs; number the
                        // pages otherwise so each page gets its own file.
                        String fileName = pageCount > 1
                                ? baseName + "_" + (i + 1) + ".png"
                                : baseName + ".png";
                        File imageFile = new File(targetDir, fileName);
                        try {
                            ImageIO.write(image, "png", imageFile);
                            imageFiles.add(imageFile);
                        } catch (IOException e) {
                            log.error("Failed to write page {} of {} to {}",
                                    i + 1, pdfFile, imageFile, e);
                        } finally {
                            image.flush();
                        }
                    }
                } finally {
                    // Dispose before the try-with-resources closes the stream.
                    document.dispose();
                }
            }
        }
        return imageFiles;
    }

}

附上 Maven pom.xml 依赖配置：

   <repositories>
        <repository>
            <id>AsposeJavaAPI</id>
            <name>Aspose Java API</name>
            <url>https://repository.aspose.com/repo/</url>
        </repository>
    </repositories>

        <dependency>
            <groupId>org.icepdf.os</groupId>
            <artifactId>icepdf-core</artifactId>
            <version>6.2.2</version>
        </dependency>

        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.17</version>
        </dependency>

        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox-tools</artifactId>
            <version>2.0.17</version>
        </dependency>

        <dependency>
            <groupId>net.sf.cssbox</groupId>
            <artifactId>pdf2dom</artifactId>
            <version>1.8</version>
        </dependency>

        <dependency>
            <groupId>fr.opensagres.xdocreport</groupId>
            <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
            <version>1.0.6</version>
        </dependency>

          <dependency>
            <groupId>com.aspose</groupId>
            <artifactId>aspose-words</artifactId>
            <version>20.3</version>
            <classifier>jdk17</classifier>
        </dependency>

        <dependency>
            <groupId>fr.opensagres.xdocreport</groupId>
            <artifactId>fr.opensagres.xdocreport.converter.docx.xwpf</artifactId>
            <version>2.0.2</version>
        </dependency>

        <dependency>
            <groupId>fr.opensagres.xdocreport</groupId>
            <artifactId>fr.opensagres.xdocreport.document.docx</artifactId>
            <version>2.0.2</version>
        </dependency>

        <dependency>
           <groupId>org.apache.poi</groupId>
           <artifactId>poi-ooxml</artifactId>
           <version>4.1.2</version>
        </dependency>

        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>4.1.2</version>
        </dependency>

shadow_zed

发布了46 篇原创文章 · 获赞 251 · 访问量 90万+

他的留言板关注

java 操作office套件工具类,持续更新中

猜你喜欢