Converting Word documents to HTML in Java while preserving the content's styles and formatting


title: Converting Word documents to HTML in Java while preserving the content's styles and formatting
date: 2023-08-11
categories:
  - Backend
tags:
  - file operation
  - Summarize

This post shows how to read a Word document (.doc or .docx) in Java and convert it to HTML while preserving the content's styles and formatting.

Maven dependencies (pom.xml)

   <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>3.15</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>3.15</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml-schemas</artifactId>
            <version>3.15</version>
        </dependency>
 
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>3.15</version>
        </dependency>
 
 
        <dependency>
            <groupId>fr.opensagres.xdocreport</groupId>
            <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
            <version>1.0.6</version>
        </dependency>

java

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.fasterxml.jackson.core.JsonProcessingException;
import org.apache.commons.fileupload.FileItem;
import org.apache.commons.fileupload.FileItemFactory;
import org.apache.commons.fileupload.disk.DiskFileItemFactory;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.springframework.web.multipart.MultipartFile;
import org.springframework.web.multipart.commons.CommonsMultipartFile;
import org.w3c.dom.Document;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;


/**
 * Demo utility that converts uploaded Word documents to HTML text.
 *
 * <p>{@link #docToHtmlText(MultipartFile)} handles the legacy binary {@code .doc} format via
 * POI's {@code WordToHtmlConverter}; {@link #docxToHtmlText(MultipartFile)} handles
 * {@code .docx} via the xdocreport {@code XHTMLConverter}. Both return an empty string when
 * the conversion fails and log the cause.
 *
 * @author: konglignxin
 * @date: 2023-04-04 16:54
 */
public class Test {

    private static final Logger LOGGER = Logger.getLogger(Test.class.getName());

    /** Default input used by {@link #main(String[])} when no path argument is given. */
    private static final String DEFAULT_INPUT_PATH = "E:\\temp.docx";

    /**
     * Converts an uploaded legacy Word ({@code .doc}) document to HTML.
     *
     * <p>No pictures manager is registered on the converter here. To publish embedded
     * images, call {@code wordToHtmlConverter.setPicturesManager(...)} with a callback that
     * uploads each picture to your own file server and returns its URL (return a placeholder
     * text for unrecognised picture types or failed uploads).
     *
     * @param file the uploaded {@code .doc} file
     * @return the generated HTML, or an empty string if the conversion failed
     * @throws Exception kept for backward compatibility; conversion failures are logged and
     *                   reported through the empty return value
     */
    public static String docToHtmlText(MultipartFile file) throws Exception {
        // Collect the serialized HTML in memory. A ByteArrayOutputStream needs neither
        // buffering nor closing, so the transformer writes to it directly.
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        // Only the upload stream is managed here; it is closed once conversion is done.
        try (InputStream in = file.getInputStream()) {
            HWPFDocument wordDocument = new HWPFDocument(in);
            Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(document);

            // Word document -> HTML DOM
            wordToHtmlConverter.processDocument(wordDocument);
            Document htmlDocument = wordToHtmlConverter.getDocument();

            // HTML DOM -> UTF-8 bytes
            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            serializer.transform(new DOMSource(htmlDocument), new StreamResult(baos));

            // Decode with the same charset the serializer was told to write; relying on the
            // platform default corrupts non-ASCII text on non-UTF-8 systems.
            return new String(baos.toByteArray(), StandardCharsets.UTF_8);
        } catch (Exception e) {
            LOGGER.log(Level.WARNING, "Failed to convert .doc document to HTML", e);
            return "";
        }
    }

    /**
     * Converts an uploaded {@code .docx} document to HTML.
     *
     * <p>Embedded images are extracted into a private temporary directory that is always
     * removed before this method returns, so the image paths in the returned HTML are only
     * meaningful if {@link #replaceImageLinks(String, String)} rewrites them.
     *
     * @param file the uploaded {@code .docx} file
     * @return the generated HTML, or an empty string if the conversion failed
     * @throws Exception kept for backward compatibility; conversion failures are logged and
     *                   reported through the empty return value
     */
    public static String docxToHtmlText(MultipartFile file) throws Exception {
        File imageRoot = null;
        try (InputStream in = file.getInputStream();
             ByteArrayOutputStream htmlStream = new ByteArrayOutputStream()) {
            XWPFDocument docxDocument = new XWPFDocument(in);

            // A unique directory per call: a timestamp-based name can collide when two
            // uploads are converted at the same time.
            imageRoot = Files.createTempDirectory("docx2html").toFile();
            String imageRootPath = imageRoot.getAbsolutePath();

            // Extract images below imageRoot and make the HTML reference them there.
            XHTMLOptions options = XHTMLOptions.create();
            options.setExtractor(new FileImageExtractor(imageRoot));
            options.URIResolver(new BasicURIResolver(imageRootPath));

            docxDocument.createNumbering();
            XHTMLConverter.getInstance().convert(docxDocument, htmlStream, options);
            // NOTE(review): decoded with the platform default charset, as before; confirm
            // which charset this XHTMLConverter version writes before changing it.
            String html = htmlStream.toString();

            return replaceImageLinks(html, imageRootPath + "/word/media");
        } catch (Exception e) {
            LOGGER.log(Level.WARNING, "Failed to convert .docx document to HTML", e);
            return "";
        } finally {
            // Clean up on every path; a failed cleanup must not discard a good result.
            if (imageRoot != null) {
                FileUtils.deleteQuietly(imageRoot);
            }
        }
    }

    /**
     * Hook for publishing the images extracted from a {@code .docx} document.
     *
     * <p>Currently a placeholder that returns {@code html} unchanged.
     *
     * @param html         the converted HTML, which refers to images by their local path
     * @param imageDirPath directory containing the extracted images
     * @return the HTML with image links rewritten (currently unchanged)
     */
    private static String replaceImageLinks(String html, String imageDirPath) {
        String[] imageNames = new File(imageDirPath).list();
        if (imageNames == null) {
            // No image directory: the document has no extracted images.
            return html;
        }
        String result = html;
        for (String imageName : imageNames) {
            String localImagePath = imageDirPath + "/" + imageName;
            if (!new File(localImagePath).exists()) {
                continue;
            }
            // TODO: upload the file at localImagePath to your own file server and, on
            // success, rewrite the link:
            //     result = result.replace(localImagePath, uploadedUrl);
            // If a file extension is needed for the remote name, take it from
            // imageName.lastIndexOf('.') and handle names without a dot.
            // Log and skip a single failed upload rather than aborting the conversion.
        }
        return result;
    }

    /**
     * Command-line demo: converts a Word file and prints the resulting HTML.
     *
     * @param args optional; {@code args[0]} is the path of the file to convert. Without it
     *             the default path {@code E:\temp.docx} is used. Paths ending in
     *             {@code .doc} use the legacy converter, everything else the docx converter.
     */
    public static void main(String[] args) {
        String inputPath = args != null && args.length > 0 ? args[0] : DEFAULT_INPUT_PATH;
        // Case-insensitive ".doc" suffix check; ".docx" does not match.
        boolean legacyDoc = inputPath.regionMatches(true, inputPath.length() - 4, ".doc", 0, 4);
        try {
            MultipartFile upload = getMulFileByPath(inputPath);
            String content = legacyDoc ? docToHtmlText(upload) : docxToHtmlText(upload);
            System.out.println(content);
        } catch (Exception e) {
            LOGGER.log(Level.SEVERE, "Conversion of " + inputPath + " failed", e);
        }
    }

    /**
     * Wraps a file on disk in a {@link MultipartFile}, so the converters can be exercised
     * without a real HTTP upload.
     *
     * @param picPath path of the file to wrap
     * @return a multipart file backed by a copy of the file's content
     */
    public static MultipartFile getMulFileByPath(String picPath) {
        return new CommonsMultipartFile(createFileItem(picPath));
    }

    /**
     * Creates a commons-fileupload {@link FileItem} named {@code MyFileName<ext>} and copies
     * the content of {@code filePath} into it.
     *
     * <p>If the copy fails, the error is logged and the (possibly incomplete) item is still
     * returned, matching the previous best-effort behaviour.
     *
     * @param filePath path of the source file
     * @return the populated file item
     */
    private static FileItem createFileItem(String filePath) {
        File sourceFile = new File(filePath);

        // Take the extension from the file name only, so a dot in a directory name is not
        // mistaken for it and a name without a dot yields no extension instead of throwing.
        String fileName = sourceFile.getName();
        int dotIndex = fileName.lastIndexOf('.');
        String extension = dotIndex >= 0 ? fileName.substring(dotIndex) : "";

        FileItemFactory factory = new DiskFileItemFactory(16, null);
        FileItem item = factory.createItem("textField", "text/plain", true,
                "MyFileName" + extension);

        // try-with-resources closes both streams even when the copy fails midway.
        try (InputStream in = new FileInputStream(sourceFile);
             OutputStream out = item.getOutputStream()) {
            byte[] buffer = new byte[8192];
            int bytesRead;
            while ((bytesRead = in.read(buffer)) != -1) {
                out.write(buffer, 0, bytesRead);
            }
        } catch (IOException e) {
            LOGGER.log(Level.WARNING, "Failed to copy " + filePath + " into the file item", e);
        }
        return item;
    }

}

Guess you like

Source: blog.csdn.net/china_coding/article/details/132642519