使用 Apache POI 将 Word 文档转换为 HTML

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocumentCore;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.converter.WordToHtmlUtils;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;

import com.maiyue.base.utils.FileUtils;

import fr.opensagres.poi.xwpf.converter.core.BasicURIResolver;
import fr.opensagres.poi.xwpf.converter.core.FileImageExtractor;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;

/**
 * Utility for converting Word documents to HTML files.
 *
 * <p>Binary {@code .doc} files are handled by {@link #docWordToHtml(String, String)} through
 * POI's HWPF converter; OOXML {@code .docx} files are handled by
 * {@link #docxToHtml(String, String, String)} through the opensagres XHTML converter.
 * Both methods log any failure and return {@code null} instead of throwing.
 *
 * <p>Example usage:
 * <pre>
 * docWordToHtml("D:/diagnosis/file/temp/test2003.doc", "D:/diagnosis/file/temp/test1.html");
 * String imagePath = "D:/diagnosis/file/temp/image";
 * String sourceFileName = "D:/diagnosis/file/temp/test2007.docx";
 * String targetFileName = "D:/diagnosis/file/temp/test2.html";
 * docxToHtml(sourceFileName, targetFileName, imagePath);
 * </pre>
 *
 * @author cwl
 */
public class POIWordToHtmlUtils {

    private static final Logger logger = LoggerFactory.getLogger(POIWordToHtmlUtils.class);

    /**
     * Converts a {@code .doc} Word document into an HTML file.
     *
     * <p>The document is loaded with {@link WordToHtmlUtils#loadDoc(InputStream)}, rendered
     * into a DOM by {@code WordImageToHtmlConverter} (a project class not shown here), and
     * serialized as UTF-8, indented HTML to {@code targetFilePath}.
     *
     * @param sourceFilePath path of the {@code .doc} file to read
     * @param targetFilePath path of the HTML file to write
     * @return {@code targetFilePath} on success, or {@code null} if any exception occurred
     *         (the exception is logged, not rethrown)
     */
    public static String docWordToHtml(String sourceFilePath, String targetFilePath) {
        // NOTE(review): presumably ensures the target's parent folder exists - confirm in FileUtils.
        FileUtils.createFileFolder(targetFilePath);
        // try-with-resources guarantees the source stream is closed on every path;
        // previously it was never closed and leaked a file handle per call.
        try (InputStream input = new FileInputStream(sourceFilePath)) {
            HWPFDocumentCore wordDocument = WordToHtmlUtils.loadDoc(input);
            WordToHtmlConverter wordToHtmlConverter = new WordImageToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
            wordToHtmlConverter.processDocument(wordDocument);

            Document htmlDocument = wordToHtmlConverter.getDocument();
            DOMSource domSource = new DOMSource(htmlDocument);
            StreamResult streamResult = new StreamResult(new File(targetFilePath));

            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            serializer.transform(domSource, streamResult);
            return targetFilePath;
        } catch (Exception e) {
            logger.error(".doc的word文档转换为html,发生异常,源文件={},目标文件={}", sourceFilePath, targetFilePath, e);
            return null;
        }
    }

    /**
     * Converts a {@code .docx} Word document into an HTML file.
     *
     * <p>Images are extracted through a {@link FileImageExtractor} rooted at {@code imagePath},
     * while image URIs in the generated HTML are resolved against the fixed base
     * {@code "image"}.
     *
     * @param sourceFilePath path of the {@code .docx} file to read
     * @param targetFilePath path of the HTML file to write (encoded as UTF-8)
     * @param imagePath      folder handed to the image extractor
     * @return {@code targetFilePath} on success, or {@code null} if conversion or closing any
     *         of the resources failed (the exception is logged, not rethrown)
     */
    public static String docxToHtml(String sourceFilePath, String targetFilePath, String imagePath) {
        // NOTE(review): presumably ensures the target's parent folder exists - confirm in FileUtils.
        FileUtils.createFileFolder(targetFilePath);
        // The source stream and the document are now closed on every path (both leaked before).
        try (InputStream input = new FileInputStream(sourceFilePath);
                XWPFDocument document = new XWPFDocument(input)) {
            XHTMLOptions options = XHTMLOptions.create();
            // Folder that receives the extracted images.
            options.setExtractor(new FileImageExtractor(new File(imagePath)));
            // Base path used for image references inside the HTML.
            options.URIResolver(new BasicURIResolver("image"));

            // The writer is opened only after the document loaded successfully, so a broken
            // source file does not leave an empty target file behind.
            try (OutputStreamWriter outputStreamWriter =
                    new OutputStreamWriter(new FileOutputStream(targetFilePath), "utf-8")) {
                XHTMLConverter xhtmlConverter = (XHTMLConverter) XHTMLConverter.getInstance();
                xhtmlConverter.convert(document, outputStreamWriter, options);
            }
            // A failure while closing any resource surfaces as an exception below, so the
            // method still returns null in that case without needing a return inside finally.
            return targetFilePath;
        } catch (Exception e) {
            logger.error(".docx的word文档转换为html,发生异常,源文件={},目标文件={}", sourceFilePath, targetFilePath, e);
            return null;
        }
    }

}

总结：

Word 2007（.docx）文档的目录转换后为空，已经解决该问题的同学麻烦告知实现方法；Word 2003（.doc）文档的目录虽然转换成功，但目录的第一行不正确。

猜你喜欢