Word文件转为HTML文件

花了一天时间，整理了网上查到的把 Word 转为 HTML 的方法，既支持旧版 Word（.doc），也支持 Word 2007 及以上版本（.docx）的转换。

代码如下（整合了前辈们的代码）：

参考博客地址:http://blog.csdn.net/ptzrbin/article/details/43449701

http://blog.csdn.net/u011687117/article/details/29561027

package data.util; 

import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.util.List;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.jsoup.Jsoup;
import org.w3c.dom.Document;  

import org.xml.sax.ContentHandler;

import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.IURIResolver;
import org.apache.poi.xwpf.converter.xhtml.DefaultContentHandlerFactory;
import org.apache.poi.xwpf.converter.xhtml.IContentHandlerFactory;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

/**
 * Utility for converting Word documents to HTML files and for reading the
 * generated HTML back as a string.
 *
 * <p>Legacy {@code .doc} files are converted with {@code WordToHtmlConverter};
 * {@code .docx} files are converted with {@code XHTMLConverter}. All methods
 * are static.
 */
public class WechatWord2Html {

    /** Charset used to serialize the .doc HTML and to write the output file. */
    private static final String OUTPUT_CHARSET = "GB2312";

    /** Charset used by {@link #readfile(String)} to decode an HTML file. */
    private static final String INPUT_CHARSET = "GBK";

    /** Prefix of the image links emitted into the HTML of a .doc conversion. */
    private static final String DOC_IMAGE_HREF_PREFIX = "test/";

    /** Directory that pictures extracted from a .doc file are written to. */
    private static final String DOC_IMAGE_DIR = "D:/test/";

    /** Image extraction directory and image URI prefix for a .docx conversion. */
    private static final String DOCX_IMAGE_DIR = "D:/";

    /**
     * Normalizes the given HTML with jsoup and writes it to {@code path},
     * encoded as GB2312.
     *
     * <p>This method is best-effort: an I/O failure is reported on standard
     * error and is not propagated to the caller.
     *
     * @param content HTML markup to write
     * @param path    path of the file to create or overwrite
     */
    public static void writeFile(String content, String path) {
        String html = Jsoup.parse(content).html();
        // Both resources are declared separately so the file stream is closed
        // even when constructing the writer fails.
        try (FileOutputStream fos = new FileOutputStream(new File(path));
             BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fos, OUTPUT_CHARSET))) {
            bw.write(html);
        } catch (IOException ioe) {
            // Covers FileNotFoundException, write failures and close failures.
            ioe.printStackTrace();
        }
    }

    /**
     * Converts a Word document to an HTML file.
     *
     * <p>Required jars: ooxml-schemas-1.1.jar,
     * org.apache.poi.xwpf.converter.core-1.0.4.jar,
     * org.apache.poi.xwpf.converter.xhtml-1.0.4.jar.
     *
     * <p>The extension is matched case-insensitively. Any value other than
     * {@code doc} or {@code docx} (including {@code null}) is ignored and no
     * output is produced.
     *
     * @param filePath          path of the Word file to convert
     * @param outPutFile        path of the HTML file to write
     * @param fileNameExtension extension of the Word file, {@code "doc"} or {@code "docx"}
     * @throws TransformerException         if the .doc HTML document cannot be serialized
     * @throws IOException                  if the Word file cannot be read or an image cannot be written
     * @throws ParserConfigurationException if no DOM document builder can be created
     */
    public static void convert2Html(String filePath, String outPutFile, String fileNameExtension)
            throws TransformerException, IOException, ParserConfigurationException {
        if ("doc".equalsIgnoreCase(fileNameExtension)) {
            convertDoc(filePath, outPutFile);
        } else if ("docx".equalsIgnoreCase(fileNameExtension)) {
            convertDocx(filePath, outPutFile);
        }
    }

    /** Converts a legacy .doc file to HTML and saves its embedded pictures. */
    private static void convertDoc(String filePath, String outPutFile)
            throws TransformerException, IOException, ParserConfigurationException {
        try (InputStream in = new FileInputStream(filePath)) {
            HWPFDocument wordDocument = new HWPFDocument(in);
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
            wordToHtmlConverter.setPicturesManager(new PicturesManager() {
                @Override
                public String savePicture(byte[] content, PictureType pictureType,
                                          String suggestedName, float widthInches,
                                          float heightInches) {
                    // Only the link is produced here; the image bytes are
                    // written to disk by saveDocPictures.
                    return DOC_IMAGE_HREF_PREFIX + suggestedName;
                }
            });
            wordToHtmlConverter.processDocument(wordDocument);
            saveDocPictures(wordDocument);

            Document htmlDocument = wordToHtmlConverter.getDocument();
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, OUTPUT_CHARSET);
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "HTML");
            serializer.transform(new DOMSource(htmlDocument), new StreamResult(out));

            // Decode with the charset the serializer was told to emit instead
            // of the platform default, which may not be GB2312-compatible.
            writeFile(out.toString(OUTPUT_CHARSET), outPutFile);
        }
    }

    /** Writes every picture embedded in the .doc document to the image directory. */
    private static void saveDocPictures(HWPFDocument wordDocument) throws IOException {
        List<Picture> pics = wordDocument.getPicturesTable().getAllPictures();
        if (pics == null) {
            return;
        }
        for (Picture pic : pics) {
            try (FileOutputStream imageOut =
                         new FileOutputStream(DOC_IMAGE_DIR + pic.suggestFullFileName())) {
                pic.writeImageContent(imageOut);
            } catch (FileNotFoundException e) {
                // A picture that cannot be created is skipped so the remaining
                // pictures and the HTML are still produced.
                e.printStackTrace();
            }
        }
    }

    /** Converts a .docx file to HTML, extracting its images to the image directory. */
    private static void convertDocx(String filePath, String outPutFile) throws IOException {
        try (InputStream in = new FileInputStream(new File(filePath))) {
            XWPFDocument document = new XWPFDocument(in);
            XHTMLOptions options = XHTMLOptions.create();
            // Extract images
            options.setExtractor(new FileImageExtractor(new File(DOCX_IMAGE_DIR)));
            // Rewrite image URIs so they point at the extraction directory
            options.URIResolver(new IURIResolver() {
                @Override
                public String resolve(String uri) {
                    return DOCX_IMAGE_DIR + uri;
                }
            });
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            XHTMLConverter.getInstance().convert(document, out, options);
            // NOTE(review): the bytes are decoded with the platform default
            // charset, as before. Confirm which encoding XHTMLConverter writes
            // before pinning an explicit charset here.
            writeFile(new String(out.toByteArray()), outPutFile);
        }
    }

    /**
     * Reads a file and returns its content decoded as GBK.
     *
     * <p>This method is best-effort: if the file cannot be opened an empty
     * string is returned, and if reading fails part-way the content read so far
     * is returned. Failures are reported on standard error.
     *
     * @param filePath path of the file to read
     * @return the decoded content, possibly empty; never {@code null}
     */
    public static String readfile(String filePath) {
        // Collect raw bytes first and decode once, so a multi-byte character
        // that straddles a read boundary is not corrupted.
        ByteArrayOutputStream collected = new ByteArrayOutputStream();
        try (InputStream input = new FileInputStream(new File(filePath))) {
            byte[] chunk = new byte[1024];
            for (int n; (n = input.read(chunk)) != -1;) {
                collected.write(chunk, 0, n);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        try {
            return collected.toString(INPUT_CHARSET);
        } catch (IOException e) {
            // UnsupportedEncodingException: the runtime has no GBK charset.
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Returns the markup between the opening and closing body tags.
     *
     * <p>The opening tag may carry attributes. If there is no closing tag,
     * everything after the opening tag is returned. Tag names are matched in
     * lower case only, and an attribute value containing {@code >} is not
     * supported.
     *
     * @param val HTML markup, may be {@code null}
     * @return the body content, or an empty string if {@code val} is
     *         {@code null} or has no opening body tag
     */
    public static String getBody(String val) {
        if (val == null) {
            return "";
        }
        int open = findBodyOpenTag(val);
        if (open < 0) {
            return "";
        }
        int tagEnd = val.indexOf('>', open);
        if (tagEnd < 0) {
            return "";
        }
        int contentStart = tagEnd + 1;
        int contentEnd = val.indexOf("</body>", contentStart);
        if (contentEnd < 0) {
            contentEnd = val.length();
        }
        return val.substring(contentStart, contentEnd);
    }

    /** Returns the index of the opening body tag, or -1 if there is none. */
    private static int findBodyOpenTag(String html) {
        String marker = "<body";
        int from = 0;
        while (true) {
            int at = html.indexOf(marker, from);
            if (at < 0) {
                return -1;
            }
            int after = at + marker.length();
            if (after < html.length()) {
                char next = html.charAt(after);
                // Reject longer tag names that merely start with "body".
                if (next == '>' || next == '/' || Character.isWhitespace(next)) {
                    return at;
                }
            }
            from = after;
        }
    }
}

Word文件转为HTML文件

猜你喜欢