Merging Word .doc and .docx files

I previously used Jacob to merge .doc files, but it only works in a Windows environment, so I abandoned it. The approach described below goes through HTML instead: the Word files are opened and converted to HTML.
 
My task is to download and merge attachments. Most of the attachments are .doc files, with a small number of .docx files. The file paths are read from the database and carry no file extension, so the usual XWPFDocument and HWPFDocument classes do not solve the problem on their own; merging in particular is not easy to do directly, because it requires parsing the documents.
The method I use is to convert each Word file to HTML, so that merging Word files becomes merging HTML. This reduces the difficulty, but afterwards the merged HTML has to be converted back into a .doc or .docx file (at that point you can choose which format to produce).
The conversion takes two routes: one for .doc files and one for .docx files. The conversion must carry over the document formatting (including images and tables, which I have not tested). If a file has no extension, you need to determine whether it is a .doc or a .docx file. A utility method is used for this: it determines the file type from the file header. Because I only need to distinguish .doc from .docx, I compare the first four bytes of the file, written as a hexadecimal string, and use an if/else to choose between the two conversion routes. (The header signatures of specific file types can be looked up online; there are many of them.)
When converting a .doc file, the document is read character by character; the font, color and style of each character are examined and turned into HTML code, and the pieces are accumulated into one HTML string. Because I am merging, I use a for loop to append the body of each file, and after the loop finishes I add the HTML header and trailer. In addition, where required, a page break has to be inserted between documents. A page break can be expressed in HTML as "<br clear=all style='page-break-before:always' mce_style='page-break-before:always'>", which is simply appended.
The .docx files are converted next. Here the file is not read character by character as with .doc files; instead the whole document is converted directly to HTML. Printing the resulting string shows that the converted HTML is one big <div>. If this code is merged in directly the formatting is not uniform, so the styling on that outer div needs to be removed: the string is cut right after the opening tag and an empty <div> is added in its place.

Finally, the HTML document is converted to .doc or .docx. The output path can be placed on the server, and the file can then be offered for download (it is advisable to delete the file after it has been downloaded, so that the path can be reused). The detailed code follows.

 
 
package com.landray.kmss.km.doc.util;
 
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;
 
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableIterator;
import org.apache.poi.hwpf.usermodel.TableRow;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
 
public class WordExcelToHtml {
 
    /**
     * Carriage return ASCII code
     */
    private static final short ENTER_ASCII = 13;
 
    /**
     * Space character ASCII code
     */
    private static final short SPACE_ASCII = 32;
 
    /**
     * Horizontal tab ASCII code
     */
    private static final short TABULATION_ASCII = 9;
 
    // public static String htmlText = "";
    public static String mainText = "";
    public static String htmlTextTbl = "";
    public static int counter = 0;
    public static int beginPosi = 0;
    public static int endPosi = 0;
    public static int beginArray[];
    endArray public static int [];
    public static String htmlTextArray[];
    public static boolean tblExist = false;
 
    public static void main(String argv[]) {
        try {
            String htmlText = "<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"
                    + "</ Head> <body>"; // each of the body portion out of the Word, HTML plus after the merger of the head and tail, it is to be noted that encoding
            List<String> list = new ArrayList<String>();
            String file1 = "D://file8";
            String file2 = "D://file9";
            String file3 = "D://file11";
            list.add(file1);
            list.add(file2);
            list.add(file3);
 
            // String mainText1 = "";
            for (int i = 0; i < list.size(); i++) {
                htmlText += getWordAndStyle(list.get(i))
                        + "<br clear=all style='page-break-before:always' mce_style='page-break-before:always'> ";
// read each document after the finish, add a page break, continued to accumulate
            }
            htmlText += "</body></html>";
            String filePath = "D://1.html";
            writeFile(htmlText, filePath);
            new HtmlToDoc().writeWordFile(filePath, "D://file10.doc");
        } catch (Exception e) {
            e.printStackTrace ();
        }
    }
 
    /**
     * Read each text style
     *
     * @param fileName
     * @throws Exception
     */
 
    public static String getWordAndStyle(String fileName) throws Exception {
        String htmlText = "";
        FileInputStream in = new FileInputStream(new File(fileName));
        // judgment is based on the text doc or docx
        byte[] b = new byte[4];
        in.read(b, 0, b.length);
        in.close();
        FileInputStream in1 = new FileInputStream(new File(fileName));
        System.out.println(bytesToHexString(b) + ";;;");
        if (bytesToHexString (b) .equalsIgnoreCase ( "d0cf11e0")) {// "d0cf11e0" represents the file doc
            HWPFDocument doc = new HWPFDocument(in1);
 
            Range rangetbl = doc.getRange (); // get the document read range
            TableIterator it = new TableIterator (rangetbl);
            int num = 1;
 
            beginning array = new int [num];
            endArray = new int [num];
            htmlTextArray = new String[num];
  readTable(it, rangetbl);
            // Get the total number of characters in the document
            int length = doc.characterLength();
            // Create a picture of the container;
            PicturesTable pTable = doc.getPicturesTable();
 
            int cur = 0;
            String tempString = "";
            for (int i = 0; i < length - 1; i++) {
                // entire article to judge character by a character, range to get the document range
                Range range = new Range(i, i + 1, doc);
 
                CharacterRun cr = range.getCharacterRun(0);
 
                if (tblExist && cur < beginArray.length) {
                    if (i == beginArray[cur]) {
                        htmlText += tempString + htmlTextArray[cur];
                        tempString = "";
                        i = endArray [cur] - 1;
                        cur++;
                        continue;
                    }
                }
                if (pTable.hasPicture(cr)) {
//htmlText += tempString;
// 读写图片
tempString = readPicture(pTable, cr);
//tempString = "";
htmlText += tempString;
                } else {
 
                    Range range2 = new Range(i + 1, i + 2, doc);
                    // The second character
                    CharacterRun cr2 = range2.getCharacterRun(0);
                    char c = cr.text().charAt(0);
 
                    // determines whether a carriage return
                    if (c == ENTER_ASCII) {
                        tempString += "<br/>";
 
                    }
                    // determine whether whitespace
                    else if (c == SPACE_ASCII)
                        tempString += " ";
                    // determines whether horizontal tab
                    else if (c == TABULATION_ASCII)
                        tempString += "    ";
                    2 // characters before and after comparison have the same format
                    boolean flag = compareCharStyle(cr, cr2);
                    String fontStyle = "<span class='text' style=\"font-family:"
                            + cr.getFontName()
                            + ";font-size:"
                            + cr.getFontSize()
                            / 2
                            + "pt;color:"
                            + ColorUtils.getHexColor(cr.getIco24()) + ";";
 
                    if (cr.isBold())
                        fontStyle += "font-weight:bold;";
                    if (cr.isItalic())
                        fontStyle += "font-style:italic;";
 
                    htmlText += fontStyle + "\" mce_style=\"font-family:"
                            + cr.getFontName() + ";font-size:"
                            + cr.getFontSize() / 2 + "pt;";
 
                    if (cr.isBold())
                        fontStyle += "font-weight:bold;";
                    if (cr.isItalic())
                        fontStyle += "font-style:italic;";
 
                    htmlText += fontStyle + "\">" + tempString + cr.text()
                            + "</span>";
                    tempString = "";
                }
            }
 
            htmlText += tempString;
            return htmlText;
        } else {
            Word2007ToHtml w = new Word2007ToHtml();
            String filepath = "";
            String fileName1 = fileName;
            String htmlName = "D://3.html";
            w.Word2007ToHtml (fileName1, Editor);
            String result = w.readFileByBytes(htmlName);
            int i = result.indexOf('>');
            String realreasult = "<div>"+result.substring(i+1);
            System.out.println(realreasult);
            htmlText += realreasult;
            return htmlText;
        }
 
    }
 
    /**
     * Table read and write documents
     *
     * @Param pTable
     * @param cr
     * @throws Exception
     */
    public static void readTable(TableIterator it, Range rangetbl)
            throws Exception {
 
        htmlTextTbl = "";
        // iteration of the document table
 
        counter = -1;
        while (it.hasNext()) {
            tblExist = true;
            htmlTextTbl = "";
            Table tb = (Table) it.next();
            beginPosi = tb.getStartOffset ();
            endPosi = tb.getEndOffset ();
 
            System.out.println("............" + beginPosi + "...." + endPosi);
            counter = counter + 1;
            // Iterate Row, the default starting from 0
            beginning array [counter] = initial positions;
            endArray [counter] = endPosi;
 
            htmlTextTbl += "<table border='1'>";
            for (int i = 0; i < tb.numRows(); i++) {
                TableRow tr = tb.getRow(i);
 
                htmlTextTbl += "<tr >";
                // iterative sequence, the default starting from 0
                for (int j = 0; j < tr.numCells(); j++) {
                    TableCell td = tr.getCell (j); // Get the cell
                    int cellWidth = td.getWidth();
 
                    // Get the contents of a cell
                    for (int k = 0; k < td.numParagraphs(); k++) {
                        Paragraph para = td.getParagraph (k);
                        String s = para.text().toString().trim();
                        if (s == "") {
                            s = " ";
                        }
                        System.out.println(s);
                        htmlTextTbl += "<td class='text' width=" + cellWidth
                                + ">" + s + "</td>";
                        System.out.println(i + ":" + j + ":" + cellWidth + ":"
                                + s);
                    } // end for
                } // end for
            } // end for
            htmlTextTbl += "</table>";
            htmlTextArray[counter] = htmlTextTbl;
 
        } // end while
    }
 
    /**
     * Read and write documents picture
     *
     * @Param pTable
     * @param cr
     * @throws Exception
     */
    public static void readPicture(PicturesTable pTable, CharacterRun cr)
            throws Exception {
        // extract pictures
        Picture pic = pTable.extractPicture(cr, false);
        // returns POI recommended image file name
        String afileName = pic.suggestFullFileName();
        OutputStream out = new FileOutputStream(new File("e://test"
                + File.separator + afileName));
        pic.writeImageContent(out);
        // htmlText += "<img src=\"e://test//" + afileName
        // + "\" mce_src=\"e://test//" + afileName + "\"/>";
    }
 
    public static boolean compareCharStyle(CharacterRun cr1, CharacterRun cr2) {
        boolean flag = false;
        if (cr1.isBold() == cr2.isBold() && cr1.isItalic() == cr2.isItalic()
                && cr1.getFontName().equals(cr2.getFontName())
                && cr1.getFontSize() == cr2.getFontSize()) {
            flag = true;
        }
        return flag;
    }
 
    /**
     * Write file
     *
     * @Param s
     */
    public static void writeFile(String s, String filePath) {
        FileOutputStream fos = null;
        BufferedWriter bw = null;
        try {
            File file = new File(filePath);
            fos = new FileOutputStream(file);
            bw = new BufferedWriter(new OutputStreamWriter(fos));
            bw.write(s);
        } catch (FileNotFoundException fnfe) {
            fnfe.printStackTrace ();
        } Catch (IOException yes) {
            ioe.printStackTrace ();
        } finally {
            try {
                if (bw != null)
                    bw.close();
                if (fos != null)
                    fos.close();
            } catch (IOException ie) {
            }
        }
    }
 
    // determine the file type
    public static String bytesToHexString(byte[] src) {
        StringBuilder stringBuilder = new StringBuilder();
        if (src == null || src.length <= 0) {
            return null;
        }
        for (int i = 0; i < src.length; i++) {
            int v = src[i] & 0xFF;
            String hv = Integer.toHexString(v);
            if (hv.length() < 2) {
                stringBuilder.append(0);
            }
            stringBuilder.append(hv);
        }
        return stringBuilder.toString();
    }
}
 
 
Utility class for obtaining the font color:
package com.landray.kmss.km.doc.util;
/**
 * Helpers that turn a packed integer colour into a CSS hex colour string.
 *
 * <p>{@link #red(int)} reads bits 0-7, {@link #green(int)} bits 8-15 and
 * {@link #blue(int)} bits 16-23 of the packed value.
 */
public class ColorUtils {

    /** Returns bits 0-7 of {@code c}. */
    public static int red(int c) {
        return c & 0xFF;
    }

    /** Returns bits 8-15 of {@code c}. */
    public static int green(int c) {
        return (c >> 8) & 0xFF;
    }

    /** Returns bits 16-23 of {@code c}. */
    public static int blue(int c) {
        return (c >> 16) & 0xFF;
    }

    /**
     * Repacks {@code c} so that the red component sits in bits 16-23, green
     * in bits 8-15 and blue in bits 0-7.
     */
    public static int rgb(int c) {
        int r = red(c);
        int g = green(c);
        int b = blue(c);
        return (r << 16) | (g << 8) | b;
    }

    /**
     * Left-pads {@code rgb} with zeros to six characters; longer input is
     * returned unchanged.
     */
    public static String rgbToSix(String rgb) {
        StringBuilder padded = new StringBuilder();
        for (int missing = 6 - rgb.length(); missing > 0; missing--) {
            padded.append("0");
        }
        return padded.append(rgb).toString();
    }

    /**
     * Formats a packed colour as {@code #rrggbb}. The value {@code -1} is
     * treated as {@code 0}, which yields {@code #000000}.
     */
    public static String getHexColor(int color) {
        int effective = (color == -1) ? 0 : color;
        String hex = Integer.toHexString(rgb(effective));
        return "#" + rgbToSix(hex);
    }
}
 
Convert HTML file to doc file:
 package com.landray.kmss.km.doc.util;
 
import java.io.BufferedReader; 
import java.io.ByteArrayInputStream;
import java.io.File; 
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException; 
import java.io.InputStreamReader;
import java.nio.charset.Charset;   
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; 
 
 
// Converts .docx documents into HTML
package com.landray.kmss.km.doc.util;
 
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;

import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.Test;
 
public  class Word2007ToHtml{
    @Test
       public void Word2007ToHtml(String fileName,String htmlName) throws IOException {
 
           final String file = fileName;
           File f = new File(file); 
           if (!f.exists()) { 
               System.out.println("Sorry File does not Exists!"); 
           } else { 
 
 
                   //) loaded word document generation XWPFDocument objects 
                   InputStream in = new FileInputStream(f); 
                   XWPFDocument document = new XWPFDocument(in); 
 
                   //) XHTML parsing configuration (settings here IURIResolver to set the picture stored in the directory) 
                   File imageFolderFile = new File("D://"); 
                   XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile)); 
                   options.setExtractor(new FileImageExtractor(imageFolderFile)); 
                   options.setIgnoreStylesIfUnused(false); 
                   options.setFragment(true); 
 
                   //) will be converted to XHTML XWPFDocument 
                   File file1 = new File(htmlName);
                   OutputStream out = new FileOutputStream(file1); 
                   XHTMLConverter.getInstance().convert(document, out, options);
 
 
                   // You can also use an array of characters parsed content stream acquisition
   //                ByteArrayOutputStream baos = new ByteArrayOutputStream();
   //                XHTMLConverter.getInstance().convert(document, baos, options); 
   //                String content = baos.toString();
   //                System.out.println(content);
   //                 baos.close();
 
           } 
       } 
    public static void main(String[] args) throws IOException {
        Word2007ToHtml w = new Word2007ToHtml();
        String fileName = "D://file1.docx";
        String htmlName = "D://3.html";
        w.Word2007ToHtml (filename, Editor);
        String result = readFileByBytes(htmlName);
        System.out.println(result);
    }
     public static String readFileByBytes(String fileName) {
 
             String s="";
            File file = new File(fileName);
            Reader reader = null;
            try {
                //System.out.println ( "in characters read the contents of the file, read one byte at:");
                // once read a character
                reader = new InputStreamReader(new FileInputStream(file),"utf-8");
                int tempchar;
                while ((tempchar = reader.read()) != -1) {
                    // For the windows, \ r \ n together these two characters, represents a line break.
                    // But if these two characters are shown separately, it will change two lines.
                    // Thus, masked \ R & lt, or shield \ n. Otherwise, it will be a lot more blank lines.
                    if (((char) tempchar) != '\r') {
                        s +=(char) tempchar;
                    }
                }
                reader.close();
            } catch (Exception e) {
                e.printStackTrace ();
            }
            return s;
}
}
/ ** * html files into the doc * @author soildwang * * /   
public class HtmlToDoc {
    /**
               * Read html files to word *
               * @Param filepath html file path * @return           
               * * @throws Exception           */         
    public boolean writeWordFile(String filepath,String outfile) throws Exception {               
        boolean flag = false;                 
        ByteArrayInputStream bais = null;
        FileOutputStream fos = null;                 
        // String outfile = "D: //file8.doc"; // write path according to the actual situation                 
        try {                       
            if (!"".equals(outfile)) {                               
                File fileDir = new File(outfile);                               
                if (fileDir.exists()) {                                     
                    String content = readFile(filepath);                                     
                    byte b[] = content.getBytes();                                     
                    bais = new ByteArrayInputStream(b);                                     
                    POIFSFileSystem poifs = new POIFSFileSystem();                                     
                    DirectoryEntry directory = poifs.getRoot();                                     
                    DocumentEntry documentEntry =  directory.createDocument("WordDocument", bais);                                     
                    fos = new FileOutputStream(outfile);                                                                         
                    poifs.writeFilesystem(fos);                                     
                    bais.close();                                     
                    fos.close();                               
                    }                       
                }                   
            } catch (IOException e) {                       
                e.printStackTrace ();                 
                } finally {
                    if(fos != null) fos.close();                       
                    if(bais != null) bais.close();                 
                    }                 return flag;         
                    }             
    /**           
     * * Read the html file into a string * @param filename           
     * * @return           * @throws Exception           
     * */         
    public String readFile(String filename) throws Exception {                 
        StringBuffer buffer = new StringBuffer("");                 
        BufferedReader br = null;               
        try {                       
            br = new BufferedReader(new InputStreamReader(new  FileInputStream(new File(filename)),Charset.forName("utf-8")));                       
            buffer = new StringBuffer();                       
            while (br.ready())                               
                buffer.append((char) br.read());                 
            } catch (Exception e) {
                e.printStackTrace ();                 
                } finally {                       
                    if(br!=null) br.close();                 
                    }                 
        return buffer.toString();         
        }  
// local test                
    public static void main(String[] args) throws Exception {                 
        new HtmlToDoc () writeWordFile. ( "d: //1.html", "D: //file8.doc"); // write the file path according to the actual situation         
        } 
    }
 

Guess you like

Origin www.cnblogs.com/muliu/p/12146186.html