代码生成docx数据表转换为xml文件

目录

 

背景

DOCX的数据表 

代码实现

生成XML格式

附件

总结


背景

  • 在参与软件设计文档的修改时,需要将docx文档中的数据表录入到xml文档中。第一次录入时,一个字段一个字段地录入实在太过麻烦,遇到有几十个字段的表,眼睛都看花了还是录不完,于是想着用代码去读取相应的表格,将其中的数据表生成xml文件。
  • 当然,在写脚本的时候,大家可能首先想到的是使用Python进行编写。由于我的电脑中没有安装Python环境,就使用已有的环境编写了Java代码进行转换。在读取表的时候,还有很多问题可以改进,而且由于文档的规范不同,处理还不是很全面,现在将自己实现的代码记录下来。

DOCX的数据表 

表格式一

 表格式二

针对遇到的这两种格式的表编写代码,生成相应的xml文件。

代码实现

Docx2XMLUtil.java

package docx2xml;

import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow;

import java.io.*;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.List;

/**
 * Converts every table found in a .docx document into a {@code <table>} XML fragment
 * containing one {@code <column>} element per data row, and appends the fragments to
 * an output file.
 *
 * <p>Two table layouts are handled. In both, the first row is a header and the cells
 * of each following row are read by position: field id, data type, (length), constraint,
 * description. When a header cell contains "长度" the length is read from its own
 * column; otherwise it is parsed from parentheses in the type cell, e.g. {@code VARCHAR(20)}.
 *
 * <p>Created 2019/12/4 15:22.
 *
 * @author StriveFarrell
 */
public class Docx2XMLUtil {
    /** Opening tag written for every table; id, javaId and name are left empty. */
    private static final String TABLE_OPEN_TAG = "<table id=\"\" javaId=\"\" name=\"\" >\n";
    /** Decorative separator line used around the remark block. */
    private static final String REM_RULE =
            "\t<rem>====================================================================</rem>\n";

    /** Positional meaning of the cells in a data row. */
    private static final int COL_ID = 0;
    private static final int COL_TYPE = 1;
    private static final int COL_SIZE = 2;
    private static final int COL_CONSTRAINT = 3;
    private static final int COL_NAME = 4;

    private String docxFilePath;
    private String xmlFileSavePath;
    private String author;

    public String getDocxFilePath() {
        return docxFilePath;
    }

    public void setDocxFilePath(String docxFilePath) {
        this.docxFilePath = docxFilePath;
    }

    public String getXmlFileSavePath() {
        return xmlFileSavePath;
    }

    public void setXmlFileSavePath(String xmlFileSavePath) {
        this.xmlFileSavePath = xmlFileSavePath;
    }

    public String getAuthor() {
        return author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    /**
     * Reads the configured input document and appends the XML for each of its tables
     * to the configured output file.
     *
     * <p>This method never throws: any failure is reported on standard error so that
     * existing callers keep working unchanged.
     */
    public void getTableData() {
        try {
            String filePath = getDocxFilePath();
            if (filePath == null) {
                System.err.println("No input document configured; call setDocxFilePath first.");
                return;
            }
            String lowerCasePath = filePath.toLowerCase();
            // Match the real extension including the dot, so "mydocx" is not mistaken for a .docx file.
            if (lowerCasePath.endsWith(".docx")) {
                docx2xml();
            } else if (lowerCasePath.endsWith(".doc")) {
                doc2xml();
            } else {
                System.err.println("Unsupported file type, expected .docx: " + filePath);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Converts every table of the .docx document and both prints and saves the result.
     *
     * @throws IOException if the document cannot be opened or read
     */
    private void docx2xml() throws IOException {
        // try-with-resources closes both the stream and the document, even on failure.
        try (FileInputStream in = new FileInputStream(getDocxFilePath());
             XWPFDocument document = new XWPFDocument(in)) {
            String tableHeaderInfo = getTableHeader();
            String remInfo = getRemInfo();
            Iterator<XWPFTable> tables = document.getTablesIterator();
            int tableIndex = 1;
            while (tables.hasNext()) {
                String xmlString = tableHeaderInfo
                        + remInfo
                        + getTableColumn(tables.next())
                        + getEndTableTag()
                        + "\n\n\n\n";
                testPrint(String.valueOf(tableIndex), xmlString);
                saveXml(xmlString);
                tableIndex++;
            }
        }
    }

    /**
     * Placeholder for the legacy binary .doc format, which is not implemented.
     * Reports the fact instead of silently producing no output.
     */
    private void doc2xml() {
        System.err.println("Legacy .doc files are not supported; save the document as .docx first: "
                + getDocxFilePath());
    }

    /**
     * Prints a labelled block of output to standard out (debugging aid).
     *
     * @param message label printed before the output
     * @param out     text to print
     */
    private void testPrint(String message, String out) {
        System.out.println(message + ":\n" + out);
    }

    /**
     * Returns the current date formatted as {@code yyyy.MM.dd}.
     *
     * @return today's date as text
     */
    private String getDate() {
        SimpleDateFormat df = new SimpleDateFormat("yyyy.MM.dd");
        return df.format(new Date());
    }

    /**
     * Appends the given text to the output file. The file is opened in append mode,
     * so running the conversion twice adds the tables a second time.
     *
     * @param data text to append
     */
    private void saveXml(String data) {
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(getXmlFileSavePath(), true))) {
            writer.write(data);
        } catch (Exception e) {
            // Best effort, as before: report the failure and let the remaining tables be processed.
            e.printStackTrace();
        }
    }

    /**
     * Tells whether a data type carries a length.
     *
     * @param cell data type name
     * @return {@code false} for DATETIME, TEXT, TIMESTAMP and LONGTEXT (case-insensitive),
     *         {@code true} for everything else
     */
    private boolean hasSize(String cell) {
        return !("DATETIME".equalsIgnoreCase(cell)
                || "TEXT".equalsIgnoreCase(cell)
                || "TIMESTAMP".equalsIgnoreCase(cell)
                || "LONGTEXT".equalsIgnoreCase(cell));
    }

    /**
     * Detects the table layout from its header row.
     *
     * @param header cells of the first row
     * @return {@code true} when some header cell contains "长度", in which case the length
     *         is read from its own column; {@code false} when the length has to be parsed
     *         out of the type cell
     */
    private boolean getTableStyle(List<XWPFTableCell> header) {
        for (XWPFTableCell headerCell : header) {
            if (headerCell.getText().contains("长度")) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns the opening {@code <table>} tag with empty id, javaId and name attributes.
     *
     * @return the opening tag followed by a newline
     */
    private String getTableHeader() {
        return TABLE_OPEN_TAG;
    }

    /**
     * Returns the closing tag of a table element.
     *
     * @return {@code </table>}
     */
    private String getEndTableTag() {
        // XML end tags use a forward slash; the previous "<\table>" was not well-formed.
        return "</table>";
    }

    /**
     * Builds the remark block holding the author and the current date.
     * A missing author is written as an empty string.
     *
     * @return four {@code <rem>} lines
     */
    private String getRemInfo() {
        return REM_RULE
                + "\t<rem> 输入人:" + escapeXml(author) + "\t输入时间:" + getDate() + "</rem>\n"
                + "\t<rem>table description</rem>\n"
                + REM_RULE;
    }

    /**
     * Builds one {@code <column>} element for every data row of the table.
     * The first row is treated as the header and is only used to detect the layout.
     *
     * @param table table to convert
     * @return the column elements, one per line; empty when the table has no rows
     */
    private String getTableColumn(XWPFTable table) {
        List<XWPFTableRow> rowList = table.getRows();
        if (rowList == null || rowList.isEmpty()) {
            return "";
        }
        boolean separateSizeColumn = getTableStyle(rowList.get(0).getTableCells());

        StringBuilder columns = new StringBuilder();
        for (int i = 1; i < rowList.size(); i++) {
            List<XWPFTableCell> cellList = rowList.get(i).getTableCells();
            if (cellList.isEmpty()) {
                continue;
            }
            columns.append(buildColumnTag(cellList, separateSizeColumn));
        }
        return columns.toString();
    }

    /**
     * Builds a single, always self-closed {@code <column>} element from the cells of a row.
     * Cells beyond the fifth are ignored; missing cells simply leave their attribute out.
     *
     * @param cellList           cells of the data row
     * @param separateSizeColumn whether the length lives in its own column
     * @return the element followed by a newline
     */
    private String buildColumnTag(List<XWPFTableCell> cellList, boolean separateSizeColumn) {
        StringBuilder tag = new StringBuilder("\t<column");
        boolean expectSize = false;
        for (int j = 0; j < cellList.size(); j++) {
            String cell = cellList.get(j).getText().trim().toUpperCase();
            switch (j) {
                case COL_ID:
                    appendAttribute(tag, "id", cell);
                    break;
                case COL_TYPE:
                    if (separateSizeColumn) {
                        // Remember whether the next cell is a meaningful length for this type.
                        expectSize = hasSize(cell);
                        appendAttribute(tag, "type", cell);
                    } else {
                        appendTypeWithInlineSize(tag, cell);
                    }
                    break;
                case COL_SIZE:
                    if (expectSize) {
                        appendAttribute(tag, "size", cell);
                    }
                    break;
                case COL_CONSTRAINT:
                    appendAttribute(tag, "primaryKey", String.valueOf(cell.contains("主键")));
                    appendAttribute(tag, "required", String.valueOf(cell.contains("非空")));
                    break;
                case COL_NAME:
                    appendAttribute(tag, "name", cell);
                    break;
                default:
                    break;
            }
        }
        // Close the element here so that short rows do not leave an unterminated tag.
        tag.append(" />\n");
        return tag.toString();
    }

    /**
     * Appends the type attribute, plus a size attribute when the cell has the form
     * {@code TYPE(SIZE)}. A cell without a matching closing parenthesis is written
     * unchanged as the type.
     *
     * @param tag  element being built
     * @param cell text of the type cell
     */
    private static void appendTypeWithInlineSize(StringBuilder tag, String cell) {
        int open = cell.indexOf('(');
        int close = open < 0 ? -1 : cell.indexOf(')', open + 1);
        if (close > open && open >= 0) {
            appendAttribute(tag, "type", cell.substring(0, open).trim());
            appendAttribute(tag, "size", cell.substring(open + 1, close).trim());
        } else {
            appendAttribute(tag, "type", cell);
        }
    }

    /**
     * Appends {@code  name="value"} (with a leading space) to the element being built,
     * escaping the value for use inside an XML attribute.
     *
     * @param tag   element being built
     * @param name  attribute name
     * @param value raw attribute value
     */
    private static void appendAttribute(StringBuilder tag, String name, String value) {
        tag.append(' ').append(name).append("=\"").append(escapeXml(value)).append('"');
    }

    /**
     * Escapes the characters that are not allowed verbatim in XML text or attribute values.
     *
     * @param value raw text, may be {@code null}
     * @return escaped text; empty string for {@code null}
     */
    private static String escapeXml(String value) {
        if (value == null) {
            return "";
        }
        StringBuilder escaped = new StringBuilder(value.length());
        for (int i = 0; i < value.length(); i++) {
            char c = value.charAt(i);
            switch (c) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                case '"':
                    escaped.append("&quot;");
                    break;
                default:
                    escaped.append(c);
                    break;
            }
        }
        return escaped.toString();
    }
}
Docx2XMLUtilTest.java
package docx2xml;


/**
 * Command-line driver that runs {@link Docx2XMLUtil} on a .docx document.
 *
 * <p>Usage: {@code Docx2XMLUtilTest [docxFile [xmlFile [author]]]}. Every argument that
 * is missing or blank falls back to the built-in default, so running the class without
 * arguments behaves as before.
 *
 * <p>Created 2019/12/4 16:12.
 *
 * @author StriveFarrell
 */
public class Docx2XMLUtilTest {
    /** Default location of the input .docx document. */
    private static final String DEFAULT_DOCX_FILE_PATH = "D:\\MyFile\\workLearning\\spark\\src\\files\\docx2xml3.docx";
    /** Default location of the generated XML file. */
    private static final String DEFAULT_XML_FILE_SAVE_PATH = "D:\\MyFile\\workLearning\\spark\\src\\files\\docx2xml3.xml";
    /** Default name recorded as the person who entered the table. */
    private static final String DEFAULT_AUTHOR = "Hello Table";

    /**
     * Configures the converter and runs it.
     *
     * @param args optional overrides: input path, output path, author (in that order)
     */
    public static void main(String[] args) {
        Docx2XMLUtil util = new Docx2XMLUtil();
        util.setDocxFilePath(argOrDefault(args, 0, DEFAULT_DOCX_FILE_PATH));
        util.setAuthor(argOrDefault(args, 2, DEFAULT_AUTHOR));
        util.setXmlFileSavePath(argOrDefault(args, 1, DEFAULT_XML_FILE_SAVE_PATH));
        util.getTableData();
    }

    /**
     * Returns the argument at the given position, or the fallback when it is absent or blank.
     *
     * @param args     command-line arguments, may be {@code null}
     * @param index    position of the wanted argument
     * @param fallback value used when the argument is not supplied
     * @return the argument or the fallback
     */
    private static String argOrDefault(String[] args, int index, String fallback) {
        if (args == null || args.length <= index || args[index] == null || args[index].trim().isEmpty()) {
            return fallback;
        }
        return args[index];
    }
}

生成XML格式

表格式一XML

<table id="" javaId="" name="" >
   <rem>====================================================================</rem>
   <rem> 输入人:Hello Table    输入时间:2019.12.05</rem>
   <rem>table description</rem>
   <rem>====================================================================</rem>
   <column id="COLUMN_ID" type="VARCHAR" size="20" primaryKey="true" required="true" name="信息项定义主键" />
   <column id="IS_PRIMARY_KEY" type="INT" size="2" primaryKey="false" required="true" name="表明是否为主键(0:否;1:是)" />
   <column id="NOT_NULL" type="INT" size="2" primaryKey="false" required="true" name="0:可以为空;1:不可为空。" />
   <column id="LENGTH" type="VARCHAR" size="11" primaryKey="false" required="true" name="长度" />
   <column id="TYPE" type="VARCHAR" size="22" primaryKey="false" required="true" name="类型" />
   <column id="NAME_EN" type="VARCHAR" size="2048" primaryKey="false" required="false" name="英文名称" />
   <column id="NAME_CN" type="VARCHAR" size="2048" primaryKey="false" required="false" name="中文名称" />
   <column id="CATA_ID" type="VARCHAR" size="20" primaryKey="false" required="true" name="目录ID" />
<\table>

表格式二XML

<table id="" javaId="" name="" >
   <rem>====================================================================</rem>
   <rem> 输入人:zhangqx02    输入时间:2019.12.05</rem>
   <rem>table description</rem>
   <rem>====================================================================</rem>
   <column id="COLUMN_ID" type="VARCHAR" size="20" primaryKey="true" required="true" name="信息项定义主键" />
   <column id="IS_PRIMARY_KEY" type="INT" size="2" primaryKey="false" required="true" name="表明是否为主键(0:否;1:是)" />
   <column id="NOT_NULL" type="INT" size="2" primaryKey="false" required="true" name="0:可以为空;1:不可为空。" />
   <column id="LENGTH" type="VARCHAR" size="11" primaryKey="false" required="true" name="长度" />
   <column id="TYPE" type="VARCHAR" size="22" primaryKey="false" required="true" name="类型" />
   <column id="NAME_EN" type="VARCHAR" size="2048" primaryKey="false" required="false" name="英文名称" />
   <column id="NAME_CN" type="VARCHAR" size="2048" primaryKey="false" required="false" name="中文名称" />
   <column id="CATA_ID" type="VARCHAR" size="20" primaryKey="false" required="true" name="目录ID" />
<\table>

生成的xml还有很多不完善的地方,比如没有填写table标签的id、javaId和name等属性,以后有时间再去处理。

附件

表格式一

表格式二

总结

  • 将docx数据表录入到xml中,如果纯手动录入,是一件枯燥头大的事情,一不小心就搞得自己眼花缭乱了。
  • 实现的方式还不是很完整,还有很多可以改进的地方。

猜你喜欢

转载自blog.csdn.net/someby/article/details/103400699