解析word文档段落以及表格

Word文档解析有点麻烦:.docx 和 .doc 是两种不同的文件格式,需要分别用 XWPF(XWPFDocument)和 HWPF(HWPFDocument)两套 API 处理,这样两种格式的模版都能兼容。

1.获取段落内容:

 /**
  * Parses a Word template: collects the table data via {@link #word44(String)} and then adds
  * three paragraph-based fields ({@code carryingPeople}, {@code appraisalApplyTime},
  * {@code auditors}) to the same {@code PageData}.
  *
  * <p>A path ending in "docx" (case-insensitive) is read with XWPF; every other path is read
  * with HWPF. The paragraph positions used below are fixed offsets tied to one specific
  * template layout.
  *
  * @param filePath path of the Word file to parse
  * @return the failed result of {@code word44} unchanged if the table step fails; otherwise a
  *         result carrying the collected {@code PageData}, flagged successful only when no
  *         exception occurred while parsing
  * @throws Exception kept for interface compatibility; parsing errors are caught and reported
  *                   through the returned result's message
  */
 public JsonResult readWordTemporaryExit(String filePath) throws Exception {
     JsonResult jsonResult = new JsonResult();
     PageData pd = new PageData();
     try {
         // try-with-resources guarantees the stream and the document are released on the
         // early-return paths and when parsing throws.
         try (FileInputStream in = new FileInputStream(filePath)) {
             if (filePath.toLowerCase().endsWith("docx")) {
                 try (XWPFDocument document = new XWPFDocument(in)) {
                     List<IBodyElement> elements = document.getBodyElements();
                     // Table data first; a failed table parse is returned to the caller as-is.
                     JsonResult word44 = word44(filePath);
                     if (!word44.isSuccess()) {
                         return word44;
                     }
                     pd = (PageData) word44.getObj();

                     // Body element 3: keep the text after the first ':'.
                     // NOTE(review): if the template uses a full-width colon, indexOf returns -1
                     // and the whole paragraph text is stored - confirm against the template.
                     String p1 = StringUtils.deleteWhitespace(getParagraphText((XWPFParagraph) elements.get(3)));
                     pd.put("carryingPeople", p1.substring(p1.indexOf(":") + 1));

                     // Body element 4: date written as yyyy年MM月dd日, stored as yyyy-MM-dd.
                     String p2 = StringUtils.deleteWhitespace(getParagraphText((XWPFParagraph) elements.get(4))).replace(" ", "");
                     String applyTime = p2.substring(p2.indexOf(":") + 1)
                             .replace("年", "-")
                             .replace("月", "-")
                             .replace("日", "");
                     pd.put("appraisalApplyTime", applyTime);

                     // Body element 7 is stored whole, without whitespace.
                     String p3 = StringUtils.deleteWhitespace(getParagraphText((XWPFParagraph) elements.get(7))).replace(" ", "");
                     pd.put("auditors", p3);
                 }
             } else {
                 try (HWPFDocument document = new HWPFDocument(in)) {
                     // Table data first; a failed table parse is returned to the caller as-is.
                     JsonResult word44 = word44(filePath);
                     if (!word44.isSuccess()) {
                         return word44;
                     }
                     pd = (PageData) word44.getObj();

                     // The range covers all paragraphs of the document; the fields are
                     // addressed by their distance from the last paragraph.
                     Range range = document.getRange();
                     int paragraphCount = range.numParagraphs();

                     String carryingPeople = StringUtils.deleteWhitespace(range.getParagraph(paragraphCount - 15).text());
                     pd.put("carryingPeople", carryingPeople.substring(carryingPeople.indexOf(":") + 1));

                     String appraisalApplyTime = StringUtils.deleteWhitespace(range.getParagraph(paragraphCount - 14).text());
                     String applyTime = appraisalApplyTime.substring(appraisalApplyTime.indexOf(":") + 1)
                             .replace("年", "-")
                             .replace("月", "-")
                             .replace("日", "");
                     pd.put("appraisalApplyTime", applyTime);

                     String auditors = StringUtils.deleteWhitespace(range.getParagraph(paragraphCount - 1).text());
                     pd.put("auditors", auditors);
                 }
             }
         }
         // Only reached when parsing and closing both completed without an exception.
         jsonResult.setMsg("解析成功");
         jsonResult.setSuccess(true);
     } catch (Exception e) {
         e.printStackTrace();
         jsonResult.setMsg("word格式异常,请检查word格式有无数据缺失,无效数据行!");
     }
     System.out.println("----------------------");
     System.out.println(pd);
     jsonResult.setObj(pd);
     return jsonResult;
 }

2.获取第二个表格的信息:

    public JsonResult word44(String filePath) throws Exception {
        JsonResult jsonResult = new JsonResult();
        PageData pd = new PageData();
        try {
            FileInputStream in = new FileInputStream(filePath);
            if(filePath.toLowerCase().endsWith("docx")){
                //word 2007 图片不会被读取, 表格中的数据会被放在字符串的最后
                XWPFDocument xwpf = new XWPFDocument(in);//得到word文档的信息
                Iterator<XWPFTable> it = xwpf.getTablesIterator();//得到word中的表格
                // 设置需要读取的表格  set是设置需要读取的第几个表格,total是文件中表格的总数
                int set = 1, total = 2;
                int num = set;
                // 过滤前面不需要的表格
                for (int i = 0; i < set - 1; i++) {
                    it.hasNext();
                    it.next();
                }
                //获取表格
                JsonResult word33 = word33(filePath);
                if(!word33.isSuccess()){
                    return word33;
                }
                pd = (PageData) word33.getObj();

                while(it.hasNext()) {
                    XWPFTable table = it.next();
                    System.out.println("这是第" + num + "个表的数据");
                    List<XWPFTableRow> rows = table.getRows();

                    XWPFTableRow  row = rows.get(0);
                    List<XWPFTableCell> cells = row.getTableCells();
                    XWPFTableCell cell = cells.get(1);
                    if(StringUtil.isEmpty(cell.getText())){
                        jsonResult.setMsg("XXX不能为空");
                        return  jsonResult;
                    }else {
                        pd.put("XXXX",cell.getText());
                    }
                    cell = cells.get(3);
                    String a = cell.getText();
                    if(StringUtil.isEmpty(cell.getText())){
                        jsonResult.setMsg("X不能为空");
                        return  jsonResult;
                    }else {
                        String b = a.replaceAll("年", "-");
                        String c = b.replaceAll("月", "-");
                        String d = c.replaceAll("日", "");
                        pd.put("X",d);
                    }
            }else{
                //获取表格和图片
                JsonResult word33 = word33(filePath);
                if(!word33.isSuccess()){
                    return word33;
                }
                pd = (PageData) word33.getObj();
                // 获取word中的所有段落与表格
                // 处理doc格式 即office2003版本
                POIFSFileSystem pfs = new POIFSFileSystem(in);
                HWPFDocument hwpf = new HWPFDocument(pfs);
                Range range = hwpf.getRange();//得到文档的读取范围
                TableIterator it = new TableIterator(range);
                // 迭代文档中的表格
                // 如果有多个表格只读取需要的一个 set是设置需要读取的第几个表格,total是文件中表格的总数
                int set = 2, total = 2;
                int num = set;
                for (int i = 0; i < set-1; i++) {
                    it.hasNext();
                    it.next();
                }
                while (it.hasNext()) {
                    Table tb = (Table) it.next();
                    System.out.println("这是第" + num + "个表的数据");
                    //申报单位
                    TableRow tr = tb.getRow(0);//取得行
                    TableCell td = tr.getCell(1);//取得单元格
                    String XXX= getParagraph(td);
                    if (StringUtil.isEmpty(XXX)) {
                        jsonResult.setMsg("XXX不能为空");
                        return jsonResult;
                    }
                    pd.put("XXX", XXX);
                    //审核时间
                    td = tr.getCell(3);//取得单元格
                    String X= getParagraph(td);
                    if (StringUtil.isEmpty(X)) {
                        jsonResult.setMsg("X不能为空");
                        return jsonResult;
                    }
                    String b = X.replaceAll("年", "-");
                    String c = b.replaceAll("月", "-");
                    String d = c.replaceAll("日", "");
                    pd.put("X", d);
}
                in.close();
            }
            in.close();
            jsonResult.setMsg("解析成功");
            jsonResult.setSuccess(true);
        } catch(Exception e){
            e.printStackTrace();
            jsonResult.setMsg("word格式异常,请检查word格式有无数据缺失,无效数据行!");
        }
        System.out.println("----------------------");
        System.out.println(pd);
        jsonResult.setObj(pd);
        return jsonResult;
    }

依次类推,Word文档中有几个表格,就按同样的方式写几个对应的解析方法(如上面的 word33、word44)。

(如有侵权,联系就删)

猜你喜欢

转载自blog.csdn.net/a1099380657/article/details/117918867