Word文档解析有点麻烦，下面的代码分别处理了 .docx 和 .doc 两种格式的模版，以便兼容。
1.获取段落内容:
/**
 * Reads the paragraph-level fields of a Word template and merges them into the table data
 * produced by {@link #word44(String)}.
 *
 * <p>The file format is chosen from the file-name suffix: names ending in {@code docx} are read
 * with XWPF, everything else is read with HWPF (legacy {@code .doc}). Three entries are added to
 * the {@code PageData} returned by {@code word44}:
 * <ul>
 *   <li>{@code carryingPeople} - text after the first ':' of the selected paragraph</li>
 *   <li>{@code appraisalApplyTime} - text after the first ':' with 年/月 replaced by '-' and 日
 *       removed</li>
 *   <li>{@code auditors} - whitespace-stripped text of the selected paragraph</li>
 * </ul>
 * Paragraphs are selected by fixed position (body elements 3, 4 and 7 for {@code .docx};
 * paragraphs 15th-, 14th- and 1st-from-last for {@code .doc}), so the document must follow the
 * expected template layout.
 *
 * @param filePath path of the Word file to parse
 * @return the failed result of {@code word44} if table parsing failed; otherwise a result whose
 *         object is the collected {@code PageData}. On any exception the result is not marked
 *         successful and carries a generic "format error" message.
 * @throws Exception declared for interface compatibility; exceptions raised while parsing are
 *         caught and reported through the returned result
 */
public JsonResult readWordTemporaryExit(String filePath) throws Exception {
    JsonResult jsonResult = new JsonResult();
    PageData pd = new PageData();
    // try-with-resources: the stream and the documents are released on every path, including
    // the early returns below and any exception thrown while parsing.
    try (FileInputStream in = new FileInputStream(filePath)) {
        if (filePath.toLowerCase().endsWith("docx")) {
            try (XWPFDocument document = new XWPFDocument(in)) {
                List<IBodyElement> elements = document.getBodyElements();

                // Table data first; paragraph fields are added on top of it.
                JsonResult tableResult = word44(filePath);
                if (!tableResult.isSuccess()) {
                    return tableResult;
                }
                pd = (PageData) tableResult.getObj();

                // A body element that is not a paragraph at these positions raises a
                // ClassCastException, which is reported as a format error by the catch below.
                String carrying = StringUtils.deleteWhitespace(
                        getParagraphText((XWPFParagraph) elements.get(3)));
                // indexOf returns -1 when there is no ':', so the whole text is kept in that case.
                pd.put("carryingPeople", carrying.substring(carrying.indexOf(":") + 1));

                String applyTime = StringUtils.deleteWhitespace(
                        getParagraphText((XWPFParagraph) elements.get(4)));
                String applyDate = applyTime.substring(applyTime.indexOf(":") + 1)
                        .replace("年", "-")
                        .replace("月", "-")
                        .replace("日", "");
                pd.put("appraisalApplyTime", applyDate);

                String auditors = StringUtils.deleteWhitespace(
                        getParagraphText((XWPFParagraph) elements.get(7)));
                pd.put("auditors", auditors);
            }
        } else {
            try (HWPFDocument document = new HWPFDocument(in)) {
                // Table data first; paragraph fields are added on top of it.
                JsonResult tableResult = word44(filePath);
                if (!tableResult.isSuccess()) {
                    return tableResult;
                }
                pd = (PageData) tableResult.getObj();

                // The range covers all paragraphs of the document; the wanted ones are
                // addressed relative to the end of the document.
                Range range = document.getRange();
                int paragraphCount = range.numParagraphs();

                String carrying = StringUtils.deleteWhitespace(
                        range.getParagraph(paragraphCount - 15).text());
                pd.put("carryingPeople", carrying.substring(carrying.indexOf(":") + 1));

                String applyTime = StringUtils.deleteWhitespace(
                        range.getParagraph(paragraphCount - 14).text());
                String applyDate = applyTime.substring(applyTime.indexOf(":") + 1)
                        .replace("年", "-")
                        .replace("月", "-")
                        .replace("日", "");
                pd.put("appraisalApplyTime", applyDate);

                String auditors = StringUtils.deleteWhitespace(
                        range.getParagraph(paragraphCount - 1).text());
                pd.put("auditors", auditors);
            }
        }
        jsonResult.setMsg("解析成功");
        jsonResult.setSuccess(true);
    } catch (Exception e) {
        // Any failure (missing file, unexpected layout, bad index) is reported as a format error.
        e.printStackTrace();
        jsonResult.setMsg("word格式异常,请检查word格式有无数据缺失,无效数据行!");
    }
    System.out.println("----------------------");
    System.out.println(pd);
    jsonResult.setObj(pd);
    return jsonResult;
}
2.获取第二个表格的信息:
public JsonResult word44(String filePath) throws Exception {
JsonResult jsonResult = new JsonResult();
PageData pd = new PageData();
try {
FileInputStream in = new FileInputStream(filePath);
if(filePath.toLowerCase().endsWith("docx")){
//word 2007 图片不会被读取, 表格中的数据会被放在字符串的最后
XWPFDocument xwpf = new XWPFDocument(in);//得到word文档的信息
Iterator<XWPFTable> it = xwpf.getTablesIterator();//得到word中的表格
// 设置需要读取的表格 set是设置需要读取的第几个表格,total是文件中表格的总数
int set = 1, total = 2;
int num = set;
// 过滤前面不需要的表格
for (int i = 0; i < set - 1; i++) {
it.hasNext();
it.next();
}
//获取表格
JsonResult word33 = word33(filePath);
if(!word33.isSuccess()){
return word33;
}
pd = (PageData) word33.getObj();
while(it.hasNext()) {
XWPFTable table = it.next();
System.out.println("这是第" + num + "个表的数据");
List<XWPFTableRow> rows = table.getRows();
XWPFTableRow row = rows.get(0);
List<XWPFTableCell> cells = row.getTableCells();
XWPFTableCell cell = cells.get(1);
if(StringUtil.isEmpty(cell.getText())){
jsonResult.setMsg("XXX不能为空");
return jsonResult;
}else {
pd.put("XXXX",cell.getText());
}
cell = cells.get(3);
String a = cell.getText();
if(StringUtil.isEmpty(cell.getText())){
jsonResult.setMsg("X不能为空");
return jsonResult;
}else {
String b = a.replaceAll("年", "-");
String c = b.replaceAll("月", "-");
String d = c.replaceAll("日", "");
pd.put("X",d);
}
}else{
//获取表格和图片
JsonResult word33 = word33(filePath);
if(!word33.isSuccess()){
return word33;
}
pd = (PageData) word33.getObj();
// 获取word中的所有段落与表格
// 处理doc格式 即office2003版本
POIFSFileSystem pfs = new POIFSFileSystem(in);
HWPFDocument hwpf = new HWPFDocument(pfs);
Range range = hwpf.getRange();//得到文档的读取范围
TableIterator it = new TableIterator(range);
// 迭代文档中的表格
// 如果有多个表格只读取需要的一个 set是设置需要读取的第几个表格,total是文件中表格的总数
int set = 2, total = 2;
int num = set;
for (int i = 0; i < set-1; i++) {
it.hasNext();
it.next();
}
while (it.hasNext()) {
Table tb = (Table) it.next();
System.out.println("这是第" + num + "个表的数据");
//申报单位
TableRow tr = tb.getRow(0);//取得行
TableCell td = tr.getCell(1);//取得单元格
String XXX= getParagraph(td);
if (StringUtil.isEmpty(XXX)) {
jsonResult.setMsg("XXX不能为空");
return jsonResult;
}
pd.put("XXX", XXX);
//审核时间
td = tr.getCell(3);//取得单元格
String X= getParagraph(td);
if (StringUtil.isEmpty(X)) {
jsonResult.setMsg("X不能为空");
return jsonResult;
}
String b = X.replaceAll("年", "-");
String c = b.replaceAll("月", "-");
String d = c.replaceAll("日", "");
pd.put("X", d);
}
in.close();
}
in.close();
jsonResult.setMsg("解析成功");
jsonResult.setSuccess(true);
} catch(Exception e){
e.printStackTrace();
jsonResult.setMsg("word格式异常,请检查word格式有无数据缺失,无效数据行!");
}
System.out.println("----------------------");
System.out.println(pd);
jsonResult.setObj(pd);
return jsonResult;
}
以此类推，Word文档中有几个表格，就按同样的方式写几个对应的解析方法。
(如有侵权,联系就删)