JAVA读取DOC报名表数据

1 应用领域

在各种各样的考试中,尤其是在艰苦边远地区,组织考试报名时常常采取向邮箱发送报名表的方式,缺乏专门的考试报名系统。此时需要将DOC报名表中的数据提取出来,再对数据进行筛选和统计。本文介绍如何用Java提取此类报名表中的表格数据。

2 maven依赖库文件

<!-- Apache POI module for OOXML formats.
     NOTE(review): the Java code in this article only uses HWPF (.doc) classes;
     this artifact looks unnecessary for that purpose - confirm before removing. -->
<dependency>
   <groupId>org.apache.poi</groupId>
   <artifactId>poi-ooxml</artifactId>
  <version>3.8</version>
</dependency>
<!-- Apache POI scratchpad module; per the POI component overview it ships the
     HWPF classes (HWPFDocument, Range, Table, ...) that the reader class relies on. -->
<dependency>
  <groupId>org.apache.poi</groupId>
  <artifactId>poi-scratchpad</artifactId>
  <version>3.8</version>
</dependency>

3 Java读取文件夹中的doc文档

/**
 * Extracts the table text of every Word binary ({@code .doc}) file found under a directory.
 *
 * <p>Each document contributes one flat list holding the text of all its table cells,
 * in table / row / cell order.
 */
public class ReadDocFileFromJava {

    /** File-name suffix (matched case-insensitively) of the documents to read. */
    private static final String DOC_SUFFIX = ".doc";

    /**
     * Prints the table cells of every document, one document per line, cells separated by tabs.
     *
     * @param args optional; {@code args[0]} is the directory to scan. When absent,
     *             {@code D:\test\} is used.
     * @throws Exception declared for compatibility with existing callers
     */
    public static void main(String[] args) throws Exception {
        // Root directory that is scanned recursively for .doc files.
        String dirPath = (args != null && args.length > 0) ? args[0] : "D:\\test\\";
        List<List<String>> lists = readMyDocuments(dirPath);
        for (List<String> list : lists) {
            for (String s : list) {
                System.out.print(s + "\t");
            }
            System.out.println();
        }
    }

    /**
     * Reads the table cells of every {@code .doc} file under {@code dirPath}.
     *
     * <p>A document that cannot be read is reported on standard error and skipped,
     * so one broken file does not discard the remaining documents.
     *
     * @param dirPath directory to scan (sub-directories included)
     * @return one list of cell texts per successfully read document; never {@code null}
     */
    public static List<List<String>> readMyDocuments(String dirPath) {
        List<List<String>> lists = new ArrayList<List<String>>();

        List<String> docPaths;
        try {
            docPaths = readDocFilePaths(dirPath);
        } catch (Exception e) {
            // Keep the "never throws" contract, e.g. for a null dirPath.
            e.printStackTrace();
            return lists;
        }

        for (String docPath : docPaths) {
            try {
                lists.add(readDocument(docPath));
            } catch (Exception e) {
                // Name the offending file, then carry on with the next one.
                System.err.println("Failed to read document: " + docPath);
                e.printStackTrace();
            }
        }
        return lists;
    }

    /**
     * Opens a single {@code .doc} file and extracts its table cells.
     *
     * @param docPath path of the document to open
     * @return text of every table cell in the document
     * @throws Exception if the file cannot be opened or parsed
     */
    private static List<String> readDocument(String docPath) throws Exception {
        FileInputStream in = new FileInputStream(docPath);
        try {
            POIFSFileSystem fs = new POIFSFileSystem(in);
            HWPFDocument doc = new HWPFDocument(fs);
            return readTable(doc);
        } finally {
            // NOTE(review): POIFSFileSystem(InputStream) is documented to close the stream
            // itself; closing here as well means the handle is released regardless.
            in.close();
        }
    }

    /**
     * Collects the text of every cell of every table in the document.
     *
     * <p>The paragraphs of a cell are trimmed and concatenated without a separator.
     *
     * @param doc document to read
     * @return cell texts in table / row / cell order; empty if the document has no table
     * @throws Exception declared for compatibility with existing callers
     */
    public static List<String> readTable(HWPFDocument doc) throws Exception {
        List<String> result = new ArrayList<String>();
        TableIterator it = new TableIterator(doc.getRange());
        while (it.hasNext()) {
            Table tb = it.next();
            for (int i = 0; i < tb.numRows(); i++) {
                TableRow tr = tb.getRow(i);
                for (int j = 0; j < tr.numCells(); j++) {
                    TableCell td = tr.getCell(j);
                    StringBuilder text = new StringBuilder();
                    for (int k = 0; k < td.numParagraphs(); k++) {
                        // trim() drops every leading/trailing char <= U+0020, i.e. control
                        // characters too (presumably including Word's cell-end mark - confirm).
                        text.append(td.getParagraph(k).text().trim());
                    }
                    result.add(text.toString());
                }
            }
        }
        return result;
    }

    /**
     * Recursively lists the {@code .doc} files under a directory.
     *
     * <p>The suffix is matched case-insensitively, so {@code FORM.DOC} is found as well.
     * Order follows {@link File#listFiles()}, which guarantees no particular order.
     *
     * @param dirPath directory to scan
     * @return paths of the matching files; empty if {@code dirPath} is not a readable directory
     */
    public static List<String> readDocFilePaths(String dirPath) {
        List<String> result = new ArrayList<String>();
        File[] files = new File(dirPath).listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a directory or an I/O error occurs.
            return result;
        }
        for (File file : files) {
            if (file.isDirectory()) {
                // Directory: descend into it.
                result.addAll(readDocFilePaths(file.getAbsolutePath()));
            } else if (hasDocSuffix(file.getName())) {
                // Regular entry with the wanted suffix: keep it.
                result.add(file.getPath());
            }
        }
        return result;
    }

    /** Tells whether {@code fileName} ends with {@code .doc}, ignoring case. */
    private static boolean hasDocSuffix(String fileName) {
        int suffixLength = DOC_SUFFIX.length();
        // A negative offset (name shorter than the suffix) makes regionMatches return false.
        return fileName.regionMatches(true, fileName.length() - suffixLength,
                DOC_SUFFIX, 0, suffixLength);
    }
}

猜你喜欢

转载自blog.csdn.net/qq_23078359/article/details/90519342