POI3.8读取WORD(简洁实例)

目前最新版POI为3.8:
poi-3.8-20120326.jar
poi-examples-3.8-20120326.jar
poi-excelant-3.8-20120326.jar
poi-ooxml-3.8-20120326.jar
poi-ooxml-schemas-3.8-20120326.jar
poi-scratchpad-3.8-20120326.jar

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;

import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;



public class WordDemo extends HttpServlet {

	private static final long serialVersionUID = 1L;

	public void doGet(HttpServletRequest request, HttpServletResponse response)
			throws ServletException, IOException {
		this.doPost(request, response);
	}

	public void doPost(HttpServletRequest request, HttpServletResponse response)
			throws ServletException, IOException {
		
		//从硬盘读取一个doc文档
		InputStream in = new FileInputStream("F:\\test.doc");
		//类从word文档中提取文本,非特殊情况下,都将使用getParagraphText()与getText()
		WordExtractor word = new WordExtractor(in);
		
		//获取段文本
		String [] strArray = word.getParagraphText();
		String str = word.getText();  
		
		for(int i=0 ; i<strArray.length ; i++){
			System.out.println(strArray[i]+"\ti循环:"+i);
		}
		System.out.println(str +"\t --");
		
		//这个构造函数从InputStream中加载Word文档。
		HWPFDocument doc = new HWPFDocument((InputStream)new FileInputStream("F:\\test.doc"));
		
		//这个类为HWPF对象模型,对文档范围段操作
		Range range = doc.getRange(); //
		
		//看看此文档有多少个段落
		int num = range.numParagraphs();
		System.out.println(num+"段");   
		
		//得到word数据流
		byte [] dataStream = doc.getDataStream();
		System.out.println("数据流长度:"+dataStream.length);
		
		//用于在一段范围内获得段落数
		int numChar = range.numCharacterRuns();
		System.out.println("CharacterRuns 数:"+numChar);
		 
		//负责图像提取  和    确定一些文件某块是否包含嵌入的图像。
		PicturesTable table = new PicturesTable(doc, dataStream, null);
		
		for(int j=0 ; j<numChar ; j++){
			//这个类表示一个文本运行,有着共同的属性。
			CharacterRun run = range.getCharacterRun(j);
			//是否存在图片
			boolean bool = table.hasPicture(run);
			System.out.println("是否存在图片:"+bool);
			if(bool){
				//返回图片对象绑定到指定的CharacterRun
				Picture pic = table.extractPicture(run, true);
				//图片的内容字节写入到指定的输出流。
				pic.writeImageContent(new FileOutputStream("F:\\"+j+".bmp"));
				System.out.println("成功提取图片"+j+":");
			}
		}
		request.getRequestDispatcher("ok.jsp").forward(request, response);
	}


}


以上代码可正常运行,并能成功提取图片。

猜你喜欢

转载自newerdragon.iteye.com/blog/1675653