使用POI读取PowerPoint文件中的文本(兼容ppt与pptx版本)

调用示例:

File powerPointFile = new File("D:\\temp.ppt");

//读取PowerPoint文档中所有文本内容,以字符串形式返回  
System.out.println(PowerPointFileUtil.extractTextFromPowerPointFile(powerPointFile , "," , ";"));

工具类源码:

/**
 * BasePowerPointFileUtil.java
 * Copyright ® 2017 窦海宁
 * All rights reserved
 */

package org.aiyu.core.common.util.file.office;

import java.util.ArrayList;
import java.util.List;

import org.apache.poi.sl.usermodel.AutoShape;
import org.apache.poi.sl.usermodel.Shape;
import org.apache.poi.sl.usermodel.Slide;
import org.apache.poi.sl.usermodel.SlideShow;

/**
 * <p>Base class for the PowerPoint file utilities.
 * 
 * <p>Walks a slide show through the format-independent POI {@code sl.usermodel}
 * interfaces and collects the text of its shapes. Raw collection types are kept
 * on purpose because they are part of the signatures subclasses call.
 * 
 * @author  窦海宁, [email protected]
 * @since   AiyuCommonCore-1.0
 * @version AiyuCommonCore-1.0
 */
public abstract class BasePowerPointFileUtil {

	/**
	 * <p>Reads every slide of a slide show.
	 * 
	 * @param  slideShow the slide show to read, may be {@code null}
	 * 
	 * @return one entry per slide, each entry being the list produced by
	 *         {@link #readSlide(Slide)}; {@code null} when {@code slideShow} is {@code null}
	 * 
	 * @modify 窦海宁, 2017-01-18
	 */
	protected static List readSlideShow(SlideShow slideShow) {

		if (slideShow == null) {

			return null;
		}

		List perSlideData = new ArrayList();
		for (Object currentSlide : slideShow.getSlides()) {

			perSlideData.add(BasePowerPointFileUtil.readSlide((Slide) currentSlide));
		}
		return perSlideData;
	}

	/**
	 * <p>Reads every shape of a single slide.
	 * 
	 * @param  slide the slide to read, may be {@code null}
	 * 
	 * @return one entry per shape, each entry being the value produced by
	 *         {@link #readShape(Shape)} (possibly {@code null});
	 *         {@code null} when {@code slide} is {@code null}
	 * 
	 * @modify 窦海宁, 2017-01-18
	 */
	protected static List readSlide(Slide slide) {

		if (slide == null) {

			return null;
		}

		List perShapeData = new ArrayList();
		for (Object currentShape : slide.getShapes()) {

			perShapeData.add(BasePowerPointFileUtil.readShape((Shape) currentShape));
		}
		return perShapeData;
	}

	/**
	 * <p>Reads the text of a single shape.
	 * 
	 * <p>Only {@link AutoShape} instances are inspected; any other kind of
	 * shape yields {@code null}.
	 * 
	 * @param  shape a shape taken from a slide, may be {@code null}
	 * 
	 * @return the text returned by {@code AutoShape.getText()}, or {@code null} when
	 *         the shape is {@code null}, is not an {@link AutoShape}, or reading its text failed
	 * 
	 * @modify 窦海宁, 2017-01-18
	 */
	protected static Object readShape(Shape shape) {

		// instanceof is false for null, so this also covers the null-shape case
		if (!(shape instanceof AutoShape)) {

			return null;
		}

		String shapeText = null;
		try {

			shapeText = ((AutoShape) shape).getText();
		} catch (Exception ex) {

			// Best effort: a shape that cannot be read is reported and skipped
			ex.printStackTrace();
		}
		return shapeText;
	}

}
PowerPoint2003版本工具类:  
/**
 * PowerPoint2003FileUtil.java
 * Copyright ® 2010 窦海宁
 * All rights reserved
 */

package org.aiyu.core.common.util.file.office;

import java.io.File;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
import org.apache.poi.hslf.usermodel.HSLFSlideShowImpl;
import org.apache.poi.sl.usermodel.SlideShow;

/**
 * <p>File utility for PowerPoint 97-2003 ({@code .ppt}) documents.
 * 
 * <p>Extracts the text of a binary-format PowerPoint document through
 * POI's HSLF implementation.
 * 
 * @author  窦海宁, [email protected]
 * @since   AiyuCommonCore-1.0
 * @version AiyuCommonCore-1.0
 */
public abstract class PowerPoint2003FileUtil extends BasePowerPointFileUtil {

	/**
	 * <p>Extracts the text of a {@code .ppt} document.
	 * 
	 * <p>Texts of the shapes on one slide are joined with {@code shapeSeparator};
	 * shapes without text are skipped and never produce a separator. The output
	 * of consecutive slides is joined with {@code slideSeparator}. Failures while
	 * reading are printed to standard error and whatever was collected up to that
	 * point is returned.
	 * 
	 * @param  powerPointFile the PowerPoint file
	 * @param  shapeSeparator separator placed between shape texts
	 * @param  slideSeparator separator placed between slides
	 * 
	 * @return the extracted text, trimmed; {@code null} when any argument is
	 *         {@code null}, the file is not a regular file, or nothing was extracted
	 * 
	 * @modify 窦海宁, 2017-01-18
	 */
	protected static String extractTextFromPowerPointFile(File powerPointFile , String shapeSeparator , String slideSeparator) {

		StringBuilder returnValue = new StringBuilder();
		if (powerPointFile != null && slideSeparator != null && shapeSeparator != null && powerPointFile.isFile()) {

			SlideShow slideShow = null;
			try {

				slideShow = new HSLFSlideShow(new HSLFSlideShowImpl(powerPointFile.getCanonicalPath()));
				Iterator slideIterator = PowerPoint2003FileUtil.readSlideShow(slideShow).iterator();
				// Iterate over the slides
				while (slideIterator.hasNext()) {

					// Tracks whether a text was already written for this slide, so the
					// separator only ever appears BETWEEN two texts (never trailing)
					boolean textWritten = false;
					Iterator shapeIterator = ((List) slideIterator.next()).iterator();
					// Iterate over the shapes
					while (shapeIterator.hasNext()) {

						Object shapeValue = shapeIterator.next();
						if (shapeValue != null) {

							if (textWritten) {

								returnValue.append(shapeSeparator);
							}
							returnValue.append((String) shapeValue);
							textWritten = true;
						}
					}
					if (slideIterator.hasNext()) {

						returnValue.append(slideSeparator);
					}
				}
			} catch (Exception ex) {

				ex.printStackTrace();
			} finally {

				// Release the file handle held by the slide show
				if (slideShow != null) {

					try {

						slideShow.close();
					} catch (Exception ex) {

						ex.printStackTrace();
					}
				}
			}
		}
		return StringUtils.trimToNull(returnValue.toString());
	}

}
  PowerPoint2007版本工具类:  
/**
 * PowerPoint2007FileUtil.java
 * Copyright ® 2017 窦海宁
 * All rights reserved
 */

package org.aiyu.core.common.util.file.office;

import java.io.File;
import java.io.FileInputStream;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.apache.poi.xslf.usermodel.XMLSlideShow;

/**
 * <p>File utility for PowerPoint 2007+ ({@code .pptx}) documents.
 * 
 * <p>Extracts the text of an OOXML PowerPoint document through
 * POI's XSLF implementation.
 * 
 * @author  窦海宁, [email protected]
 * @since   AiyuCommonCore-1.0
 * @version AiyuCommonCore-1.0
 */
public abstract class PowerPoint2007FileUtil extends BasePowerPointFileUtil {

	/**
	 * <p>Extracts the text of a {@code .pptx} document.
	 * 
	 * <p>Texts of the shapes on one slide are joined with {@code shapeSeparator};
	 * shapes without text are skipped and never produce a separator. The output
	 * of consecutive slides is joined with {@code slideSeparator}. Failures while
	 * reading are printed to standard error and whatever was collected up to that
	 * point is returned.
	 * 
	 * @param  powerPointFile the PowerPoint file
	 * @param  shapeSeparator separator placed between shape texts
	 * @param  slideSeparator separator placed between slides
	 * 
	 * @return the extracted text, trimmed; {@code null} when any argument is
	 *         {@code null}, the file is not a regular file, or nothing was extracted
	 * 
	 * @modify 窦海宁, 2017-01-18
	 */
	protected static String extractTextFromPowerPointFile(File powerPointFile , String shapeSeparator , String slideSeparator) {

		StringBuilder returnValue = new StringBuilder();
		if (powerPointFile != null && slideSeparator != null && shapeSeparator != null && powerPointFile.isFile()) {

			FileInputStream inputStream = null;
			XMLSlideShow    slideShow   = null;
			try {

				inputStream = new FileInputStream(powerPointFile);
				slideShow   = new XMLSlideShow(inputStream);
				Iterator slideIterator = PowerPoint2007FileUtil.readSlideShow(slideShow).iterator();
				// Iterate over the slides
				while (slideIterator.hasNext()) {

					// Tracks whether a text was already written for this slide, so the
					// separator only ever appears BETWEEN two texts (never trailing)
					boolean textWritten = false;
					Iterator shapeIterator = ((List) slideIterator.next()).iterator();
					// Iterate over the shapes
					while (shapeIterator.hasNext()) {

						Object shapeValue = shapeIterator.next();
						if (shapeValue != null) {

							if (textWritten) {

								returnValue.append(shapeSeparator);
							}
							returnValue.append((String) shapeValue);
							textWritten = true;
						}
					}
					if (slideIterator.hasNext()) {

						returnValue.append(slideSeparator);
					}
				}
			} catch (Exception ex) {

				ex.printStackTrace();
			} finally {

				// Release the slide show first, then the stream it was loaded from
				if (slideShow != null) {

					try {

						slideShow.close();
					} catch (Exception ex) {

						ex.printStackTrace();
					}
				}
				if (inputStream != null) {

					try {

						inputStream.close();
					} catch (Exception ex) {

						ex.printStackTrace();
					}
				}
			}
		}
		return StringUtils.trimToNull(returnValue.toString());
	}

}
  统一调用工具类:  
/**
 * PowerPointFileUtil.java
 * Copyright ® 2017 窦海宁
 * All rights reserved
 */

package org.aiyu.core.common.util.file.office;

import java.io.File;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringUtils;

/**
 * <p>Unified entry point of the PowerPoint file utilities.
 * 
 * <p>Chooses the format-specific utility from the file extension and
 * delegates the text extraction to it.
 * 
 * @author  窦海宁, [email protected]
 * @since   AiyuCommonCore-1.0
 * @version AiyuCommonCore-1.0
 */
public abstract class PowerPointFileUtil extends BasePowerPointFileUtil {

	/**
	 * <p>Extracts the text of a PowerPoint document.
	 * 
	 * <p>The extension of the file name is compared case-insensitively:
	 * {@code ppt} is handled by {@code PowerPoint2003FileUtil} and
	 * {@code pptx} by {@code PowerPoint2007FileUtil}.
	 * 
	 * @param  powerPointFile the PowerPoint file
	 * @param  shapeSeparator separator placed between shape texts
	 * @param  slideSeparator separator placed between slides
	 * 
	 * @return the text returned by the format-specific utility; {@code null} when
	 *         the file is {@code null}, does not exist, or has any other extension
	 * 
	 * @modify 窦海宁, 2017-02-06
	 */
	public static String extractTextFromPowerPointFile(File powerPointFile , String shapeSeparator , String slideSeparator) {

		String resultText = null;

		if (powerPointFile != null && powerPointFile.exists()) {

			String extension = FilenameUtils.getExtension(powerPointFile.getName());
			if (StringUtils.equalsIgnoreCase("ppt" , extension)) {

				// Office 2003 (binary) format
				resultText = PowerPoint2003FileUtil.extractTextFromPowerPointFile(powerPointFile , shapeSeparator , slideSeparator);
			} else if (StringUtils.equalsIgnoreCase("pptx" , extension)) {

				// Office 2007 (OOXML) format: must use the XSLF-based utility,
				// the HSLF-based one cannot parse a .pptx package
				resultText = PowerPoint2007FileUtil.extractTextFromPowerPointFile(powerPointFile , shapeSeparator , slideSeparator);
			} else {

				// Unsupported file type: resultText stays null
			}
		}

		return resultText;
	}

}
  统一调用工具类通过文件扩展名(ppt 与 pptx,不区分大小写)判断文件版本,暂时没有想到更好的办法;本工具类基于 POI 3.15 实现,目标机器无须安装 Office 软件即可读取文件内容。

猜你喜欢

转载自chong0660.iteye.com/blog/1923760