Java 读取各类文件的工具类(支持 pdf、doc、docx、txt、ppt、xls、json、md 等,其他类型可自行扩展)

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Calendar;
import java.util.List;

import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.syweb.mydemo.common.model.ImExtInfo;

/**
 * Utility for extracting the text content of local files into {@link ImExtInfo} records.
 *
 * <p>{@link #readXFile(String)} dispatches on the file type to one of the format-specific
 * readers (pdf, docx, doc, txt, ppt, xls, json, md). Every reader fills in the file name,
 * absolute path, trimmed text content and the current time; if reading fails the error is
 * logged and an empty {@code ImExtInfo} is returned instead of throwing.
 *
 * <p>All paths are local file-system paths, not network locations.
 *
 * @author wl
 */
public class FileInfoUtil {

	private static final Logger logger = LoggerFactory.getLogger(FileInfoUtil.class);

	/** Format of the {@code create_time} stamp written on every successful result. */
	private static final String TIMESTAMP_PATTERN = "yyyy-MM-dd HH:mm:ss";

	/**
	 * Reads a file of any supported type, choosing the reader from the file type reported by
	 * {@code FileUtil.getFileType}.
	 *
	 * @param inputPath local path of the file to read
	 * @return the extracted information, or {@code null} when the path is {@code null} or the
	 *         file type is unknown / not supported
	 */
	public static ImExtInfo readXFile(String inputPath) {
		if (inputPath == null) {
			logger.warn("readXFile called with a null path");
			return null;
		}
		File file = new File(inputPath.trim());
		String fileExt = FileUtil.getFileType(file);
		if (fileExt == null) {
			// switch on a null String would throw; treat it like any other unsupported type.
			return null;
		}
		// NOTE(review): matching is case-sensitive; presumably FileUtil.getFileType already
		// returns a lower-case extension - confirm against FileUtil.
		switch (fileExt) {
		case "pdf":
			return readPdfFile(inputPath);
		case "docx":
			return readDocxFile(inputPath);
		case "doc":
			return readDocFile(inputPath);
		case "txt":
			return readTxtFile(inputPath);
		case "ppt":
			return readPPtFile(inputPath);
		case "xls":
			return readXlsFile(inputPath);
		case "json":
		case "md":
			return readTextFile(inputPath);
		default:
			return null;
		}
	}

	/**
	 * Reads a Word 2007+ ({@code .docx}) file.
	 *
	 * <p>Each paragraph is trimmed and the paragraphs are concatenated without a separator.
	 *
	 * @param inputPath local path of the docx file
	 * @return the populated result, or an empty {@code ImExtInfo} if reading failed
	 */
	public static ImExtInfo readDocxFile(String inputPath) {
		try {
			File file = new File(inputPath.trim());
			StringBuilder contentBuilder = new StringBuilder();
			// try-with-resources: the stream is released even when parsing throws.
			try (FileInputStream fis = new FileInputStream(file)) {
				XWPFDocument document = new XWPFDocument(fis);
				for (XWPFParagraph para : document.getParagraphs()) {
					contentBuilder.append(para.getText().trim());
				}
			}
			return buildInfo(file, contentBuilder.toString());
		} catch (Exception e) {
			logger.error("读取docx文件失败: " + inputPath, e);
			return new ImExtInfo();
		}
	}

	/**
	 * Reads a legacy Word ({@code .doc}) file.
	 *
	 * @param inputPath local path of the doc file
	 * @return the populated result, or an empty {@code ImExtInfo} if reading failed
	 */
	public static ImExtInfo readDocFile(String inputPath) {
		try {
			File file = new File(inputPath.trim());
			String text;
			try (FileInputStream fis = new FileInputStream(file)) {
				WordExtractor extractor = new WordExtractor(fis);
				try {
					text = extractor.getText();
				} finally {
					extractor.close();
				}
			}
			return buildInfo(file, text);
		} catch (Exception e) {
			logger.error("读取doc文件失败: " + inputPath, e);
			return new ImExtInfo();
		}
	}

	/**
	 * Reads the text of a PDF file.
	 *
	 * <p>Encrypted documents are not processed: an info message is logged and an empty
	 * {@code ImExtInfo} is returned.
	 *
	 * @param inputPath local path of the PDF file
	 * @return the populated result, or an empty {@code ImExtInfo} if the document is encrypted
	 *         or reading failed
	 */
	public static ImExtInfo readPdfFile(String inputPath) {
		COSDocument cosDoc = null;
		PDDocument pdDoc = null;
		try {
			File file = new File(inputPath.trim());
			String docText;
			try (FileInputStream fis = new FileInputStream(file)) {
				cosDoc = parseDocument(fis);
				if (cosDoc.isEncrypted()) {
					// Stop here instead of continuing to work on a document we cannot read;
					// the finally block below releases it.
					logger.info("该PDF文档是加密文档,无法处理");
					return new ImExtInfo();
				}
				// A single PDDocument wraps the parsed COSDocument for the extraction.
				pdDoc = new PDDocument(cosDoc);
				docText = new PDFTextStripper().getText(pdDoc);
			}
			return buildInfo(file, docText);
		} catch (Exception e) {
			logger.error("读取PDF文件失败: " + inputPath, e);
			return new ImExtInfo();
		} finally {
			// Runs on success, on the encrypted early return and on failure alike.
			closeCOSDocument(cosDoc);
			closePDDocument(pdDoc);
		}
	}

	/**
	 * Reads a plain-text ({@code .txt}) file line by line.
	 *
	 * <p>Lines are joined with {@code '\n'} and the result is trimmed; an empty file yields
	 * empty content.
	 *
	 * @param inputPath local path of the text file
	 * @return the populated result, or an empty {@code ImExtInfo} if reading failed
	 */
	public static ImExtInfo readTxtFile(String inputPath) {
		try {
			File file = new File(inputPath.trim());
			return buildInfo(file, readPlainText(file));
		} catch (Exception e) {
			logger.error("读取txt文件失败: " + inputPath, e);
			return new ImExtInfo();
		}
	}

	/**
	 * Reads a legacy PowerPoint ({@code .ppt}) file.
	 *
	 * @param inputPath local path of the ppt file
	 * @return the populated result, or an empty {@code ImExtInfo} if reading failed
	 */
	public static ImExtInfo readPPtFile(String inputPath) {
		try {
			File file = new File(inputPath.trim());
			String text;
			try (FileInputStream fis = new FileInputStream(file)) {
				PowerPointExtractor extractor = new PowerPointExtractor(fis);
				try {
					text = extractor.getText();
				} finally {
					extractor.close();
				}
			}
			return buildInfo(file, text);
		} catch (Exception e) {
			logger.error("读取ppt文件失败: " + inputPath, e);
			return new ImExtInfo();
		}
	}

	/**
	 * Reads a legacy Excel ({@code .xls}) workbook.
	 *
	 * <p>Formula cells contribute their computed results, and sheet names are included in
	 * the extracted text.
	 *
	 * @param inputPath local path of the xls file
	 * @return the populated result, or an empty {@code ImExtInfo} if reading failed
	 */
	public static ImExtInfo readXlsFile(String inputPath) {
		try {
			File file = new File(inputPath.trim());
			String text;
			try (FileInputStream fis = new FileInputStream(file)) {
				HSSFWorkbook wb = new HSSFWorkbook(new POIFSFileSystem(fis));
				ExcelExtractor extractor = new ExcelExtractor(wb);
				try {
					extractor.setFormulasNotResults(false);
					extractor.setIncludeSheetNames(true);
					text = extractor.getText();
				} finally {
					extractor.close();
				}
			}
			return buildInfo(file, text);
		} catch (Exception e) {
			logger.error("读取xls文件失败: " + inputPath, e);
			return new ImExtInfo();
		}
	}

	/**
	 * Reads a generic text-based file (used for {@code json} and {@code md}).
	 *
	 * <p>The file is read as plain text. It is deliberately not passed through the Word
	 * extractor, which only understands binary Word documents.
	 *
	 * @param inputPath local path of the file
	 * @return the populated result, or an empty {@code ImExtInfo} if reading failed
	 */
	public static ImExtInfo readTextFile(String inputPath) {
		try {
			File file = new File(inputPath.trim());
			return buildInfo(file, readPlainText(file));
		} catch (Exception e) {
			logger.error("读取文件失败: " + inputPath, e);
			return new ImExtInfo();
		}
	}

	/**
	 * Builds the result record for a successfully read file.
	 *
	 * @param file    the file that was read; supplies the title (name) and absolute path
	 * @param content the extracted text; stored trimmed
	 * @return a new {@code ImExtInfo} stamped with the current time
	 */
	private static ImExtInfo buildInfo(File file, String content) {
		ImExtInfo info = new ImExtInfo();
		info.setTitle(file.getName());
		info.setPath(file.getAbsolutePath());
		info.setContent(content.trim());
		// A fresh SimpleDateFormat per call: the class is not thread-safe, so it is not shared.
		info.setCreate_time(new SimpleDateFormat(TIMESTAMP_PATTERN).format(Calendar.getInstance().getTime()));
		return info;
	}

	/**
	 * Reads a whole text file, terminating every line with {@code '\n'}.
	 *
	 * @param file the file to read
	 * @return the file content; empty for an empty file
	 * @throws IOException if the file cannot be opened or read
	 */
	private static String readPlainText(File file) throws IOException {
		StringBuilder text = new StringBuilder();
		// NOTE(review): FileReader decodes with the platform default charset; files in another
		// encoding (e.g. UTF-8 on a GBK system) will be garbled. Left unchanged to keep the
		// existing decoding behaviour for current callers.
		try (BufferedReader br = new BufferedReader(new FileReader(file))) {
			String line;
			while ((line = br.readLine()) != null) {
				text.append(line).append('\n');
			}
		}
		return text.toString();
	}

	/**
	 * Parses a PDF input stream into its low-level COS document.
	 *
	 * @param is stream positioned at the start of the PDF data
	 * @return the parsed document; the caller is responsible for closing it
	 * @throws IOException if the stream cannot be parsed
	 */
	private static COSDocument parseDocument(InputStream is) throws IOException {
		PDFParser parser = new PDFParser(is);
		parser.parse();
		return parser.getDocument();
	}

	/**
	 * Closes a COS document, tolerating {@code null}; a close failure is logged, not thrown.
	 *
	 * @param cosDoc the document to close, may be {@code null}
	 */
	private static void closeCOSDocument(COSDocument cosDoc) {
		if (cosDoc != null) {
			try {
				cosDoc.close();
			} catch (IOException e) {
				logger.warn("Failed to close COSDocument", e);
			}
		}
	}

	/**
	 * Closes a PD document, tolerating {@code null}; a close failure is logged, not thrown.
	 *
	 * @param pdDoc the document to close, may be {@code null}
	 */
	private static void closePDDocument(PDDocument pdDoc) {
		if (pdDoc != null) {
			try {
				pdDoc.close();
			} catch (IOException e) {
				logger.warn("Failed to close PDDocument", e);
			}
		}
	}
}

注意:readXFile 以及其他各读取方法的参数 inputPath 必须是本地磁盘上的文件路径,不支持网络路径(URL)。


猜你喜欢

转载自blog.csdn.net/sinat_36716743/article/details/80492352