pdf转图片、提取pdf文本、提取pdf图片

 
  package com.midevip.common.util;
  import com.itextpdf.text.pdf.PdfReader;
  import net.coobird.thumbnailator.Thumbnails;
  import org.apache.pdfbox.cos.COSName;
  import org.apache.pdfbox.pdmodel.*;
  import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
  import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
  import org.apache.pdfbox.rendering.PDFRenderer;
  import org.apache.pdfbox.text.PDFTextStripper;

  import java.awt.image.BufferedImage;
  import java.io.File;
  import java.io.IOException;
  import java.io.InputStream;
  import java.text.SimpleDateFormat;
  import java.util.Calendar;
  import java.util.Iterator;
  import javax.imageio.IIOImage;
  import javax.imageio.ImageIO;
  import javax.imageio.ImageWriteParam;
  import javax.imageio.ImageWriter;
  import javax.imageio.stream.ImageOutputStream;
   
  /**
  * 使用pdfbox提取pdf文档的文字和图片内容
  * pdfbox官网:https://pdfbox.apache.org/
  * maven依赖如下:
  * <dependency>
  * <groupId>org.apache.pdfbox</groupId>
  * <artifactId>fontbox</artifactId>
  * <version>2.0.1</version>
  * </dependency>
  * <dependency>
  * <groupId>org.apache.pdfbox</groupId>
  * <artifactId>pdfbox</artifactId>
  * <version>2.0.1</version>
  * </dependency>
  * <dependency>
  * <groupId>com.itextpdf</groupId>
  * <artifactId>itextpdf</artifactId>
  * <version>5.5.13</version>
  * </dependency>
  * <dependency>
  * <groupId>net.coobird</groupId>
  * <artifactId>thumbnailator</artifactId>
  * <version>0.4.8</version>
  * </dependency>
  */
  public class PdfTest {
  public static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
   
  /**
  *
  *
  * @param pdfFilePath
  * @throws Exception
  */
  public static void extractText(String pdfFilePath) throws Exception{
  try (PDDocument document = PDDocument.load(new File(pdfFilePath)))
  {
  AccessPermission ap = document.getCurrentAccessPermission();
  if (!ap.canExtractContent())
  {
  throw new IOException("You do not have permission to extract text");
  }
  PDFTextStripper stripper = new PDFTextStripper();
  stripper.setSortByPosition(true);
   
  for (int p = 1; p <= document.getNumberOfPages(); ++p)
  {
  // 这里分为一页一页的提取,如果不设置,默认会把所有页的内容一次性提取出来,根据需要选择
  stripper.setStartPage(p);
  stripper.setEndPage(p);
   
  //提取内容就这一行代码
  //提取内容很彻底,包括了页眉页脚的内容也都会被提出来
  String text = stripper.getText(document);
   
  String pageStr = String.format("page %d:", p);
  System.out.println(pageStr);
  //为了打印出来更美观
  for (int i = 0; i < pageStr.length(); ++i)
  {
  System.out.print("-");
  }
  System.out.println();
  System.out.println(text.trim());
  System.out.println();
  }
  }
  }
   
  public static void pdfParse(String pdfPath) throws Exception {
  InputStream input = null;
  PDDocument document = null;
  try {
  document = PDDocument.load(new File(pdfPath));
   
  /** 文档属性信息 **/
  PDDocumentInformation info = document.getDocumentInformation();
  System.out.println("标题:" + info.getTitle());
  System.out.println("主题:" + info.getSubject());
  System.out.println("作者:" + info.getAuthor());
  System.out.println("关键字:" + info.getKeywords());
   
  System.out.println("应用程序:" + info.getCreator());
  System.out.println("pdf 制作程序:" + info.getProducer());
   
  System.out.println("作者:" + info.getTrapped());
   
  System.out.println("创建时间:" + dateFormat(info.getCreationDate()));
  System.out.println("修改时间:" + dateFormat(info.getModificationDate()));
   
   
  //获取内容信息
  PDFTextStripper pts = new PDFTextStripper();
  String content = pts.getText(document);
  System.out.println("内容:" + content);
   
  /** 文档页面信息 **/
  PDDocumentCatalog cata = document.getDocumentCatalog();
  int count = 1;
  for (int i = 0; i < document.getNumberOfPages(); i++) {
  PDPage page = document.getPage(i);
  if (null != page) {
  //获取到所有rescourse信息
  PDResources res = page.getResources();
  Iterable<COSName> xit = res.getXObjectNames();
  Iterator<COSName> iterator = xit.iterator();
  while (iterator.hasNext()){
  COSName cosName = iterator.next();
  System.out.println(cosName.getName());
  //判断是否图片资源,这个提取图片也很彻底,包括页眉页脚的图片也会被获取到
  if(res.isImageXObject(cosName)){
  PDImageXObject pdImageXObject = (PDImageXObject)res.getXObject(cosName);
  //这里保存图片我用了谷歌的thumbnailator框架,也可以用自己的方法去保存BufferedImage对象到本地图片
  Thumbnails.of(pdImageXObject.getImage()).scale(0.9).toFile(new File("D:\\pdf\\"+System.currentTimeMillis()+".jpg"));
  }
  }
  }
  }
  } catch (Exception e) {
  throw e;
  } finally {
  if (null != input)
  input.close();
  if (null != document)
  document.close();
  }
  }
   
  /***
  * PDF文件转PNG图片,全部页数
  *
  * @param PdfFilePath pdf完整路径
  * @param dpi dpi越大转换后越清晰,相对转换速度越慢
  * @return
  */
  private static boolean pdf2Image(String PdfFilePath, String dstImgFolder, int dpi) {
  File file = new File(PdfFilePath);
  PDDocument pdDocument;
  try {
  String imgPDFPath = file.getParent();
  int dot = file.getName().lastIndexOf('.');
  String imagePDFName = file.getName().substring(0, dot); // 获取图片文件名
  String imgFolderPath = null;
  if (dstImgFolder.equals("")) {
  imgFolderPath = imgPDFPath + File.separator + imagePDFName;// 获取图片存放的文件夹路径
  } else {
  imgFolderPath = dstImgFolder + File.separator + imagePDFName;
  }
   
  if (createDirectory(imgFolderPath)) {
   
  pdDocument = PDDocument.load(file);
  PDFRenderer renderer = new PDFRenderer(pdDocument);
  /* dpi越大转换后越清晰,相对转换速度越慢 */
  PdfReader reader = new PdfReader(PdfFilePath);
  int pages = reader.getNumberOfPages();
  StringBuffer imgFilePath = null;
  for (int i = 0; i < pages; i++) {
  String imgFilePathPrefix = imgFolderPath + File.separator + imagePDFName;
  imgFilePath = new StringBuffer();
  imgFilePath.append(imgFilePathPrefix);
  imgFilePath.append("_");
  imgFilePath.append(String.valueOf(formatNumber(i+1)));
  imgFilePath.append(".jpg");
  File dstFile = new File(imgFilePath.toString());
  BufferedImage image = renderer.renderImageWithDPI(i, dpi);
   
   
  ImageWriter writer = ImageIO.getImageWritersByFormatName("jpg").next();
  writer.setOutput(ImageIO.createImageOutputStream(dstFile));
  ImageWriteParam param = writer.getDefaultWriteParam();
  param.setCompressionMode(ImageWriteParam.MODE_EXPLICIT);
  param.setCompressionQuality(0.3f);
  writer.write(null, new IIOImage(image, null, null), param);
   
  // ImageIO.write(image, "jpg", dstFile);
  }
  System.out.println("PDF文档转图片成功!"+dstImgFolder);
  return true;
  } else {
  System.out.println("PDF文档转图片失败:" + "创建" + imgFolderPath + "失败");
  }
   
  } catch (IOException e) {
  e.printStackTrace();
  }
  return false;
  }
   
  private static String formatNumber(int i){
  if(i<10){
  return "00"+i;
  }else if(i<100){
  return "0"+i;
  }else{
  return i+"";
  }
  }
   
  private static boolean createDirectory(String folder) {
  File dir = new File(folder);
  if (dir.exists()) {
  return true;
  } else {
  return dir.mkdirs();
  }
  }
   
  public static String dateFormat(Calendar calendar) throws Exception {
  if (null == calendar)
  return null;
  String date = null;
  try {
  String pattern = DATE_FORMAT;
  SimpleDateFormat format = new SimpleDateFormat(pattern);
  date = format.format(calendar.getTime());
  } catch (Exception e) {
  throw e;
  }
  return date == null ? "" : date;
  }
   
  }

猜你喜欢

转载自www.cnblogs.com/ysySelf/p/10239517.html