PDF 读取关键字的方法

版权声明：本文为博主原创文章，未经博主允许不得转载。 https://blog.csdn.net/hehyyoulan/article/details/88909526

package com.fineway.hcs.file.util;

import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.ImageRenderInfo;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.RenderListener;
import com.itextpdf.text.pdf.parser.TextRenderInfo;
import org.apache.commons.lang3.StringUtils;
import org.assertj.core.util.Lists;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Utility that searches the text of a PDF file for a keyword using the iText
 * content parser.
 *
 * <p>For every page whose extracted text contains the keyword, one result entry
 * is produced holding the page text, the x/y of the text chunk in which the
 * keyword starts, and the 1-based page number.
 *
 * <p>All working state is local to a call, so repeated calls do not leak
 * results into each other.
 */
public class pdfReaderUtil {

    /** PDF inspected by {@link #main(String[])} when no path argument is given. */
    private static final String DEFAULT_FILE_PATH = "C:\\hhywk\\test.pdf";

    /** Keyword searched by {@link #main(String[])} when no keyword argument is given. */
    private static final String DEFAULT_KEY_WORD = "签章位置";

    /**
     * Demo entry point: searches a PDF for a keyword and prints every hit.
     *
     * @param argus optional arguments: {@code argus[0]} is the PDF path and
     *              {@code argus[1]} the keyword; built-in defaults are used for
     *              any argument that is missing
     */
    public static void main(String[] argus) {
        String filePath = (argus != null && argus.length > 0) ? argus[0] : DEFAULT_FILE_PATH;
        String keyWord = (argus != null && argus.length > 1) ? argus[1] : DEFAULT_KEY_WORD;

        List<Object[]> result = getKeyWords(filePath, keyWord);
        if (result.isEmpty()) {
            System.out.println("No page contains the keyword: " + keyWord);
            return;
        }
        // Print the fields one by one; printing the list directly would only
        // show the identity hash of each Object[] entry.
        for (Object[] hit : result) {
            System.out.println("page=" + hit[3] + ", x=" + hit[1] + ", y=" + hit[2]);
        }
    }

    /**
     * Finds the pages of a PDF whose text contains the given keyword.
     *
     * <p>The page text is the concatenation of all text chunks in rendering
     * order, so a keyword split across adjacent chunks is still found. Only the
     * first occurrence on each page is reported.
     *
     * @param filePath path of the PDF file to read
     * @param keyWord  text to look for; {@code null} never matches
     * @return a new list with one entry per matching page, each entry being
     *         {@code [StringBuilder pageText, Float x, Float y, Integer page]}.
     *         x/y are taken from the baseline bounding rectangle of the chunk
     *         in which the keyword starts. If reading fails with an
     *         {@link IOException}, the error is reported on stderr and the
     *         entries collected so far are returned.
     */
    private static List<Object[]> getKeyWords(String filePath, final String keyWord) {
        List<Object[]> matches = new ArrayList<Object[]>();
        if (keyWord == null) {
            return matches;
        }

        PdfReader pdfReader = null;
        try {
            pdfReader = new PdfReader(filePath);
            int pageCount = pdfReader.getNumberOfPages();
            PdfReaderContentParser parser = new PdfReaderContentParser(pdfReader);

            for (int page = 1; page <= pageCount; page++) {
                // Fresh collector per page so text and positions never mix between pages.
                PageTextCollector collector = new PageTextCollector();
                parser.processContent(page, collector);

                int keywordStart = collector.text.indexOf(keyWord);
                if (keywordStart < 0) {
                    continue;
                }
                com.itextpdf.awt.geom.Rectangle2D.Float bounds = collector.boundsAt(keywordStart);
                matches.add(new Object[] {collector.text, bounds.x, bounds.y, page});
            }
        } catch (IOException e) {
            // Best effort: report the failure and return what was collected so far.
            System.err.println("Failed to read PDF: " + filePath);
            e.printStackTrace();
        } finally {
            if (pdfReader != null) {
                pdfReader.close();
            }
        }
        return matches;
    }

    /**
     * Render listener that accumulates the text of one page and remembers, for
     * each text chunk, where it starts in the accumulated text and its baseline
     * bounding rectangle.
     */
    private static final class PageTextCollector implements RenderListener {

        /** Concatenated text of all chunks rendered so far. */
        private final StringBuilder text = new StringBuilder();

        /** Start offset in {@link #text} of each recorded chunk, in ascending order. */
        private final List<Integer> chunkStarts = new ArrayList<Integer>();

        /** Baseline bounding rectangle of each recorded chunk, parallel to {@link #chunkStarts}. */
        private final List<com.itextpdf.awt.geom.Rectangle2D.Float> chunkBounds =
                new ArrayList<com.itextpdf.awt.geom.Rectangle2D.Float>();

        @Override
        public void renderText(TextRenderInfo textRenderInfo) {
            String chunk = textRenderInfo.getText();
            // Skip null/empty chunks: appending null would insert the literal
            // "null", and an empty chunk cannot contain the start of a match.
            if (chunk == null || chunk.isEmpty()) {
                return;
            }
            chunkStarts.add(text.length());
            chunkBounds.add(textRenderInfo.getBaseline().getBoundingRectange());
            text.append(chunk);
        }

        @Override
        public void renderImage(ImageRenderInfo arg0) {
            // Images are irrelevant for keyword search.
        }

        @Override
        public void endTextBlock() {
            // No per-block bookkeeping required.
        }

        @Override
        public void beginTextBlock() {
            // No per-block bookkeeping required.
        }

        /**
         * Returns the rectangle of the chunk that contains the given offset of
         * {@link #text}.
         *
         * @param offset character offset into the accumulated page text
         * @return the rectangle of the last chunk starting at or before
         *         {@code offset}, or an empty rectangle at (0, 0) when no chunk
         *         was recorded (e.g. an empty keyword on a page without text)
         */
        private com.itextpdf.awt.geom.Rectangle2D.Float boundsAt(int offset) {
            for (int idx = chunkStarts.size() - 1; idx >= 0; idx--) {
                if (chunkStarts.get(idx) <= offset) {
                    return chunkBounds.get(idx);
                }
            }
            return new com.itextpdf.awt.geom.Rectangle2D.Float();
        }
    }
}

猜你喜欢

转载自blog.csdn.net/hehyyoulan/article/details/88909526
今日推荐