记录:PDF关键字寻找

下面是一个可以直接使用的 demo:基于 Apache PDFBox 查找关键字在 PDF 中出现的位置(坐标和页码)。

package com.sign;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;

/**
 * Finds every occurrence of a keyword in a PDF and reports a coordinate and page
 * number for each one.
 *
 * <p>The class plugs into {@link PDFTextStripper}: the stripper is run one page at a
 * time and {@link #writeString(String, List)} inspects the glyphs it is handed instead
 * of writing them out. A keyword is only detected when all of its characters arrive in
 * a single {@code writeString} call; occurrences split across calls are not matched.
 *
 * <p>Each result is a {@code float[3]}:
 * <ul>
 *   <li>{@code [0]} - X of the first matched glyph plus
 *       {@code key.length * width(first glyph) / 2}</li>
 *   <li>{@code [1]} - Y of the first matched glyph minus its height</li>
 *   <li>{@code [2]} - 1-based page number</li>
 * </ul>
 *
 * <p>Instances keep per-scan state in a field, so one instance should not run
 * {@link #getPosition()} from several threads at once.
 */
public class BoxKeyPosition extends PDFTextStripper {

    /** Keyword to search for, one array element per character. */
    private char[] key;

    /** Raw bytes of the PDF document to scan. */
    private byte[] src;

    /** Matches collected by {@link #writeString(String, List)} for the page being processed. */
    private final List<float[]> pagelist = new ArrayList<float[]>();

    /**
     * Creates a finder for the given keyword and PDF content.
     *
     * @param keyWords keyword to look for; must not be {@code null}
     * @param src      raw bytes of the PDF document
     * @throws IOException if the underlying {@link PDFTextStripper} cannot be created
     */
    public BoxKeyPosition(String keyWords, byte[] src) throws IOException {
        super();
        super.setSortByPosition(true);
        this.src = src;
        this.key = keyWords.toCharArray();
    }

    /**
     * @return the keyword characters (the internal array, not a copy)
     */
    public char[] getKey() {
        return key;
    }

    /**
     * @param key keyword characters to search for on the next {@link #getPosition()} call
     */
    public void setKey(char[] key) {
        this.key = key;
    }

    /**
     * @return the raw PDF bytes (the internal array, not a copy)
     */
    public byte[] getSrc() {
        return src;
    }

    /**
     * @param src raw PDF bytes to scan on the next {@link #getPosition()} call
     */
    public void setSrc(byte[] src) {
        this.src = src;
    }

    /**
     * Scans the whole document and returns the location of every keyword occurrence.
     *
     * <p>Every call starts from an empty result, so calling this method repeatedly
     * yields the same matches each time instead of accumulating duplicates. A
     * {@code null} or empty keyword produces an empty list.
     *
     * @return a new list with one {@code float[3]} ({@code x, y, page}) per occurrence,
     *         in page order
     * @throws IOException if the PDF cannot be loaded or its text cannot be extracted
     */
    public List<float[]> getPosition() throws IOException {
        List<float[]> result = new ArrayList<float[]>();
        PDDocument doc = PDDocument.load(src);
        try {
            // Re-assert position sorting in case a caller switched it off after construction.
            super.setSortByPosition(true);
            int pages = doc.getNumberOfPages();

            for (int page = 1; page <= pages; page++) {
                pagelist.clear();
                // Restrict the stripper to a single page so every match collected
                // during this pass is known to belong to that page.
                super.setStartPage(page);
                super.setEndPage(page);
                // The extracted text itself is not needed; it goes to a throwaway writer.
                Writer sink = new OutputStreamWriter(new ByteArrayOutputStream());
                super.writeText(doc, sink);
                for (float[] match : pagelist) {
                    match[2] = page;
                }
                result.addAll(pagelist);
            }
            return result;
        } finally {
            pagelist.clear();
            doc.close();
        }
    }

    /**
     * Called by the stripper with a run of positioned glyphs; records the location of
     * every keyword occurrence inside that run. Nothing is written to the output.
     *
     * @param string        text of the run (unused)
     * @param textPositions positioned glyphs making up the run
     * @throws IOException never thrown by this implementation; declared by the overridden method
     */
    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
        if (key == null || key.length == 0) {
            return; // nothing to search for
        }
        // A match starting after this index would run past the end of the glyph list.
        int lastStart = textPositions.size() - key.length;
        for (int i = 0; i <= lastStart; i++) {
            if (!matchesAt(textPositions, i)) {
                continue;
            }
            TextPosition first = textPositions.get(i);
            float[] idx = new float[3];
            idx[0] = first.getX() + key.length * first.getWidth() / 2;
            idx[1] = first.getY() - first.getHeight();
            // idx[2] (page number) is filled in by getPosition() once the page is done.
            pagelist.add(idx);
        }
    }

    /**
     * Tells whether the glyphs starting at {@code start} spell the keyword, comparing
     * one glyph per keyword character. The caller guarantees that
     * {@code start + key.length <= textPositions.size()}.
     */
    private boolean matchesAt(List<TextPosition> textPositions, int start) {
        for (int j = 0; j < key.length; j++) {
            String glyph = textPositions.get(start + j).getUnicode();
            if (!String.valueOf(key[j]).equals(glyph)) {
                return false;
            }
        }
        return true;
    }
}

package com.sign;



import java.io.*;
import java.util.List;

/**
 * Command-line demo for {@link BoxKeyPosition}: reads a PDF file into memory, searches
 * it for a keyword and prints one line per occurrence.
 *
 * <p>Usage: {@code SignPostionTest [pdfPath [keyword]]}. Arguments that are omitted
 * fall back to the built-in defaults.
 *
 * <p>Created 2018/12/15.
 *
 * @author MG01857
 * @version 1.0
 */
public class SignPostionTest {

    /** PDF scanned when no path is given on the command line. */
    private static final String DEFAULT_PDF_PATH =
            "C:\\Users\\MG01857\\Desktop\\pdf生成浏览\\世联信贷征信查询授权书_word转PDF_黄智炜.pdf";

    /** Keyword searched for when none is given on the command line. */
    private static final String DEFAULT_KEYWORD = "borrower";

    /**
     * Reads the stream to its end and returns everything that was read.
     * The stream is not closed.
     *
     * @param in stream to drain
     * @return all bytes read from {@code in}
     * @throws IOException if reading fails
     */
    private static byte[] toByteArray(InputStream in) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        byte[] buffer = new byte[1024 * 4];
        int n;
        while ((n = in.read(buffer)) != -1) {
            out.write(buffer, 0, n);
        }
        return out.toByteArray();
    }

    /**
     * Runs the keyword search and prints each match as {@code [x, y, page]}.
     *
     * @param args optional: {@code args[0]} is the PDF path, {@code args[1]} the keyword
     * @throws Exception if the file cannot be read or the PDF cannot be processed
     */
    public static void main(String[] args) throws Exception {
        String filePath = (args != null && args.length > 0) ? args[0] : DEFAULT_PDF_PATH;
        String keyword = (args != null && args.length > 1) ? args[1] : DEFAULT_KEYWORD;

        byte[] data;
        InputStream in = new FileInputStream(filePath);
        try {
            data = toByteArray(in);
        } finally {
            // Close the file even when reading fails.
            in.close();
        }

        BoxKeyPosition boxKeyPosition = new BoxKeyPosition(keyword, data);
        List<float[]> position = boxKeyPosition.getPosition();
        for (float[] f : position) {
            // Calling toString() on an array prints only its type and identity hash;
            // Arrays.toString prints the actual values.
            System.out.println(java.util.Arrays.toString(f));
        }
    }
}

转载于:https://www.jianshu.com/p/60a72111ba85

猜你喜欢

转载自blog.csdn.net/weixin_34326179/article/details/91078170