Cómo obtener información de coordenadas en archivos PDF según palabras clave

package com.dhcc.zhfc.elesign.util;

import org.apache.commons.lang.StringUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.hibernate.annotations.common.util.StringHelper;

import java.io.*;
import java.util.ArrayList;
import java.util.List;

/**
* @ClassName PdfBoxKeyWordPosition
* @Description TODO
* @Author 86173
* @Date 2020/5/11 15:14 *
@Version 1.0
*/
public class PdfBoxKeyWordPosition extends PDFTextStripper { // Matriz de caracteres de palabras clave private char[] key; / / Ruta del archivo PDF private String pdfPath; byte privado[] fileBytes; // Coordinar la recopilación de información lista privada<float[]> list = new ArrayList<float[]>(); // Colección de información de la página actual lista privada<float[ ] > pagelist = new ArrayList<float[]>(); // Constructor con parámetros public PdfBoxKeyWordPosition(String keyWords, String pdfPath,byte[] bin) throws IOException { super();

super.setSortByPosition(verdadero);
this.pdfPath = pdfPath;
this.fileBytes= bin;
char[] clave = nuevo char[keyWords.length()];
for (int i = 0; i < palabras clave.length(); i++) { clave[i] = palabras clave.charAt(i); } this.key = clave; } public char[] getKey() { clave de retorno; } public void setKey(char[] clave) { this.key = clave; } public String getPdfPath() { return pdfPath; } public void setPdfPath(String pdfPath) { this.pdfPath = pdfPath; }

// 获取坐标信息
lista pública<float[]> getCoordinate() lanza IOException { try { if(!StringHelper.isEmpty(pdfPath)){ documento = PDDocument.load(new File(pdfPath)); } if(document==null&&fileBytes!=null){ documento = PDDocument.load(fileBytes); } int páginas = document.getNumberOfPages(); for (int i = 1; i <= páginas; i++) { pagelist.clear(); super.setSortByPosition(verdadero); super.setStartPage(i); super.setEndPage(i);

Escritor ficticio = new OutputStreamWriter(new ByteArrayOutputStream());
super.writeText(documento, ficticio);
for (float[] li: lista de páginas) { li[2] = i; } lista.addAll(lista de páginas); } lista de retorno; } captura (Excepción e) { e.printStackTrace(); } finalmente { if (documento!= nulo) { document.close(); } } lista de retorno; }

// 获取坐标信息
@Anular
protected void writeString(String string, List<TextPosition> textPositions) lanza IOException { for (int i = 0; i < textPositions.size(); i++) { String fonts = textPositions.get(i) .getFont().getName(); Cadena cadena = textPositions.get(i).getUnicode(); if (str.equals(clave[0] + "")) { int recuento = 0; for (int j = 0; j < key.length-1; j++) { String s = ""; prueba { s = textPositions.get(i + j). obtenerUnicode(); } captura (Excepción e) { s = "";

}
if (s.equals(key[j] + "")) { count++; } } if (count == key.length-1) { float[] idx = new float[3]; // Se necesitan algunos ajustes Haga que el sello cubra la fuente // La longitud de la fuente se agrega aquí a la coordenada X, o puede hacerlo directamente idx[0] = textPositions.get(i).getX() idx[0] = textPositions.get(i ).getX ()+textPositions.get(i).getFontSize(); //La longitud de la fuente restada de la coordenada Y aquí también puede ser directamente idx[1] = textPositions.get(i).getPageHeight()- posiciones de texto.get(i).getY()

idx[1] = textPositions.get(i).getHeight()-textPositions.get(i).getY()-4*textPositions.get(i).getFontSize();
System.out.println("x=" + idx[0] + ",y=" + idx[1]);
lista de páginas.add(idx);
devolver;
}
} }
}

public static void main(String[] args) lanza IOException { String pdfPath = "C:\\Users\\pangq\\Desktop\\555.pdf"; Archivo archivo = nuevo archivo (pdfPath); //PDDocument doc = PDDocument.load(archivo); Palabras clave de cadena = "纪海祥"; //PDImageXObject pdImage = PDImageXObject.createFromFile("C:/Programs/test/sign.png", doc); byte[] bytes = Archivo2byte(archivo); PdfBoxKeyWordPosition pdf = nueva PdfBoxKeyWordPosition(palabras clave, "", bytes); PDPageContentStream contentStream = nulo; Lista<float[]> lista = pdf.getCoordinate(); Lista<Integer> convertResult = convert(lista); Cadena a = convert2String(lista);

// 多页pdf的处理*/
for (float[] fs : lista) { float x = fs[0]; flotante y = fs[1]; } //doc.cerrar(); } byte estático público[] File2byte(Archivo tradeFile){ byte[] buffer = null; FileInputStream fis = nulo; ByteArrayOutputStream bos = nulo; intente { fis = new FileInputStream(tradeFile); bos = nuevo ByteArrayOutputStream(); byte[] b = nuevo byte[1024]; int n; mientras ((n = fis.read(b)) != -1) { bos.write(b, 0, n);

}
fis.cerrar();
bos.cerrar();
buffer = bos.toByteArray();
}catch (FileNotFoundException e){ e.printStackTrace(); }catch (IOException e){ e.printStackTrace(); }finalmente { if(fis !=null){ intenta { fis.close(); }catch (IOException io){ io.printStackTrace(); } } if(bos !=null){ prueba { bos.close();

}catch (IOException io){ io.printStackTrace(); } } } búfer de retorno; }

Lista estática pública<Integer> convert(List<float[]> list){ Lista<Integer> res = new ArrayList<Integer>(); if(list!=null&&list.size()>0) { for (float[] fs : list) { int página = (int) fs[2]; if(!res.contains(página)){ res.add(página); } } } devolver resolución; } cadena estática pública convert2String(List<float[]> list){ List<Integer> res = convert(list); Cadena str = StringUtils.join(res.iterator(),","); devolver cadena; }

/**
* Obtener el número de página en pdf
* @param bystes
* @return
*/
public static int getPdfNubers(byte[] bystes){

int páginas = 0;
ByteArrayInputStream en = nuevo ByteArrayInputStream (bytes);
PDDocumento pdfReader = nulo;
intente { pdfReader = PDDocument.load(in); páginas= pdfReader.getNumberOfPages(); } catch (IOException e) { páginas de retorno; } }

páginas de retorno;

}

Cómo obtener información de coordenadas en archivos PDF según palabras clave

Supongo que te gusta