1. I recently needed this functionality, so I am recording it here for future use.
2. This works essentially like the Ctrl + F search in a PDF viewer; as a consequence, a PDF whose pages are images (for example a scan) contains no searchable text, so keywords in it cannot be located.
import com.itextpdf.awt.geom.Rectangle2D.Float; import com.itextpdf.text.pdf.PdfDictionary; import com.itextpdf.text.pdf.PdfName; import com.itextpdf.text.pdf.PdfReader; import com.itextpdf.text.pdf.parser.*; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.util.ArrayList; import java.util.List; /** * 消失的太阳 */ public class MyTest { public static void main(String[] args) throws{IOException // 1. given file File pdfFile = new new File ( "D: //P44190003265911.pdf" ); // 2. Define a byte array, the length of the file length byte [] = pdfData new new byte [( int ) pdfFile.length ()]; // 3.IO read stream file content byte array to the FileInputStream inputStream = null ; the try { inputStream = new new the FileInputStream (pdfFile); InputStream.read (pdfData); } the catch (IOException E) { the throw E; }the finally { IF (inputStream =! null ) { the try { inputStream.close (); } the catch (IOException E) { } } } // 4. Keyword String keyword = "Sun disappears:" ; // 5. Call method, given keyword and document List < float []> positions = findKeywordPostions (pdfData, keyword); // 6. the return type is List <float []> each list element represents a matching position, respectively float [0] where p float [1] where the x-axis float [2] where the y-axis System.out.println ( "Total:" + positions.size ()); IF (! Positions = null && positions.size ()> 0 ) { for ( a float [] position: Positions) { of System.out.print ( "pageNum:" + ( int ) position [0 ]); the System.out. Print ( "\ TX:" + position [. 
1 ]); System.out.println ( "\ TY:" + position [2 ]); } } } / ** * findKeywordPostions * @param pdfData stream converted by the PDF file IO byte array * @param keyword keyword *@return List<float [ ]> : float[0]:pageNum float[1]:x float[2]:y * @throws IOException */ public static List<float[]> findKeywordPostions(byte[] pdfData, String keyword) throws IOException { List<float[]> result = new ArrayList<>(); List<PdfPageContentPositions> pdfPageContentPositions = getPdfContentPostionsList(pdfData); for (PdfPageContentPositions pdfPageContentPosition : pdfPageContentPositions) { List<float[]> charPositions = findPositions(keyword, pdfPageContentPosition); if (charPositions == null || charPositions.size() < 1) { continue; } result.addAll(charPositions); } return result; } private static List<PdfPageContentPositions> getPdfContentPostionsList(byte[] pdfData) throws IOException { PdfReader reader = new PdfReader(pdfData); List<PdfPageContentPositions> result = new ArrayList<>(); int pages = reader.getNumberOfPages (); for ( int pageno = 1; pageno <= pages pageno ++ ) { float width = reader.getPageSize (pageno) .getWidth (); float height = reader.getPageSize (pageno) .getHeight (); PdfRenderListener pdfRenderListener = new PdfRenderListener (pageno, width, height); // 解析doc,定位位置 PdfContentStreamProcessor processors = new PdfContentStreamProcessor (pdfRenderListener); PdfDictionary pageDic = reader.getPageN (pageno); PdfDictionary resourcesDic = pageDic.getAsDict(PdfName.RESOURCES); try { processor.processContent(ContentByteUtils.getContentBytesForPage(reader, pageNum), resourcesDic); } catch (IOException e) { reader.close(); throw e; } String content = pdfRenderListener.getContent(); List<CharPosition> charPositions = pdfRenderListener.getcharPositions(); List<float[]> positionsList = new ArrayList<>(); for (CharPosition charPosition : charPositions) { float[] positions = new float[]{charPosition.getPageNum(), charPosition.getX(), charPosition.getY()}; 
positionsList.add(positions); } PdfPageContentPositions pdfPageContentPositions = new PdfPageContentPositions(); pdfPageContentPositions.setContent(content); pdfPageContentPositions.setPostions(positionsList); result.add(pdfPageContentPositions); } reader.close(); return result; } private static List<float[]> findPositions(String keyword, PdfPageContentPositions pdfPageContentPositions) { List<float[]> result = new ArrayList<>(); String content = pdfPageContentPositions.getContent(); List<float[]> charPositions = pdfPageContentPositions.getPositions(); for (int pos = 0; pos < content.length(); ) { int positionIndex = content.indexOf(keyword, pos); if (positionIndex == -1) { break; } float[] postions = charPositions.get(positionIndex); result.add(postions); pos = positionIndex + 1; } return result; } private static class PdfPageContentPositions { private String content; private List<float[]> positions; public String getContent() { return content; } public void setContent(String content) { this.content = content; } public List<float[]> getPositions() { return positions; } public void setPostions(List<float[]> positions) { this.positions = positions; } } private static class PdfRenderListener implements RenderListener { private int pageNum; private float pageWidth; private float pageHeight; private StringBuilder contentBuilder = new StringBuilder(); private List<CharPosition> charPositions = new ArrayList<>(); public PdfRenderListener(int pageNum, float pageWidth, float pageHeight) { this.pageNum = pageNum; this.pageWidth = pageWidth; this.pageHeight = pageHeight; } public void beginTextBlock() { } public void renderText(TextRenderInfo renderInfo) { List<TextRenderInfo> characterRenderInfos = renderInfo.getCharacterRenderInfos(); for (TextRenderInfo textRenderInfo : characterRenderInfos) { String word = textRenderInfo.getText(); if (word.length() > 1) { word = word.substring(word.length() - 1, word.length()); } Float rectangle = 
textRenderInfo.getAscentLine().getBoundingRectange(); float x = (float) rectangle.getX (); a float Y = ( a float ) rectangle.getY (); // a float X = (a float) rectangle.getCenterX (); // a float Y = (a float) rectangle.getCenterY (); // Double rectangle.getMinX = X (); // Double rectangle.getMaxY Y = (); // this is a keyword in percentage of two XY axes page where a float xPercent Math.round = (X / pageWidth * 10000) / 10000f ; a float yPercent Math.round = ((. 1 - Y / pageHeight) * 10000) / 10000f; // CharPosition charPosition = new new CharPosition (pageNum, xPercent, yPercent); CharPosition charPosition = new CharPosition(pageNum, (float)x, (float)y); charPositions.add(charPosition); contentBuilder.append(word); } } public void endTextBlock() { } public void renderImage(ImageRenderInfo renderInfo) { } public String getContent() { return contentBuilder.toString(); } public List<CharPosition> getcharPositions() { return charPositions; } } Private static class CharPosition { private int pageno = 0 ; private float x = 0 ; private float y = 0 ; public CharPosition ( int pageno, float x, float y) { this .pageNum = pageno; this x = x; this y = y; } Public int getPageNum () { return pageno; } Public float getX () { return x; } Public float getY () { return y; } @Override public String toString () { return "[pagens =" + this .pageNum + "x =" + this .x + "y =" + this y + "]" ; } } }