Java: searching a PDF for a keyword and getting its page number and coordinates

1. I recently had a requirement in this area, so I am recording the solution here after using it.

2. This works essentially like Ctrl + F in a PDF viewer: if the PDF is made up of images (e.g. a scan) rather than text, locating keywords is not supported.

import com.itextpdf.awt.geom.Rectangle2D.Float;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.*;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * 消失的太阳
 */
public class MyTest {

    public static void main(String[] args) throws{IOException
         // 1. given file 
        File pdfFile = new new File ( "D: //P44190003265911.pdf" );
         // 2. Define a byte array, the length of the file length 
        byte [] = pdfData new new  byte [( int ) pdfFile.length ()]; 

        // 3.IO read stream file content byte array to 
        the FileInputStream inputStream = null ;
         the try { 
            inputStream = new new the FileInputStream (pdfFile); 
            InputStream.read (pdfData); 
        } the catch (IOException E) {
             the throw E; 
        }the finally {
             IF (inputStream =! null ) {
                 the try { 
                    inputStream.close (); 
                } the catch (IOException E) { 
                } 
            } 
        } 

        // 4. Keyword 
        String keyword = "Sun disappears:" ; 

        // 5. Call method, given keyword and document 
        List < float []> positions = findKeywordPostions (pdfData, keyword); 

        // 6. the return type is List <float []> each list element represents a matching position, respectively float [0] where p float [1] where the x-axis float [2] where the y-axis 
        System.out.println ( "Total:" + positions.size ());
        IF (! Positions = null && positions.size ()> 0 ) {
             for ( a float [] position: Positions) { 
                of System.out.print ( "pageNum:" + ( int ) position [0 ]); 
                the System.out. Print ( "\ TX:" + position [. 1 ]); 
                System.out.println ( "\ TY:" + position [2 ]); 
            } 
        } 
    } 


    / ** 
     * findKeywordPostions 
     * @param pdfData stream converted by the PDF file IO byte array 
     * @param keyword keyword 
     *@return List<float [ ]> : float[0]:pageNum float[1]:x float[2]:y
     * @throws IOException
     */
    public static List<float[]> findKeywordPostions(byte[] pdfData, String keyword) throws IOException {
        List<float[]> result = new ArrayList<>();
        List<PdfPageContentPositions> pdfPageContentPositions = getPdfContentPostionsList(pdfData);


        for (PdfPageContentPositions pdfPageContentPosition : pdfPageContentPositions) {
            List<float[]> charPositions = findPositions(keyword, pdfPageContentPosition);
            if (charPositions == null || charPositions.size() < 1) {
                continue;
            }
            result.addAll(charPositions);
        }
        return result;
    }


    private static List<PdfPageContentPositions> getPdfContentPostionsList(byte[] pdfData) throws IOException {
        PdfReader reader = new PdfReader(pdfData);


        List<PdfPageContentPositions> result = new ArrayList<>(); 


        int pages = reader.getNumberOfPages ();
        for ( int pageno = 1; pageno <= pages pageno ++ ) {
             float width = reader.getPageSize (pageno) .getWidth ();
            float height = reader.getPageSize (pageno) .getHeight (); 


            PdfRenderListener pdfRenderListener = new PdfRenderListener (pageno, width, height); 


            // 解析doc,定位位置 
            PdfContentStreamProcessor processors = new PdfContentStreamProcessor (pdfRenderListener); 
            PdfDictionary pageDic = reader.getPageN (pageno);
            PdfDictionary resourcesDic = pageDic.getAsDict(PdfName.RESOURCES);
            try {
                processor.processContent(ContentByteUtils.getContentBytesForPage(reader, pageNum), resourcesDic);
            } catch (IOException e) {
                reader.close();
                throw e;
            }


            String content = pdfRenderListener.getContent();
            List<CharPosition> charPositions = pdfRenderListener.getcharPositions();


            List<float[]> positionsList = new ArrayList<>();
            for (CharPosition charPosition : charPositions) {
                float[] positions = new float[]{charPosition.getPageNum(), charPosition.getX(), charPosition.getY()};
                positionsList.add(positions);
            }


            PdfPageContentPositions pdfPageContentPositions = new PdfPageContentPositions();
            pdfPageContentPositions.setContent(content);
            pdfPageContentPositions.setPostions(positionsList);


            result.add(pdfPageContentPositions);
        }
        reader.close();
        return result;
    }


    private static List<float[]> findPositions(String keyword, PdfPageContentPositions pdfPageContentPositions) {


        List<float[]> result = new ArrayList<>();


        String content = pdfPageContentPositions.getContent();
        List<float[]> charPositions = pdfPageContentPositions.getPositions();


        for (int pos = 0; pos < content.length(); ) {
            int positionIndex = content.indexOf(keyword, pos);
            if (positionIndex == -1) {
                break;
            }
            float[] postions = charPositions.get(positionIndex);
            result.add(postions);
            pos = positionIndex + 1;
        }
        return result;
    }


    private static class PdfPageContentPositions {
        private String content;
        private List<float[]> positions;


        public String getContent() {
            return content;
        }


        public void setContent(String content) {
            this.content = content;
        }


        public List<float[]> getPositions() {
            return positions;
        }


        public void setPostions(List<float[]> positions) {
            this.positions = positions;
        }
    }



    private static class PdfRenderListener implements RenderListener {
        private int pageNum;
        private float pageWidth;
        private float pageHeight;
        private StringBuilder contentBuilder = new StringBuilder();
        private List<CharPosition> charPositions = new ArrayList<>();


        public PdfRenderListener(int pageNum, float pageWidth, float pageHeight) {
            this.pageNum = pageNum;
            this.pageWidth = pageWidth;
            this.pageHeight = pageHeight;
        }


        public void beginTextBlock() {
        }


        public void renderText(TextRenderInfo renderInfo) {
            List<TextRenderInfo> characterRenderInfos = renderInfo.getCharacterRenderInfos();
            for (TextRenderInfo textRenderInfo : characterRenderInfos) {
                String word = textRenderInfo.getText();
                if (word.length() > 1) {
                    word = word.substring(word.length() - 1, word.length());
                }
                Float rectangle = textRenderInfo.getAscentLine().getBoundingRectange();

                float x = (float) rectangle.getX ();
                 a float Y = ( a float ) rectangle.getY ();
 //                 a float X = (a float) rectangle.getCenterX ();
 //                 a float Y = (a float) rectangle.getCenterY ();
 //                 Double rectangle.getMinX = X ();
 //                 Double rectangle.getMaxY Y = (); 




                // this is a keyword in percentage of two XY axes page where 
                a float xPercent Math.round = (X / pageWidth * 10000) / 10000f ;
                 a float yPercent Math.round = ((. 1 - Y / pageHeight) * 10000) / 10000f; 


//                 CharPosition charPosition = new new CharPosition (pageNum, xPercent, yPercent);
                CharPosition charPosition = new CharPosition(pageNum, (float)x, (float)y);
                charPositions.add(charPosition);
                contentBuilder.append(word);
            }
        }


        public void endTextBlock() {
        }


        public void renderImage(ImageRenderInfo renderInfo) {
        }


        public String getContent() {
            return contentBuilder.toString();
        }


        public List<CharPosition> getcharPositions() {
            return charPositions;
        } 
    } 


    Private  static  class CharPosition {
         private  int pageno = 0 ;
        private  float x = 0 ;
        private  float y = 0 ; 


        public CharPosition ( int pageno, float x, float y) {
             this .pageNum = pageno;
            this x = x;
            this y = y; 
        } 


        Public  int getPageNum () {
             return pageno;
        } 


        Public  float getX () {
             return x; 
        } 


        Public  float getY () {
             return y; 
        } 


        @Override 
        public String toString () {
             return "[pagens =" + this .pageNum + "x =" + this .x + "y =" + this y + "]" ; 
        } 
    } 
}

 

Guess you like

Origin www.cnblogs.com/xsdty/p/11463174.html