1. Add the Maven dependencies

insert image description here

 <dependency>
            <groupId>com.aspose.cells</groupId>
            <artifactId>aspose-cells</artifactId>
            <version>cell-8.5.2</version>
            <scope>system</scope>
            <systemPath>${project.basedir}/src/main/resources/lib/aspose-cells-17.7.jar</systemPath>
        </dependency>

 <!-- https://mvnrepository.com/artifact/com.itextpdf/itextpdf -->
        <dependency>
            <groupId>com.itextpdf</groupId>
            <artifactId>itextpdf</artifactId>
            <version>5.5.12</version>
        </dependency>

2. Excel to PDF

/**
     * Converts an Excel workbook to a PDF file written next to the source file.
     * The output keeps the source file's directory and base name and uses the
     * {@code .pdf} extension.
     *
     * @param fileFullPath full path of the source workbook, e.g. /home/2022-07/保密协议.xlsx
     * @return full path of the generated PDF, or {@code null} when the license
     *         could not be applied or the conversion failed
     */
    public static String excelPdf(String fileFullPath) {
        // The license must be applied first; otherwise the generated PDF carries a watermark.
        if (!getLicense()) {
            return null;
        }
        try {
            int nameStart = fileFullPath.lastIndexOf("/") + 1;
            int extStart = fileFullPath.lastIndexOf(".");
            if (extStart < nameStart) {
                // No extension in the file name itself (a dot may still occur in a directory name).
                extStart = fileFullPath.length();
            }
            String outFileFullPath = fileFullPath.substring(0, extStart) + ".pdf";
            File pdfFile = new File(outFileFullPath); // output location

            // try-with-resources closes both streams on every path, including failures
            // that happen before the output stream has been opened.
            try (FileInputStream excelStream = new FileInputStream(fileFullPath)) {
                // The workbook is loaded into memory first and then rendered to PDF.
                Workbook wb = new Workbook(excelStream);
                try (FileOutputStream fileOS = new FileOutputStream(pdfFile)) {
                    PdfSaveOptions pdfSaveOptions = new PdfSaveOptions();
                    // true: put the content of each sheet on a single PDF page.
                    pdfSaveOptions.setOnePagePerSheet(true);
                    wb.save(fileOS, pdfSaveOptions);
                }
            }
            return outFileFullPath;
        } catch (Exception e) {
            // Log with the cause attached instead of printing the stack trace separately.
            log.error("excel转换异常", e);
            return null;
        }
    }

    /**
     * Loads {@code license.xml} from the classpath and applies it to Aspose.
     * Place the file in the application's resource directory.
     *
     * @return {@code true} when the license was applied without an exception,
     *         {@code false} otherwise
     */
    public static boolean getLicense() {
        // try-with-resources closes the license stream, which was previously leaked.
        // A null resource (file missing) is tolerated by try-with-resources and is
        // passed on to setLicense exactly as before.
        try (InputStream is = FileUtil.class.getClassLoader().getResourceAsStream("license.xml")) {
            License aposeLic = new License();
            aposeLic.setLicense(is);
            return true;
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
    }

3. Word to PDF

    /**
     * Converts a Word (DOCX) document to PDF.
     *
     * @param doc path of the source Word document
     * @param pdf path of the PDF file to write
     * @return the {@code pdf} argument; it is returned even when the conversion
     *         failed, in which case the error has been printed
     */
    public static String wordTurnPdf(String doc, String pdf) {
        IConverter converter = null;
        // try-with-resources closes both streams on success and on failure.
        try (InputStream docStream = new FileInputStream(doc);
             OutputStream pdfStream = new FileOutputStream(new File(pdf))) {
            converter = LocalConverter.builder().build();
            converter.convert(docStream).as(DocumentType.DOCX).to(pdfStream).as(DocumentType.PDF).execute();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Always release the converter, even when the conversion threw.
            if (converter != null) {
                try {
                    converter.shutDown();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
        return pdf;
    }

4. PDF keyword positioning and recognition

package com.datago.robot.common.utils;



import com.itextpdf.awt.geom.Rectangle2D.Float;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.*;
import lombok.extern.slf4j.Slf4j;

import java.io.*;
import java.util.ArrayList;
import java.util.List;

/**
 * 获取pdf 劳模所在页数
 * HBO
 * 2022-10-27
 */
@Slf4j
public class ReadPDF {


    public static List<Integer> readPDF(String pdfFilePath, String keyword) {
        //1.给定文件
//        File pdfFile = new File("C:\\Users\\Administrator\\Desktop\\劳模先进基本信息模板-有数据.pdf");
        File pdfFile = new File(pdfFilePath);
        //2.定义一个byte数组，长度为文件的长度
        byte[] pdfData = new byte[(int) pdfFile.length()];
        //3.IO流读取文件内容到byte数组
        FileInputStream inputStream = null;
        try {
            inputStream = new FileInputStream(pdfFile);
            inputStream.read(pdfData);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (IOException e) {
                }
            }
        }
        //4.指定关键字
//        String keyword = "分部";
        //5.调用方法，给定关键字和文件
        List<float[]> positions = null;
        try {
            positions = findKeywordPostions(pdfData, keyword);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        //6.返回值类型是 List<float[]> 每个list元素代表一个匹配的位置，分别为 float[0]所在页码 float[1]所在x轴 float[2]所在y轴
//        System.out.println("total:" + positions.size());
        List<  Integer> list=new ArrayList();
        if (positions != null && positions.size() > 0) {
            for (float[] position : positions) {
//                System.out.print("pageNum: " + (int) position[0]);
//                System.out.print("\tx: " + position[1]);
//                System.out.println("\ty: " + position[2]);
//                map.put("pageNum", (int) position[0]);
                list.add((int) position[0]);
            }
        }
        return list;
    }

    /**
     * findKeywordPostions
     *
     * @param pdfData 通过IO流 PDF文件转化的byte数组
     * @param keyword 关键字
     * @return List<float [ ]> : float[0]:pageNum float[1]:x float[2]:y
     * @throws IOException
     */
    public static List<float[]> findKeywordPostions(byte[] pdfData, String keyword) throws IOException {
        List<float[]> result = new ArrayList<>();
        List<PdfPageContentPositions> pdfPageContentPositions = getPdfContentPostionsList(pdfData);
        for (PdfPageContentPositions pdfPageContentPosition : pdfPageContentPositions) {
            List<float[]> charPositions = findPositions(keyword, pdfPageContentPosition);
            if (charPositions == null || charPositions.size() < 1) {
                continue;
            }
            result.addAll(charPositions);
        }
        return result;
    }

    private static List<PdfPageContentPositions> getPdfContentPostionsList(byte[] pdfData) throws IOException {
        PdfReader reader = new PdfReader(pdfData);
        List<PdfPageContentPositions> result = new ArrayList<>();
        int pages = reader.getNumberOfPages();
        for (int pageNum = 1; pageNum <= pages; pageNum++) {
            float width = reader.getPageSize(pageNum).getWidth();
            float height = reader.getPageSize(pageNum).getHeight();
            PdfRenderListener pdfRenderListener = new PdfRenderListener(pageNum, width, height);
            //解析pdf，定位位置
            PdfContentStreamProcessor processor = new PdfContentStreamProcessor(pdfRenderListener);
            PdfDictionary pageDic = reader.getPageN(pageNum);
            PdfDictionary resourcesDic = pageDic.getAsDict(PdfName.RESOURCES);
            try {
                processor.processContent(ContentByteUtils.getContentBytesForPage(reader, pageNum), resourcesDic);
            } catch (IOException e) {
                reader.close();
                throw e;
            }
            String content = pdfRenderListener.getContent();
            List<CharPosition> charPositions = pdfRenderListener.getcharPositions();
            List<float[]> positionsList = new ArrayList<>();
            for (CharPosition charPosition : charPositions) {
                float[] positions = new float[]{charPosition.getPageNum(), charPosition.getX(), charPosition.getY()};
                positionsList.add(positions);
            }
            PdfPageContentPositions pdfPageContentPositions = new PdfPageContentPositions();
            pdfPageContentPositions.setContent(content);
            pdfPageContentPositions.setPostions(positionsList);
            result.add(pdfPageContentPositions);
        }
        reader.close();
        return result;
    }

    private static List<float[]> findPositions(String keyword, PdfPageContentPositions pdfPageContentPositions) {
        List<float[]> result = new ArrayList<>();
        String content = pdfPageContentPositions.getContent();
        List<float[]> charPositions = pdfPageContentPositions.getPositions();
        for (int pos = 0; pos < content.length(); ) {
            int positionIndex = content.indexOf(keyword, pos);
            if (positionIndex == -1) {
                break;
            }
            float[] postions = charPositions.get(positionIndex);
            result.add(postions);
            pos = positionIndex + 1;
        }
        return result;
    }

    private static class PdfPageContentPositions {
        private String content;
        private List<float[]> positions;

        public String getContent() {
            return content;
        }

        public void setContent(String content) {
            this.content = content;
        }

        public List<float[]> getPositions() {
            return positions;
        }

        public void setPostions(List<float[]> positions) {
            this.positions = positions;
        }
    }

    private static class PdfRenderListener implements RenderListener {
        private int pageNum;
        private float pageWidth;
        private float pageHeight;
        private StringBuilder contentBuilder = new StringBuilder();
        private List<CharPosition> charPositions = new ArrayList<>();

        public PdfRenderListener(int pageNum, float pageWidth, float pageHeight) {
            this.pageNum = pageNum;
            this.pageWidth = pageWidth;
            this.pageHeight = pageHeight;
        }

        public void beginTextBlock() {
        }

        public void renderText(TextRenderInfo renderInfo) {
            List<TextRenderInfo> characterRenderInfos = renderInfo.getCharacterRenderInfos();
            for (TextRenderInfo textRenderInfo : characterRenderInfos) {
                String word = textRenderInfo.getText();
                if (word.length() > 1) {
                    word = word.substring(word.length() - 1, word.length());
                }
                Float rectangle = textRenderInfo.getAscentLine().getBoundingRectange();
                float x = (float) rectangle.getX();
                float y = (float) rectangle.getY();
                //这两个是关键字在所在页面的XY轴的百分比
                float xPercent = Math.round(x / pageWidth * 10000) / 10000f;
                float yPercent = Math.round((1 - y / pageHeight) * 10000) / 10000f;
                CharPosition charPosition = new CharPosition(pageNum, (float) x, (float) y);
                charPositions.add(charPosition);
                contentBuilder.append(word);
            }
        }

        public void endTextBlock() {
        }

        public void renderImage(ImageRenderInfo renderInfo) {
        }

        public String getContent() {
            return contentBuilder.toString();
        }

        public List<CharPosition> getcharPositions() {
            return charPositions;
        }
    }

    private static class CharPosition {
        private int pageNum = 0;
        private float x = 0;
        private float y = 0;

        public CharPosition(int pageNum, float x, float y) {
            this.pageNum = pageNum;
            this.x = x;
            this.y = y;
        }

        public int getPageNum() {
            return pageNum;
        }

        public float getX() {
            return x;
        }

        public float getY() {
            return y;
        }

        @Override
        public String toString() {
            return "[pageNum=" + this.pageNum + ",x=" + this.x + ",y=" + this.y + "]";
        }
    }

}

5. Aspose dependency packages

aspose-dependency

Identify PDF keywords, page numbers and coordinates in the document

1. Introduce MAVEN dependency

2.Excel to PDF

3. word to PDF

4. PDF keyword positioning and recognition

5. Aspose various dependent packages

Guess you like