java处理pdf代码

版权声明:原创文章转载请声明出处https://blog.csdn.net/qq_40374604 https://blog.csdn.net/qq_40374604/article/details/89337683

pdf转txt

小工具

package utilw;

import java.io.File;
import java.io.FileOutputStream;
import java.io.PrintWriter;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

public class PdfUtil {
    public static void main(String[] args) {
        PdfUtil pdfUtil = new PdfUtil();
        pdfUtil.pdfToTxt();
    }

    /** @param pdf转txt */
    public void pdfToTxt() {
        try {
            // 是否排序
            boolean sort = false;
            // 开始提取页数
            int startPage = 1;
            // 结束提取页数
            int endPage = Integer.MAX_VALUE;
            String content = null;
            PrintWriter writer = null;
            // pdf文本路径
            String path = "E:/数据文件/2019_PDF.pdf";
            // 输出txt文本路径
            String target = "E:/数据文件/2019_PDF.txt";
            PDDocument document = PDDocument.load(new File(path));
            PDFTextStripper pts = new PDFTextStripper();
            endPage = document.getNumberOfPages();
            System.out.println("Total Page: " + endPage);
            pts.setStartPage(startPage);
            pts.setEndPage(endPage);
            try {
                // content就是从pdf中解析出来的文本
                content = pts.getText(document);
                writer = new PrintWriter(new FileOutputStream(target));
                writer.write(content);// 写入文件内容
                writer.flush();
                writer.close();
            } catch (Exception e) {
                throw e;
            } finally {
                if (null != document)
                    document.close();
            }
            System.out.println("Get PDF Content ...");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

猜你喜欢

转载自blog.csdn.net/qq_40374604/article/details/89337683