PDF parser

Table of contents

To run this program, you must first install a Java runtime environment (JDK). Search online for instructions on installing the JDK on Windows.

1 GUI

This program uses Java to extract text from PDF files, convert them to Word, convert them to Excel, and extract their images. The GUI code is as follows:

package com.example.yrz;

/**
 * @Author yrz
 * @create 2023/6/20 17:45
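 * @Description Swing window that lets the user pick a PDF file and extract its text, convert it to Word or Excel, or extract its images.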
 */


import com.example.yrz.authorization.AuthorizationCheck;
import com.example.yrz.encoder.PDFEncoder;

import javax.swing.*;
import javax.swing.filechooser.FileNameExtensionFilter;
import java.awt.*;
import java.awt.event.*;
import java.io.*;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;

/**
 * Main window of the PDF parser.
 *
 * <p>The window shows a row of radio buttons selecting the operation (extract text, convert to
 * Word, convert to Excel, extract images), a button that opens a file chooser, and a read-only
 * log area. Once a file is chosen, the selected operation is delegated to {@link PDFEncoder} on a
 * background thread so the window stays responsive, and the outcome is appended to the log.
 */
public class FileChooser extends JFrame implements ActionListener, ItemListener {
    JButton openButton;
    JPanel radioPanel;
    ButtonGroup radioGroup;
    JTextArea log;
    JFileChooser fc;
    JRadioButton extractTextButton;
    JRadioButton convertToWordButton;
    JRadioButton convertToExcelButton;
    JRadioButton extractImagesButton;
    DateTimeFormatter dateTimeFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
    // Shared selection: 0 = extract text, 1 = to Word, 2 = to Excel, 3 = extract images
    public static Integer PARSE_TYPE = 0;

    /**
     * Builds the window, wires up the listeners and shows it.
     */
    public FileChooser() {
        super("PDF解析器");
        // Stop the process when the window is closed; configured before the frame becomes visible.
        setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);

        openButton = new JButton("点击此处选择文件");
        openButton.addActionListener(this);
        log = new JTextArea(11, 20);
        log.setEditable(false);
        JScrollPane logScrollPane = new JScrollPane(log);
        fc = new JFileChooser();
        // Offer a PDF filter and make it the initially selected one
        // (addChoosableFileFilter alone leaves "All Files" selected).
        FileNameExtensionFilter filter = new FileNameExtensionFilter("PDF Documents", "pdf");
        fc.addChoosableFileFilter(filter);
        fc.setFileFilter(filter);

        // Create the radio button group
        radioPanel = new JPanel(new GridLayout(1, 0));
        radioGroup = new ButtonGroup();
        extractTextButton = new JRadioButton("提取文字");
        convertToWordButton = new JRadioButton("转Word");
        convertToExcelButton = new JRadioButton("转Excel");
        extractImagesButton = new JRadioButton("提取图片");
        radioGroup.add(extractTextButton);
        radioGroup.add(convertToWordButton);
        radioGroup.add(convertToExcelButton);
        radioGroup.add(extractImagesButton);
        radioPanel.add(extractTextButton);
        radioPanel.add(convertToWordButton);
        radioPanel.add(convertToExcelButton);
        radioPanel.add(extractImagesButton);
        // Default selection; done before the listeners are attached, so PARSE_TYPE is not touched here
        extractTextButton.setSelected(true);
        // Track selection changes
        extractTextButton.addItemListener(this);
        convertToWordButton.addItemListener(this);
        convertToExcelButton.addItemListener(this);
        extractImagesButton.addItemListener(this);

        // Lay out the components
        add(radioPanel, BorderLayout.PAGE_START);
        add(openButton, BorderLayout.CENTER);
        add(logScrollPane, BorderLayout.PAGE_END);
        setSize(400, 300);
        setVisible(true);
    }

    /**
     * Handles a click on the open button: asks the user for a file and, if one is chosen, runs the
     * currently selected operation on it in the background.
     *
     * @param e the action event; events from sources other than the open button are ignored
     */
    @Override
    public void actionPerformed(ActionEvent e) {
        if (e.getSource() != openButton) {
            return;
        }
        int returnVal = fc.showOpenDialog(FileChooser.this);
        if (returnVal != JFileChooser.APPROVE_OPTION) {
            appendLog("用户取消选择文件！");
            return;
        }

        final File file = fc.getSelectedFile();
        // Snapshot the selection now so a later radio-button change cannot affect this job;
        // PARSE_TYPE is a public mutable Integer, so guard against null before unboxing.
        final Integer selected = PARSE_TYPE;
        final int parseType = selected == null ? 0 : selected;

        appendLog("正在解析: " + file.getName() + "!");
        // Only one job at a time; re-enabled in done().
        openButton.setEnabled(false);

        // Run the conversion off the event dispatch thread so the "parsing" line is painted
        // immediately and the window does not freeze.
        // NOTE(review): assumes PDFEncoder does not touch Swing components — confirm.
        new SwingWorker<Void, Void>() {
            @Override
            protected Void doInBackground() {
                parse(file, parseType);
                return null;
            }

            @Override
            protected void done() {
                try {
                    get(); // rethrows anything doInBackground threw
                    appendLog("解析完成: " + file.getName() + "!");
                } catch (InterruptedException ex) {
                    Thread.currentThread().interrupt();
                    appendLog("解析失败: " + file.getName() + "!");
                } catch (java.util.concurrent.ExecutionException ex) {
                    Throwable cause = ex.getCause() != null ? ex.getCause() : ex;
                    appendLog("解析失败: " + file.getName() + " (" + cause + ")");
                } finally {
                    openButton.setEnabled(true);
                }
            }
        }.execute();
    }

    /**
     * Dispatches the file to the {@link PDFEncoder} operation matching the given type.
     *
     * @param file      the file chosen by the user
     * @param parseType 1 = to Word, 2 = to Excel, 3 = extract images, anything else = extract text
     */
    private static void parse(File file, int parseType) {
        switch (parseType) {
            case 1:
                // Convert to Word
                PDFEncoder.turnIntoWord(file);
                break;
            case 2:
                // Convert to Excel
                PDFEncoder.turnIntoExcel(file);
                break;
            case 3:
                // Extract images
                PDFEncoder.extractImg(file);
                break;
            default:
                // Extract the text of the PDF
                PDFEncoder.textExtraction(file);
                break;
        }
    }

    /**
     * Appends a timestamped line to the log area and scrolls to the end.
     * Must be called on the event dispatch thread.
     *
     * @param message the text to append after the timestamp
     */
    private void appendLog(String message) {
        log.append(LocalDateTime.now().format(dateTimeFormatter) + message + "\n");
        log.setCaretPosition(log.getDocument().getLength());
    }

    /**
     * Updates {@link #PARSE_TYPE} when one of the radio buttons becomes selected.
     *
     * @param e the item event; deselection events are ignored
     */
    @Override
    public void itemStateChanged(ItemEvent e) {
        if (e.getStateChange() == ItemEvent.SELECTED) {
            if (e.getSource() == extractTextButton) {
                // extract text option selected
                PARSE_TYPE = 0;
            } else if (e.getSource() == convertToWordButton) {
                // convert to Word option selected
                PARSE_TYPE = 1;
            } else if (e.getSource() == convertToExcelButton) {
                // convert to Excel option selected
                PARSE_TYPE = 2;
            } else if (e.getSource() == extractImagesButton) {
                // extract images option selected
                PARSE_TYPE = 3;
            }
        }
    }

    /**
     * Program entry point: runs the authorization check, then opens the window.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Verify the authorization code
        AuthorizationCheck.check();
        // Swing components must be created on the event dispatch thread.
        SwingUtilities.invokeLater(FileChooser::new);
    }
}