【Tika】Quick use: Convert file (word/pdf) content to text

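1. Add the Maven dependencies (the utility class below also imports org.json, so add that artifact too if it is not already on your classpath)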

        <dependency>
            <groupId>org.apache.tika</groupId>
            <artifactId>tika-parsers</artifactId>
            <version>1.17</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>3.17</version>
        </dependency>

2. Implement the utility class

package com.xiaobai.util;

import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ToXMLContentHandler;
import org.json.JSONArray;
import org.json.JSONObject;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * SAX content handler that collects the text of a parsed document page by page,
 * plus a few static convenience methods built on Apache Tika.
 *
 * <p>A new page is started every time a {@code <div class="page">} element is
 * opened; all character data reported after that point is appended to that page
 * until the next page starts. Character data reported before the first page is
 * discarded. The XML serialisation of the parent class is intentionally not
 * invoked: this handler only accumulates per-page text.
 *
 * <p>Instances are not safe for concurrent use; create one handler per parse.
 *
 * @author xiaobai
 */
public class ReadContentHandler extends ToXMLContentHandler {

    /** Qualified name of the element that marks a page boundary. */
    private static final String PAGE_TAG = "div";

    /** Value of the {@code class} attribute that identifies a page element. */
    private static final String PAGE_CLASS = "page";

    /** 1-based number of the page currently being collected; 0 before the first page. */
    private int pageNumber = 0;

    /** Text collected so far, keyed by 1-based page number. */
    private final Map<Integer, StringBuilder> pageMap = new HashMap<>();

    /** Creates a handler with no pages collected yet. */
    public ReadContentHandler() {
        super();
    }

    /** Advances to the next page and allocates an empty buffer for it. */
    private void startPage() {
        pageNumber++;
        pageMap.put(pageNumber, new StringBuilder());
    }

    /** Hook invoked when a {@code div} element closes; currently does nothing. */
    private void endPage() {
        // Intentionally empty: a page simply stays open until the next one starts.
    }

    /**
     * Starts a new page when the opened element is {@code <div class="page">}.
     * All other elements are ignored.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes atts)
            throws SAXException {
        if (PAGE_TAG.equals(qName) && PAGE_CLASS.equals(atts.getValue("class"))) {
            startPage();
        }
    }

    /** Calls the (currently empty) end-of-page hook whenever a {@code div} closes. */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if (PAGE_TAG.equals(qName)) {
            endPage();
        }
    }

    /**
     * Appends the reported characters to the current page.
     *
     * <p>Only the slice {@code ch[start .. start + length)} is valid for this event;
     * the rest of the array is parser-owned buffer space and must not be read.
     * Events consisting of a single line feed are skipped.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (length <= 0 || pageNumber <= 0) {
            return;
        }
        // Drop events that carry nothing but one newline.
        if (length == 1 && ch[start] == '\n') {
            return;
        }
        pageMap.get(pageNumber).append(ch, start, length);
    }

    /**
     * Extracts the metadata (author, title, content type, ...) of a file.
     *
     * @param file the file to inspect; the type is auto-detected, so PDF, Word,
     *             HTML and other formats known to Tika are accepted
     * @return the metadata populated by the parser
     * @throws IOException   if the file cannot be opened or read
     * @throws SAXException  if the content handler fails while processing the document
     * @throws TikaException if the document cannot be parsed
     */
    public static Metadata fileData(File file) throws IOException, SAXException, TikaException {
        Metadata metadata = new Metadata();
        // try-with-resources closes the stream even when parsing throws.
        try (FileInputStream input = new FileInputStream(file)) {
            // -1 disables the default write limit of BodyContentHandler, which would
            // otherwise abort the parse with a SAXException on large documents.
            BodyContentHandler textHandler = new BodyContentHandler(-1);
            // AutoDetectParser picks the concrete parser from the detected MIME type.
            AutoDetectParser parser = new AutoDetectParser();
            parser.parse(input, textHandler, metadata, new ParseContext());
        }
        return metadata;
    }

    /**
     * Reads the complete text content of a file.
     *
     * @param file the file to read; txt, Word, Excel, PDF and other formats known
     *             to Tika are accepted
     * @return the extracted plain text
     * @throws TikaException if the document cannot be parsed
     * @throws IOException   if the file cannot be read
     */
    public static String parseText(File file) throws TikaException, IOException {
        return new Tika().parseToString(file);
    }

    /**
     * Reads the text content of a PDF file page by page.
     *
     * @param file the PDF file to read (only PDF is supported)
     * @return a JSON array with one object per page, in page order; each object has
     *         the keys {@code "page"} (1-based page number) and {@code "content"}
     *         (text of that page)
     * @throws Exception if the file cannot be read or parsed, or the JSON cannot be built
     */
    public static JSONArray parsePageToPdf(File file) throws Exception {
        ReadContentHandler handler = new ReadContentHandler();

        // try-with-resources closes the stream even when parsing throws.
        try (FileInputStream input = new FileInputStream(file)) {
            new PDFParser().parse(input, handler, new Metadata(), new ParseContext());
        }

        // Walk the pages by number instead of iterating the HashMap, whose
        // iteration order is not guaranteed.
        JSONArray pages = new JSONArray();
        for (int page = 1; page <= handler.pageNumber; page++) {
            JSONObject pageJson = new JSONObject();
            pageJson.put("page", page);
            pageJson.put("content", handler.pageMap.get(page).toString());
            pages.put(pageJson);
        }
        return pages;
    }
}

parseText(): extracts the full text content of a file (txt, Word, Excel, PDF, etc.)
parsePageToPdf(): extracts the text content of a PDF file page by page

Guess you like

Origin blog.csdn.net/ruisasaki/article/details/131767061