Implementing OCR in Java with Tesseract (Tess4J)

1. Implementation logic

package com.vue.demo.service.serviceimpl;

import com.vue.demo.service.OCRService;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
import net.sourceforge.tess4j.util.ImageHelper;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;

/**
 * OCR service backed by Tess4J (the Java wrapper around Tesseract).
 *
 * <p>Every upload is recognised twice: once on the image as uploaded, using the
 * currently selected language, and once on a grayscale/binarised copy that is
 * always recognised as Simplified Chinese. Both texts are returned together.
 *
 * <p>The selected language is kept in a mutable instance field, so it is shared
 * by every caller of this bean.
 *
 * @author yangwj
 * @date 2020/4/1 9:29
 */
@Service
public class OCRServiceImpl implements OCRService {
    private static final Logger LOG = LoggerFactory.getLogger(OCRServiceImpl.class);

    /**
     * Directory holding the Tesseract trained data files
     * (downloadable from https://github.com/tesseract-ocr/tessdata).
     */
    private static final String MODEL_PATH = "/root/project/java/tesseract_model";

    /** Where the preprocessed image is dumped so it can be inspected by hand. */
    private static final String DEBUG_IMAGE_PATH = "/root/project/java/tesseract_model/temp_img";

    /** Tesseract language code for Simplified Chinese. */
    private static final String CHINESE_SIMPLIFIED = "chi_sim";

    /** Value of {@link #language} that selects Chinese for the first OCR pass. */
    private static final String LANGUAGE_CHINESE = "ch";

    /** Separator placed between the two OCR results; kept exactly as callers have always received it. */
    private static final String RESULT_SEPARATOR = "----------------------------------\n\r";

    /** Enlargement factor applied before the second pass; 1 keeps the original size. */
    private static final int SCALE_FACTOR = 1;

    /** Suffix used for the temporary upload copy when the client name has no usable extension. */
    private static final String DEFAULT_SUFFIX = ".tmp";

    /** Longest file extension accepted from the client-supplied file name. */
    private static final int MAX_EXTENSION_LENGTH = 10;

    /** Language selected through {@link #getLanguage(String)}; empty means "use the Tesseract default". */
    String language = "";

    /**
     * Runs OCR on an uploaded picture.
     *
     * @param file the uploaded image
     * @return the raw OCR text, a separator line, and the OCR text of the
     *         preprocessed image (the literal {@code "null"} if that second pass
     *         failed); {@code null} if {@code file} is {@code null} or the upload
     *         could not be stored or recognised
     */
    @Override
    public String getCharacterFromPic(MultipartFile file) {
        if (file == null) {
            LOG.warn("No file supplied for OCR");
            return null;
        }

        // A Tesseract instance is created per call, as before, rather than shared.
        Tesseract tesseract = new Tesseract();
        tesseract.setDatapath(MODEL_PATH);
        if (LANGUAGE_CHINESE.equals(language)) {
            // English is the default; Chinese has to be requested explicitly.
            tesseract.setLanguage(CHINESE_SIMPLIFIED);
        }

        File imageFile = null;
        try {
            // Never build a path from the client-supplied name: it may be null, may
            // contain "../", and two uploads with the same name would overwrite each
            // other. Only a sanitised extension is carried over.
            // NOTE(review): the extension is preserved on the assumption that Tess4J
            // uses it to pick an image reader -- confirm for the Tess4J version in use.
            imageFile = File.createTempFile("ocr-upload-", safeSuffix(file.getOriginalFilename()));
            FileUtils.copyInputStreamToFile(file.getInputStream(), imageFile);

            String result = tesseract.doOCR(imageFile);
            LOG.info("Raw OCR result: {}", result);

            String handleResult = this.ocr(imageFile, MODEL_PATH);
            LOG.info("Preprocessed OCR result: {}", handleResult);

            return result + RESULT_SEPARATOR + handleResult;
        } catch (TesseractException e) {
            LOG.error("Tesseract failed to recognise the uploaded image", e);
        } catch (IOException e) {
            LOG.error("Could not store the uploaded image for OCR", e);
        } finally {
            if (imageFile != null && !imageFile.delete()) {
                LOG.warn("Could not delete temporary OCR file {}", imageFile);
            }
        }
        return null;
    }

    /**
     * Selects the language used by the first OCR pass. Despite its name this
     * method stores the value; it does not read it.
     *
     * @param language language key; {@code "ch"} selects Simplified Chinese
     * @return {@code "success"} if the value was stored, {@code null} if
     *         {@code language} was {@code null} or empty
     */
    @Override
    public String getLanguage(String language) {
        // isEmpty() instead of == "": == compares references, not contents.
        if (language == null || language.isEmpty()) {
            return null;
        }
        this.language = language;
        return "success";
    }

    /**
     * Second OCR pass: converts the image to grayscale, binarises it, rescales it
     * by {@link #SCALE_FACTOR}, binarises it again and recognises the result as
     * Simplified Chinese. The preprocessed image is also written to
     * {@link #DEBUG_IMAGE_PATH}.
     *
     * @param file      image file to read
     * @param modelPath directory holding the Tesseract trained data
     * @return the recognised text, or {@code null} if the image could not be
     *         decoded or any step failed
     */
    private String ocr(File file, String modelPath) {
        String result = null;
        try {
            long start = System.currentTimeMillis();

            BufferedImage textImage = ImageIO.read(file);
            if (textImage == null) {
                // ImageIO.read returns null when no registered reader can decode the file.
                LOG.warn("No ImageIO reader could decode {}", file);
                return null;
            }

            // Grayscale, then black-and-white, to improve the recognition rate.
            textImage = ImageHelper.convertImageToGrayscale(textImage);
            textImage = ImageHelper.convertImageToBinary(textImage);
            // Enlarging can help with small print; the factor is currently 1 (no enlargement).
            textImage = ImageHelper.getScaledInstance(
                    textImage,
                    textImage.getWidth() * SCALE_FACTOR,
                    textImage.getHeight() * SCALE_FACTOR);
            textImage = ImageHelper.convertImageToBinary(textImage);

            saveDebugImage(textImage);

            Tesseract instance = new Tesseract();
            instance.setDatapath(modelPath);
            instance.setLanguage(CHINESE_SIMPLIFIED);
            result = instance.doOCR(textImage);

            long elapsedMillis = System.currentTimeMillis() - start;
            LOG.info("耗时{} s", elapsedMillis / 1000.0);
        } catch (Exception e) {
            // Deliberately broad: this pass is best-effort and must not take down
            // the first-pass result the caller already has.
            LOG.error("OCR on the preprocessed image failed for {}", file, e);
        }
        return result;
    }

    /**
     * Writes the preprocessed image to {@link #DEBUG_IMAGE_PATH}. A failure is
     * logged and otherwise ignored so that it cannot abort recognition.
     *
     * @param image the preprocessed image
     */
    private void saveDebugImage(BufferedImage image) {
        File target = new File(DEBUG_IMAGE_PATH);
        try {
            if (!ImageIO.write(image, "png", target)) {
                LOG.warn("No PNG writer available; preprocessed image not saved to {}", target);
            }
        } catch (IOException e) {
            LOG.warn("Could not save preprocessed image to {}", target, e);
        }
    }

    /**
     * Derives a temp-file suffix from a client-supplied file name.
     *
     * @param originalName the name reported by the client, possibly {@code null}
     * @return {@code "." + extension} when the extension is 1 to
     *         {@link #MAX_EXTENSION_LENGTH} ASCII letters or digits, otherwise
     *         {@link #DEFAULT_SUFFIX}
     */
    private static String safeSuffix(String originalName) {
        if (originalName == null) {
            return DEFAULT_SUFFIX;
        }
        int dot = originalName.lastIndexOf('.');
        if (dot < 0 || dot == originalName.length() - 1) {
            return DEFAULT_SUFFIX;
        }
        String extension = originalName.substring(dot + 1);
        if (extension.length() > MAX_EXTENSION_LENGTH) {
            return DEFAULT_SUFFIX;
        }
        for (int i = 0; i < extension.length(); i++) {
            char c = extension.charAt(i);
            boolean asciiAlphanumeric =
                    (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9');
            if (!asciiAlphanumeric) {
                return DEFAULT_SUFFIX;
            }
        }
        return "." + extension;
    }

}

2. After deploying to CentOS, you may see the following problem

 

Guess you like

Origin www.cnblogs.com/ywjfx/p/12757461.html