使用tesseract识别图片(主要用于验证码)

使用 Tesseract 工具前需要设置两个环境变量:Path(加入 Tesseract 的安装目录)和 TESSDATA_PREFIX(值为 Tesseract 的安装目录)。

工具的安装和训练字库:https://www.cnblogs.com/zhongtang/p/5555950.html

将训练的字库合并:https://blog.csdn.net/woaipangruimao/article/details/78740270

设置好字库后,需要在 Java 程序中调用:

1. String temp = "";
  // Usage sketch: fetch a captcha image, clean it with cleanImage(), then run Tesseract OCR on an image file.
  // NOTE(review): `count` is never used in the visible part of this snippet, and the `do {` below
  // is not closed in this excerpt — the loop condition is not shown.
  int count = 0;
  do {
   // Request/Status/Response/URL/Download/Run are project classes not shown here.
   Request request = new Request();
   // Hard-coded session cookie sent with the download request.
   request.Cookie = "ASP.NET_SessionId=1qestr2g3jvzcvq0yg22ymj1";
   Status status = new Status();
   Response rs = new Response();
   String picUrl = "http://www.afgl.gov.cn/user/login.aspx?AuthCode1$codeText=c705888&___clientRandom=0.8445339654723959";
   // NOTE(review): this URL type exposes a public `url` field, so it is not java.net.URL — confirm which class it is.
   URL url = new URL(null);
   url.url = picUrl;
   // Presumably downloads the image into the given directory and returns the saved file name — confirm against Download.
   temp = Download.downloadFile_local(url, request, rs, status,
     "/www/spider.soufun.com/templet/anshan/");
   System.out.println("Path:"+temp);
   // Local path of the downloaded file: base directory + optional Run.date_dir + returned name.
   File file = new File("/www/spider.soufun.com/templet/anshan/"+(Run.date_dir == null ? "" : Run.date_dir)+temp);
   try {
    cleanImage(file, "/www/spider.soufun.com/templet/anshan/");// remove colour noise from the image
   } catch (IOException e1) {
    // TODO Auto-generated catch block
    e1.printStackTrace();
   }
   
   // NOTE(review): OCR runs on this hard-coded file, not on `file` or the image returned by
   // cleanImage() above — confirm this is intentional (looks like a leftover test fixture).
   File imageFile = new File("/www/spider.soufun.com/templet/anshan/35765be6cd12a1ffbf782012f13db3ac.jpg");
   Tesseract test = new Tesseract();
   // NOTE(review): Windows-style path here, Unix-style paths above — adjust to the deployment host.
   test.setDatapath("C:\\Program Files (x86)\\Tesseract-OCR\\tessdata");// location of the trained data (tessdata) directory
   test.setLanguage("font");// name of the trained language data to use
   String result = "";
   try {
    result = test.doOCR(imageFile);
    // Strip spaces from the recognised text before printing it.
    result = result.replace(" ", "");
    System.out.println(result);
   } catch (TesseractException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
   }
   

2. 图片预处理方法 cleanImage(加亮、灰度化、二值化):

public static BufferedImage cleanImage(File sfile, String destDir)
   throws IOException {
  File destF = new File(destDir);
  if (!destF.exists()) {
   destF.mkdirs();
  }

  BufferedImage bufferedImage = ImageIO.read(sfile);
  int h = bufferedImage.getHeight();
  int w = bufferedImage.getWidth();

  // 灰度化
  int[][] gray = new int[w][h];
  for (int x = 0; x < w; x++) {
   for (int y = 0; y < h; y++) {
    int argb = bufferedImage.getRGB(x, y);
    // 图像加亮(调整亮度识别率非常高)
    int r = (int) (((argb >> 16) & 0xFF) * 1.1 + 30);
    int g = (int) (((argb >> 8) & 0xFF) * 1.1 + 30);
    int b = (int) (((argb >> 0) & 0xFF) * 1.1 + 30);
    if (r >= 255) {
     r = 255;
    }
    if (g >= 255) {
     g = 255;
    }
    if (b >= 255) {
     b = 255;
    }
    gray[x][y] = (int) Math
      .pow((Math.pow(r, 2.2) * 0.2973 + Math.pow(g, 2.2)
        * 0.6274 + Math.pow(b, 2.2) * 0.0753), 1 / 2.2);
   }
  }

  // 二值化
  int threshold = ostu(gray, w, h);
  BufferedImage binaryBufferedImage = new BufferedImage(w, h,
    BufferedImage.TYPE_BYTE_BINARY);
  for (int x = 0; x < w; x++) {
   for (int y = 0; y < h; y++) {
    if (gray[x][y] > threshold) {
     gray[x][y] |= 0x00FFFF;
    } else {
     gray[x][y] &= 0xFF0000;
    }
    binaryBufferedImage.setRGB(x, y, gray[x][y]);
   }
  }

  // 矩阵打印
  for (int y = 0; y < h; y++) {
   for (int x = 0; x < w; x++) {
    if (isBlack(binaryBufferedImage.getRGB(x, y))) {
     System.out.print("*");
    } else {
     System.out.print(" ");
    }
   }
   System.out.println();
  }

  ImageIO.write(binaryBufferedImage, "jpg", new File(destDir, sfile.getName().replace("Gif", "jpg")));
  return binaryBufferedImage;
 }

猜你喜欢

转载自www.cnblogs.com/srp750115867/p/9229833.html