python调用百度api表格识别

版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接: https://blog.csdn.net/qq_36810544/article/details/85272602

如果自己实现的话,需要先做一次直线检测,然后划分出一个个的单元格,将单元格图像送入字符识别模块进行识别,最后根据行列写入xls文件。不过既然有现成的api,加上又不是公司的项目,就直接调api了。
申请一个百度开发者账号,新建一个ocr的应用,下载sdk。百度的表格识别api分成同步和异步两种:同步的调用需要提交申请,异步的可以直接使用。每天有50次免费调用额度。

# -*- coding: utf-8 -*-
# -------------------------------------------------------------------------------
# Name:        ocr_online.py
# Purpose:     ocr表格识别
#
# Author:      BQH
#
# Created:     2018-12-06
# Copyright:   (c) Administrator 2018
# Licence:     <your licence>
# -------------------------------------------------------------------------------

import base64
import os
import time

import cv2
import numpy as np
import requests
from aip import AipOcr

# Working directories (hard-coded Windows paths; adjust for your machine):
#   data_dir   - input folder read by image_process()
#   img_dir    - output of image_process(), input of main(); main() deletes
#                each image from here after its result has been downloaded
#   result_dir - where main() saves the downloaded .xls files
data_dir = r'E:\code\ocr_online\data'
img_dir = r'E:\code\ocr_online\img'
result_dir = r'E:\code\ocr_online\result'

# Credentials passed to AipOcr below. The values here are placeholders
# ("your app id" / "your api key" / "your key") and must be replaced with the
# real ones before running.
APP_ID = '你的app id'
API_KEY = '你的 api key'
SECRET_KEY = '你的 key'

# Module-level OCR client shared by main().
client = AipOcr(APP_ID, API_KEY, SECRET_KEY)

def image_process():
    """Pre-process every image in ``data_dir`` and write it to ``img_dir``.

    Each file is loaded as grayscale, binarised with a fixed threshold of
    230, smoothed with a 3x3 Gaussian blur and then sharpened with a 3x3
    kernel. The result is saved under the same file name in ``img_dir``.

    Files that OpenCV cannot decode are reported and skipped instead of
    aborting the whole run.
    """
    # cv2.imwrite does not create missing directories; make sure it exists.
    os.makedirs(img_dir, exist_ok=True)

    # Sharpening kernel (centre weight 5, 4-neighbours -1); it is loop
    # invariant, so build it once.
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]], np.float32)

    for name in os.listdir(data_dir):
        img = cv2.imread(os.path.join(data_dir, name), cv2.IMREAD_GRAYSCALE)
        if img is None:
            # imread returns None for directories and non-image files.
            print('skip (not a readable image): {0}'.format(name))
            continue
        ret, binary = cv2.threshold(img, 230, 255, cv2.THRESH_BINARY)
        binary = cv2.GaussianBlur(binary, (3, 3), 0)
        dst = cv2.filter2D(binary, -1, kernel=kernel)
        cv2.imwrite(os.path.join(img_dir, name), dst)


def get_file_content(filePath):
    """Return the complete contents of ``filePath`` as ``bytes``."""
    with open(filePath, mode='rb') as stream:
        raw_bytes = stream.read()
    return raw_bytes

def file_download(url, file_path, timeout=30):
    """Download ``url`` and save the response body to ``file_path``.

    Args:
        url: Address of the file to fetch.
        file_path: Destination path; it is overwritten if it exists.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx response.
        OSError: If ``file_path`` cannot be written.
    """
    r = requests.get(url, timeout=timeout)
    # Do not save an HTTP error page as if it were the requested file.
    r.raise_for_status()
    with open(file_path, 'wb') as f:
        f.write(r.content)


def _poll_result_url(req_id, max_polls, poll_interval):
    """Wait for an async table-recognition job and return its result URL.

    Returns ``None`` when the service answers without the expected fields
    or the job is still unfinished after ``max_polls`` attempts.
    """
    for _ in range(max_polls):
        # Give the service time to work before (re)asking for the result.
        time.sleep(poll_interval)
        res = client.getTableRecognitionResult(req_id)
        try:
            result = res['result']
            if result['ret_msg'] == '已完成':
                return result['result_data']
        except (KeyError, TypeError):
            # Typically an error payload instead of a result; show it.
            print(res)
            return None
    print('request {0} did not finish after {1} polls'.format(req_id, max_polls))
    return None


def main(max_polls=100, poll_interval=3):
    """Run table recognition on every image in ``img_dir``.

    Each image is submitted to the asynchronous table-recognition API, the
    job is polled until it reports completion, the resulting spreadsheet is
    downloaded to ``result_dir`` as ``<image name>.xls`` and the image is
    then removed from ``img_dir``.

    Args:
        max_polls: Maximum number of status queries per image, so a job that
            never completes cannot block the run forever.
        poll_interval: Seconds to sleep before each status query.

    If submitting an image fails, the response is printed and processing
    stops. A job that fails later (bad poll response, timeout, download
    error) is reported, its image is kept, and the run continues with the
    next image.
    """
    os.makedirs(result_dir, exist_ok=True)
    num = 0
    for name in os.listdir(img_dir):
        img_path = os.path.join(img_dir, name)
        if not os.path.isfile(img_path):
            # Sub-directories cannot be read as an image.
            continue

        image = get_file_content(img_path)
        res = client.tableRecognitionAsync(image)
        try:
            req_id = res['result'][0]['request_id']
        except (KeyError, IndexError, TypeError):
            # Submission was rejected; show the response and stop.
            print(res)
            break

        url = _poll_result_url(req_id, max_polls, poll_interval)
        if url is None:
            continue

        # splitext strips only the final extension, so 'a.b.png' becomes
        # 'a.b.xls' rather than colliding with other 'a.*' files.
        xls_name = os.path.splitext(name)[0] + '.xls'
        try:
            file_download(url, os.path.join(result_dir, xls_name))
        except (requests.RequestException, OSError) as e:
            print('download failed for {0}: {1}'.format(name, e))
            continue

        num = num + 1
        print('{0}: {1} 完成!'.format(num, xls_name))
        # Remove the image only after its result is safely on disk.
        os.remove(img_path)


if __name__ == '__main__':
    main()

猜你喜欢

转载自blog.csdn.net/qq_36810544/article/details/85272602