python-增值税发票识别

程序功能：调用百度的OCR接口，对指定目录下的所有发票(jpg、png、pdf)进行识别，最后将识别结果保存至excel中
相关资源：百度发票识别
代码比较简单：access_token 是百度发票识别 API 的访问令牌，fp_path 是发票存放的目录。填好这两项后直接运行程序即可，程序最终会在当前工作目录下生成名为“增值税发票.xls”的 Excel 文件。
# encoding:utf-8
import time
import requests
import base64
import os
import xlwt

# Keys looked up in the OCR response's ``words_result`` mapping, in the same
# order as the spreadsheet columns.  ``AmountInFiguers`` is spelled exactly as
# the rest of the script uses it as a key; do not "correct" it here.
target_fields = [
    'InvoiceCode',
    'InvoiceNum',
    'InvoiceType',
    'InvoiceDate',
    'SellerName',
    'SellerRegisterNum',
    'PurchaserName',
    'TotalAmount',
    'TotalTax',
    'AmountInFiguers',
    'ServiceType',
]
'''
增值税发票识别
'''
# Endpoint of the VAT-invoice OCR service used by both recognisers below.
VAT_INVOICE_URL = "https://aip.baidubce.com/rest/2.0/ocr/v1/vat_invoice"

# Replace this with your own access_token for the invoice-recognition API.
ACCESS_TOKEN = ''

# The one error_code for which the recognisers return False rather than an
# empty dict.  NOTE(review): presumably "could not recognise the target" --
# confirm against the service's error-code table.
_UNRECOGNIZED_ERROR_CODE = 282103

# Seconds to wait for the OCR service before giving up on a single file, so
# one stalled request cannot hang the whole batch.
_REQUEST_TIMEOUT = 30


def _recognize_invoice(pic, extra_params=None):
    """Send one invoice file to the OCR service and extract ``target_fields``.

    Args:
        pic: Path of the invoice file.  The suffix (matched case-insensitively)
            decides the upload parameter: ``jpg``/``png`` -> ``image``,
            ``pdf`` -> ``pdf_file``.
        extra_params: Optional extra form fields merged into the request
            (e.g. ``{"type": "roll"}``).

    Returns:
        * ``False`` when the suffix is unsupported or the service answers with
          error_code 282103;
        * an empty dict when the file cannot be read, the request fails, the
          HTTP status is not OK, or the service reports any other error_code;
        * otherwise a dict with one entry per name in ``target_fields``
          (missing fields become ``''``).

        Both ``False`` and ``{}`` are falsy, so callers can simply test the
        result for truthiness.
    """
    data = {}

    # Check the suffix first so unsupported files are never read or encoded.
    lowered = pic.lower()
    if lowered.endswith(('jpg', 'png')):
        file_param = 'image'
    elif lowered.endswith('pdf'):
        file_param = 'pdf_file'
    else:
        print('文件格式有误')
        return False

    # Read the file in binary mode; ``with`` guarantees the handle is closed.
    try:
        with open(pic, 'rb') as f:
            encoded = base64.b64encode(f.read()).decode("utf8")
    except OSError as e:
        print(e)
        return data

    params = {file_param: encoded}
    if extra_params:
        params.update(extra_params)
    headers = {'content-type': 'application/x-www-form-urlencoded'}
    request_url = VAT_INVOICE_URL + "?access_token=" + ACCESS_TOKEN

    try:
        response = requests.post(
            request_url,
            data=params,
            headers=headers,
            timeout=_REQUEST_TIMEOUT,
        )
        # A requests.Response is falsy when the HTTP status is an error.
        if not response:
            return data
        result = response.json()
    except (requests.RequestException, ValueError) as e:
        # Network failure, timeout, or a body that is not valid JSON.
        print(e)
        return data

    print(result)
    if not isinstance(result, dict):
        return data

    if 'error_code' in result:
        if result['error_code'] == _UNRECOGNIZED_ERROR_CODE:
            return False
        # Any other service error: report "nothing extracted".
        return data

    words = result.get('words_result')
    if not isinstance(words, dict):
        words = {}
    for field in target_fields:
        data[field] = words.get(field, '')
    return data


# Recognise an invoice with the service's default settings
def get_normal_context(pic):
    """Recognise ``pic`` without passing a ``type`` parameter.

    Returns a dict of the ``target_fields`` on success, or a falsy value
    (``False`` or ``{}``) on failure; see ``_recognize_invoice`` for details.
    """
    return _recognize_invoice(pic)


def get_roll_context(pic):
    """Recognise ``pic`` with the request parameter ``type`` set to ``'roll'``.

    Returns a dict of the ``target_fields`` on success, or a falsy value
    (``False`` or ``{}``) on failure; see ``_recognize_invoice`` for details.
    """
    return _recognize_invoice(pic, {"type": 'roll'})

# Build the list of invoice file paths found directly inside a directory
def pics(path):
    """Return ``path + '/' + name`` for every entry of ``path`` that ends in
    ``jpg``, ``png`` or ``pdf`` (case-sensitive), in ``os.listdir`` order.

    Prints a progress message before and after the scan.
    """
    print('正在生成图片路径')
    wanted_suffixes = ('jpg', 'png', 'pdf')
    found = [
        path + '/' + name
        for name in os.listdir(path)
        if name.endswith(wanted_suffixes)
    ]
    print('图片路径生成成功！')
    return found

# Recognise every file in turn and collect the resulting dictionaries in a list
def datas(pics):
    """Run invoice recognition over ``pics`` and return the successful results.

    Each path is first passed to ``get_normal_context``; if that yields a
    falsy result, ``get_roll_context`` is tried.  The first truthy result is
    appended to the returned list.  A path for which both attempts fail is
    reported on stdout and skipped.

    Args:
        pics: Iterable of invoice file paths.

    Returns:
        List of the truthy results, in input order.
    """
    results = []
    for p in pics:
        data = get_normal_context(p) or get_roll_context(p)
        if data:
            results.append(data)
            continue
        # The f-string is kept on one line: a replacement field spanning
        # several physical lines is a SyntaxError before Python 3.12.
        print(f'{p} 该发票不能被识别')
        # NOTE(review): as in the original, this pause only runs after a file
        # failed both attempts; successful files are processed back to back.
        # Confirm whether it was meant to throttle every request.
        time.sleep(0.5)
    return results

# Write the recognised invoice data to an Excel (.xls) workbook
def save(datas, filename='增值税发票.xls'):
    """Write one header row plus one row per record to an ``.xls`` workbook.

    Args:
        datas: Sequence of dicts keyed by the invoice field names listed in
            ``columns`` below.  A key missing from a record is written as an
            empty cell instead of raising ``KeyError``.
        filename: Path of the workbook to create.  Defaults to the name the
            script has always used, so existing callers are unaffected.
    """
    # (column title, record key) pairs, in sheet column order.
    columns = [
        ('发票代码', 'InvoiceCode'),
        ('发票号码', 'InvoiceNum'),
        ('发票类型', 'InvoiceType'),
        ('开票日期', 'InvoiceDate'),
        ('卖方名称', 'SellerName'),
        ('卖方纳税人识别号', 'SellerRegisterNum'),
        ('购买方名称', 'PurchaserName'),
        ('合计金额', 'TotalAmount'),
        ('合计税额', 'TotalTax'),
        ('价税合计', 'AmountInFiguers'),
        ('消费类型', 'ServiceType'),
    ]

    print('正在写入数据！')
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('增值税发票内容登记', cell_overwrite_ok=True)

    for col, (title, _key) in enumerate(columns):
        sheet.write(0, col, title)

    # Each cell is written exactly once; the original rewrote every row once
    # per column because of a redundant inner loop.
    for row, record in enumerate(datas, start=1):
        for col, (_title, key) in enumerate(columns):
            sheet.write(row, col, record.get(key, ''))

    # Report success only after the workbook has actually been saved.
    book.save(filename)
    print('数据写入成功！')

def main():
    """Run the whole pipeline: list invoice files, recognise them, save results.

    ``fp_path`` is a placeholder and must be edited to point at the directory
    that holds the invoices before the script is run.
    """
    print('开始执行！！！')
    # Directory where your invoices are stored; change it to your own location.
    fp_path = r''
    save(datas(pics(fp_path)))
    print('执行结束！')


# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
python-增值税发票识别

猜你喜欢