视频字幕识别（百度AI开放平台OCR | python | opencv）

目标：提取位于视频下方的字幕

机缘：某些要写报告的学习视频太长了orz，弄字幕来参考一下

难点：

调参（不同视频字幕对应参数会不同，但调整不大）
图片相似度比较（哈希算法，有更好的算法但我没用心找）

基本策略：

确定字幕位置
比较截取的两帧字幕是否相同，若相同，则抛弃其中一帧
由于调用的是百度平台，有ocr次数限制（1000次/月），所以将某段时间内字幕合成一张图片后再文字识别

具体代码如下：

1. 定义调用百度开放平台OCR的函数

百度官方的ocr可以选择返回带位置和不带位置的文字识别，所以这里定义两个函数：location_ocr 返回字幕所在的位置（上下边界），baidu_ocr 把识别出的文字追加写入txt文件。

没有用过百度ocr的朋友需要先去申请API Key和Secret Key（填入下面代码中对应的位置）；另外飞桨平台上也有很多关于字幕识别的项目可以参考。

# -*- coding: utf-8 -*-

import math
import os
from cv2 import cv2
import requests
import base64


# 确定字幕位置
def location_ocr(img):
    '''
    Return the vertical extent of the last text line found in an image.

    An access token is requested first, then the image is posted to the
    position-aware ``accurate`` OCR endpoint. The token request needs:

    grant_type: required, fixed to client_credentials;
    client_id: required, the application's API Key;
    client_secret: required, the application's Secret Key;

    Args:
        img: bytes-like encoded image (it is passed to base64.b64encode).

    Returns:
        (top, bottom): the bounding box of the last recognised line, padded
        by 20 px above and 10 px below. ``top`` is never negative.

    Raises:
        IndexError: the OCR response contains no text lines.
        KeyError: the response lacks the expected fields.
    '''
    host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=' \
           + 'API Key' + '&client_secret=' + 'Secret Key'
    # requests waits indefinitely unless a timeout is given.
    request_timeout = 30

    # Fetch the access token.
    token_headers = {
        'Content-Type': 'application/json;charset=UTF-8'
    }
    token = requests.get(url=host, headers=token_headers,
                         timeout=request_timeout).json()

    # Post the image together with the token to run the recognition.
    url = 'https://aip.baidubce.com/rest/2.0/ocr/v1/accurate'
    data = {
        'access_token': token['access_token'],
        'image': base64.b64encode(img),
    }
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    print('文本写入中')
    res = requests.post(url=url, headers=headers, data=data,
                        timeout=request_timeout)
    result = res.json()

    lines = result['words_result']
    if not lines:
        raise IndexError('OCR found no text in the image')

    # Per the original author's note, lines come back top to bottom, so the
    # last entry is the lowest text on screen.
    location = lines[-1]['location']
    top = location['top']
    bottom = top + location['height']
    # Clamp the padded top: a negative row would slice from the bottom of
    # the frame instead of the top.
    return max(top - 20, 0), bottom + 10


def baidu_ocr(img):
    '''
    Recognise the text in an image and append it to a text file.

    An access token is requested first, then the image is posted to the
    ``accurate_basic`` OCR endpoint. The token request needs:

    grant_type: required, fixed to client_credentials;
    client_id: required, the application's API Key;
    client_secret: required, the application's Secret Key;

    Args:
        img: bytes-like encoded image (it is passed to base64.b64encode).

    Returns:
        None. Each recognised line is appended to the output file, one line
        of text per row, and the number of lines is printed.

    Raises:
        KeyError: the response lacks the expected fields.
    '''
    host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=' \
           + 'API Key' + '&client_secret=' + 'Secret Key'
    # requests waits indefinitely unless a timeout is given.
    request_timeout = 30

    # Fetch the access token.
    token_headers = {
        'Content-Type': 'application/json;charset=UTF-8'
    }
    token = requests.get(url=host, headers=token_headers,
                         timeout=request_timeout).json()

    # Post the image together with the token to run the recognition.
    url = 'https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic'
    data = {
        'access_token': token['access_token'],
        'image': base64.b64encode(img),
    }
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    print('文本写入中')
    res = requests.post(url=url, headers=headers, data=data,
                        timeout=request_timeout)
    result = res.json()

    words_result = result['words_result']
    print(len(words_result))
    # The context manager closes the file even if a write fails. UTF-8 is
    # forced so that recognised characters outside the platform's default
    # code page cannot abort the write.
    with open(r"保存txt的位置", 'a+', encoding='utf-8') as f:
        for entry in words_result:
            f.write(entry['words'] + "\n")

2. 定义比较图像相似度的哈希算法

def similarity_hash(title, a, b):
    """Compare two images with one of the cv2.img_hash algorithms.

    Args:
        title: algorithm name; one of "AverageHash", "PHash",
            "MarrHildrethHash", "RadialVarianceHash", "BlockMeanHash",
            "ColorMomentHash".
        a: first image, passed to the hasher's ``compute``.
        b: second image, passed to the hasher's ``compute``.

    Returns:
        The value returned by the hasher's ``compare`` for the two hashes.

    Raises:
        ValueError: ``title`` is not one of the supported names.
    """
    supported = (
        "AverageHash",
        "PHash",
        "MarrHildrethHash",
        "RadialVarianceHash",
        "BlockMeanHash",
        "ColorMomentHash",
    )
    # Validate first so an unknown name fails with a clear message instead
    # of an unbound-variable error further down.
    if title not in supported:
        raise ValueError('unknown hash algorithm %r; expected one of: %s'
                         % (title, ', '.join(supported)))

    # Each algorithm is created by a factory named "<title>_create".
    hashFun = getattr(cv2.img_hash, title + '_create')()

    hash_a = hashFun.compute(a)
    hash_b = hashFun.compute(b)
    return hashFun.compare(hash_a, hash_b)

3. 再定义一些有用的函数

def show_img(img):
    """Show ``img`` in a window named 'show_img' and wait for a key press."""
    window_name = 'show_img'
    cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)
    cv2.imshow(window_name, img)
    # waitKey() is called without a delay, then every window is closed.
    cv2.waitKey()
    cv2.destroyAllWindows()


def video_time(video_filename, frames_per_image=1600):
    """Compute how many stitched images are needed to cover a video.

    Args:
        video_filename: path of the video to be split.
        frames_per_image: number of video frames covered by one stitched
            image. The default 1600 is 80 crops per image times a sampling
            step of 20 frames.

    Returns:
        times: total frame count divided by ``frames_per_image``, rounded up.
    """
    # Open the file named by the argument instead of relying on a
    # module-level capture object, and release it once the count is read.
    capture = cv2.VideoCapture(video_filename)
    try:
        total_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
    finally:
        capture.release()
    times = math.ceil(total_frames / frames_per_image)
    return times

4. 开始处理视频

def subtitle_cut(videoCap, i, top, bottom, threshold=220):
    """Crop the subtitle band out of frame ``i`` and binarise it.

    Args:
        videoCap: capture object providing ``set`` and ``read``.
        i: index of the frame to fetch.
        top: first row of the subtitle band.
        bottom: row just past the end of the subtitle band.
        threshold: grey level used for the inverse binary threshold;
            defaults to 220.

    Returns:
        The thresholded grey-scale image of rows ``top:bottom``.

    Raises:
        ValueError: the frame could not be read (for example, ``i`` lies
            outside the video).
    """
    videoCap.set(cv2.CAP_PROP_POS_FRAMES, i)  # seek to the requested frame
    ok, frame = videoCap.read()  # read() returns a success flag and a frame
    if not ok or frame is None:
        raise ValueError('cannot read frame %r from the video' % (i,))

    subtitle_area = frame[top:bottom, :]
    # Threshold the grey-scale band into a two-level image.
    subtitle = cv2.cvtColor(subtitle_area, cv2.COLOR_BGR2GRAY)
    retVal, bw_img = cv2.threshold(subtitle, threshold, 255,
                                   cv2.THRESH_BINARY_INV)
    return bw_img


video_file = '视频位置'
videoCap = cv2.VideoCapture(video_file)
if not videoCap.isOpened():
    raise SystemExit('cannot open video: %s' % video_file)

# Grab a few sample frames and ask the OCR where the subtitle line sits.
for i in range(10):
    videoCap.set(cv2.CAP_PROP_POS_FRAMES, 3000+40*i)  # arbitrary frames; any frame with text will do
    TorF, frame = videoCap.read()  # read() returns a success flag and a frame
    if not TorF:
        continue  # frame not available (e.g. the video is shorter), try the next one
    r = frame.shape[0]
    show_img(frame)
    frame_en = cv2.imencode('.jpg', frame)[1]
    try:
        top, bottom = location_ocr(frame_en)
    except IndexError:
        continue  # no text recognised in this frame, try the next one
    if top > 0.8*r:  # a text box below 0.8 of the frame height is taken to be the subtitle
        show_img(frame[top:bottom, :])
        break
else:
    # Without this, a box from a non-subtitle frame (or no box at all)
    # would be used silently for the rest of the script.
    raise SystemExit('could not locate the subtitle area in the sampled frames')

# Cut the subtitle band out of the sampled frames.
times = video_time(video_file)
print(times)
os.chdir(r'E:\a_ocr\subtitles cutting\pics')

FRAME_STEP = 20        # sample one frame out of every 20
CROPS_PER_IMAGE = 80   # sampled frames examined per stitched image
total_frames = int(videoCap.get(cv2.CAP_PROP_FRAME_COUNT))

# Loop: up to 80 crops are stitched into one image, which keeps the number
# of OCR calls low (the author also notes it improves recognition).
# Segment k covers frames [1600*k, 1600*(k+1)), so the last valid segment is
# times-1. The first segments are skipped because the start of a video
# usually has no subtitles.
for pic_list in range(5, times):
    first_frame = CROPS_PER_IMAGE * FRAME_STEP * pic_list
    # Never seek past the end of the video in the final, partial segment.
    last_frame = min(first_frame + CROPS_PER_IMAGE * FRAME_STEP, total_frames)

    np_pic_list = []
    # Keep the previous crop around instead of seeking to and decoding it
    # a second time on every iteration.
    previous = subtitle_cut(videoCap, first_frame - FRAME_STEP, top, bottom)
    for i in range(first_frame, last_frame, FRAME_STEP):
        current = subtitle_cut(videoCap, i, top, bottom)
        # Compare frame i with the sample before it; keep frame i when the
        # subtitle has changed.
        is_change_word = similarity_hash("PHash", current, previous)
        print(is_change_word)
        if is_change_word > 20:  # cut-off between "same" and "different" subtitle
            np_pic_list.append(current)
        previous = current

    if not np_pic_list:
        continue  # vconcat cannot stitch an empty list; nothing to recognise

    # Stack the crops vertically into a single image and recognise it.
    img_splice = cv2.vconcat(np_pic_list)
    image = cv2.imencode('.jpg', img_splice)[1]
    show_img(img_splice)
    baidu_ocr(image)

刚学opencv没多久，写的代码怪乱的，看的朋友见谅

如果传入baidu_ocr时报错，提示图片格式不对，可以尝试 image = cv2.imencode('.jpg', image)[1]（imencode 返回的是一个元组，第二项才是编码后的数据），将image编码为一维ndarray后再传入。

图片预处理还可以用自适应阈值，形态学处理，高斯滤波等等，对有颜色的字幕可以转为HSV颜色域再处理。

视频字幕识别（百度AI开放平台OCR | python | opencv）

猜你喜欢