CAPTCHA Recognition with CRNN

This post shares a small project I have been working on recently. These days, almost any login screen, whether in a mobile app, on a web page, or in a desktop client, asks the user to enter a CAPTCHA to verify their identity. So there are times when we need to recognize CAPTCHAs automatically: for example, when automating a business workflow, the login step at the very beginning may involve CAPTCHA recognition, and deploying the model behind an API is enough to provide this capability.

This project trains a CAPTCHA recognition model based on CRNN, a classic text recognition algorithm. The whole pipeline is built with PaddlePaddle; a PyTorch version of the model code is included at the end. To run the code directly you need to add the required `import` statements yourself; a plausible consolidated import block is sketched below.
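
As a convenience, here is a minimal import block that should cover the PaddlePaddle snippets in the sections below (the exact set depends on which snippets you run; the PyTorch section at the end lists its own imports):

import os
import math
import random
import codecs

import cv2
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import Levenshtein

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import paddle.optimizer as optim
from paddle import ParamAttr
from paddle.io import Dataset, DataLoader, BatchSampler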

1. Data Preparation

1.1 Preparing the dataset

This project is really just a simple demo: it is trained on a fairly basic dataset of uppercase letters plus digits, so the character variety is limited. For real scenarios you can look for a dataset that better fits your needs. Rather than uploading my own dataset, I'll share the link to the dataset I used and walk through roughly how the data was prepared.

1.2 Preparing the label file

Next we prepare the label file. In my case each image file name is itself the label value, so generating labels is straightforward. This is just one simple way of preparing the data; annotate and build the label file according to your own dataset.

# Generate the master label file
train_path = "pic"
SUM = []
for root, dirs, files in os.walk(train_path):  # root directory, sub-directories, files
    for file in files:
        imgpath = os.path.join(root, file)
        # The file name (without extension) is the label
        SUM.append(imgpath + "\t" + file.split(".")[0] + "\n")
# Write the master label file once, after the walk finishes
allstr = ''.join(SUM)
with open('total_list.txt', 'w', encoding='utf-8') as f:
    f.write(allstr)
print("Dataset size: {}".format(len(SUM)))

With the master label file in place, we can split it into training and validation sets; the split ratio is up to you (here 80/20).

random.shuffle(SUM)
train_len = int(len(SUM) * 0.8)
train_list = SUM[:train_len]
test_list = SUM[train_len:]
print('Training set size: {}, validation set size: {}'.format(len(train_list), len(test_list)))
# Write the training label file
train_txt = ''.join(train_list)
f_train = open('train_list.txt', 'w', encoding='utf-8')
f_train.write(train_txt)
f_train.close()
# Write the validation label file
test_txt = ''.join(test_list)
f_test = open('test_list.txt', 'w', encoding='utf-8')
f_test.write(test_txt)
f_test.close()

1.3 Preparing the character dictionary

In OCR text recognition, one file that needs special attention is the character dictionary. The recognition output is restricted to the character set contained in this file: only characters listed in the dictionary can appear in the final result, and anything missing from it will never be output. For this project the dictionary should therefore contain all uppercase letters plus the digits, i.e. 36 characters in total.

# Build the character dictionary
class_set = set()
lines = []
# Scan the dataset's master label file for the characters it contains
with open("total_list.txt", "r", encoding="utf-8") as file:
    for i in file:
        a = i.strip('\n').split('\t')[-1]
        lines.append(a)
for line in lines:
    for e in line:
        class_set.add(e)
class_list = list(class_set)
class_list.sort()
print("class num: {0}".format(len(class_list)))
with codecs.open("new_dict.txt", "w", encoding='utf-8') as label_list:
    for id, c in enumerate(class_list):
        label_list.write("{0}\n".format(c))

1.4 Visualizing a sample

img = Image.open('9APK.png')
img = np.array(img)

# Display the loaded image
plt.figure(figsize=(10, 10))
plt.imshow(img)


2. Data Preprocessing

Before feeding data into the model, it needs to be preprocessed so that images and labels meet the requirements of training and inference. The following steps are implemented here:

  • Image decoding: convert the image into a NumPy array;
  • Label encoding: encode the label as required by the CTC (Connectionist Temporal Classification) algorithm. Each character in the string is replaced by its index in the character dictionary, and a maximum label length max_text_len is fixed; if the label has fewer than max_text_len characters, the remaining positions are padded with 0. For example, with max_text_len=10, label "2322", and character dictionary [0,1,2,3,4,5,6,7,8,9], the encoded label is [2,3,2,2,0,0,0,0,0,0];
  • Image resizing and normalization: scale the image to a fixed height of 32, normalize it, and paste it onto a blank canvas of shape [3, 32, 100];
  • Returning image, label, and length: pull the data out of the dict and return it as a list whose elements are ordered image, label, length.

Image decoding

class DecodeImage(object):
    # Decode an image from raw bytes
    def __init__(self, img_mode='BGR', channel_first=False):
        self.img_mode = img_mode
        self.channel_first = channel_first

    def __call__(self, data):
        # Decode the image and return the result
        img = data['image']
        img = np.frombuffer(img, dtype='uint8')
        img = cv2.imdecode(img, 1)
        if img is None:
            return None
        if self.img_mode == 'GRAY':
            # imdecode with flag 1 always yields a 3-channel BGR image, so convert BGR -> GRAY
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        elif self.img_mode == 'RGB':
            assert img.shape[2] == 3, 'invalid shape of image[%s]' % (img.shape)
            img = img[:, :, ::-1]

        if self.channel_first:
            img = img.transpose((2, 0, 1))

        data['image'] = img
        return data

Label encoding

Convert character labels into index form; if the label has fewer than max_text_len characters, pad the tail with zeros.

def encode(text, max_text_len, dict_index):
    # Convert a character label into the corresponding index values
    # Return None if the label is empty or longer than the limit
    if len(text) == 0 or len(text) > max_text_len:
        return None
    # Collect the index of each character into text_list
    text_list = []
    for char in text:
        # Skip characters that do not appear in the dictionary
        if char not in dict_index:
            continue
        text_list.append(dict_index[char])
    if len(text_list) == 0:
        return None
    return text_list

class CTCLabelEncode(object):
    # Encode labels in CTC format
    def __init__(self, max_text_length=25, character_dict_path='new_dict.txt'):
        self.max_text_length = max_text_length
        character_str = ""
        # Read the character dictionary
        with open(character_dict_path, "rb") as fin:
            lines = fin.readlines()
            for line in lines:
                line = line.decode('utf-8').strip("\n").strip("\r\n")
                character_str += line
        dict_character = list(character_str)
        # Add the extra 'blank' class used by CTC
        dict_character = ['blank'] + dict_character
        # Map each class to its index
        self.dict_index = {}
        for i, char in enumerate(dict_character):
            self.dict_index[char] = i

    def __call__(self, data):
        # Get the label of the sample
        text = data['label']
        # Convert the label to indices
        text = encode(text, self.max_text_length, self.dict_index)
        if text is None:
            return None
        data['length'] = np.array(len(text))
        text = text + [0] * (self.max_text_length - len(text))
        data['label'] = np.array(text)
        return data
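
As a quick sanity check, here is a hypothetical usage of CTCLabelEncode, assuming the 36-character dictionary from section 1.3 (the exact indices depend on your new_dict.txt):

encoder = CTCLabelEncode(max_text_length=25, character_dict_path='new_dict.txt')
sample = encoder({'label': '9APK'})
# With ['blank'] + '0'-'9' + 'A'-'Z': '9' -> 10, 'A' -> 11, 'P' -> 26, 'K' -> 21
print(sample['label'][:6], sample['length'])   # [10 11 26 21  0  0] 4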

Image resizing and normalization

class RecResizeImg(object):
    def __init__(self, image_shape=[3, 32, 100]):
        self.image_shape = image_shape

    def __call__(self, data):
        img = data['image']
        norm_img = self.resize_norm_img(img, self.image_shape)
        data['image'] = norm_img
        return data

    def resize_norm_img(self, img, image_shape):
        # Resize the image and normalize it
        imgC, imgH, imgW = image_shape
        h = img.shape[0]
        w = img.shape[1]
        ratio = w / float(h)
        # If the width after scaling to height 32 would exceed 100, clip it to 100
        if math.ceil(imgH * ratio) > imgW:
            resized_w = imgW
        # Otherwise round the scaled width up to an integer
        else:
            resized_w = int(math.ceil(imgH * ratio))
        resized_image = cv2.resize(img, (resized_w, imgH))
        # Normalize the image to [-1, 1]
        resized_image = resized_image.astype('float32')
        resized_image = resized_image.transpose((2, 0, 1)) / 255
        resized_image -= 0.5
        resized_image /= 0.5
        # Create a blank [3, 32, 100] canvas, paste the resized image onto it, pad the rest with 0
        padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
        padding_im[:, :, 0:resized_w] = resized_image
        return padding_im
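
For instance, a 40×120 image is scaled to 32×96 and then padded to the fixed 3×32×100 shape; a minimal check:

resizer = RecResizeImg()
dummy = resizer({'image': np.zeros((40, 120, 3), dtype=np.uint8)})
print(dummy['image'].shape)   # (3, 32, 100)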

Returning image, label, and length

class KeepKeys(object):
    # Convert the dict-format data into a list
    def __init__(self, keep_keys=['image', 'label', 'length']):
        self.keep_keys = keep_keys

    def __call__(self, data):
        data_list = []
        for key in self.keep_keys:
            data_list.append(data[key])
        return data_list

Putting the methods together

# Combined image preprocessing pipeline
def transform(data, mode='train'):
    # Image decoding
    decode_image = DecodeImage()
    # Label encoding
    encode_label = CTCLabelEncode()
    # Image resizing and normalization
    resize_image = RecResizeImg()
    data = decode_image(data)
    if mode == 'train' or mode == 'val':
        data = encode_label(data)
        keep_keys = ['image', 'label', 'length']
    else:
        keep_keys = ['image']
    # Return image, label, and length
    keepkeys = KeepKeys(keep_keys=keep_keys)
    data = resize_image(data)
    data = keepkeys(data)
    return data
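
Putting it all together on the sample image from section 1.4 (assuming 9APK.png and new_dict.txt are in the working directory):

with open('9APK.png', 'rb') as f:
    raw = f.read()
image, label, length = transform({'image': raw, 'label': '9APK'}, mode='train')
print(image.shape, label.shape, length)   # (3, 32, 100) (25,) 4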

Next we define the dataset class SimpleDataSet, which reads the data in batches and applies the preprocessing. The code is as follows:

class SimpleDataSet(Dataset):
    def __init__(self, mode, label_file, data_dir, seed=None):
        super(SimpleDataSet, self).__init__()
        self.mode = mode.lower()
        # In the label file, '\t' separates the image name from the label
        self.delimiter = '\t'
        # Dataset root directory
        self.data_dir = data_dir
        # Random seed
        self.seed = seed
        # Read all samples into a list
        self.data_lines = self.get_image_info_list(label_file)
        # Build a list of data indices
        self.data_idx_order_list = list(range(len(self.data_lines)))
        # Shuffle the data when training
        if self.mode == "train":
            self.shuffle_data_random()

    def get_image_info_list(self, label_file):
        # Read all entries from the label file
        with open(label_file, "rb") as f:
            lines = f.readlines()
        return lines

    def shuffle_data_random(self):
        # Randomly shuffle the data
        random.seed(self.seed)
        random.shuffle(self.data_lines)
        return

    def __getitem__(self, idx):
        # Fetch the sample at index idx
        file_idx = self.data_idx_order_list[idx]
        data_line = self.data_lines[file_idx]
        try:
            # Parse the image name and label
            data_line = data_line.decode('utf-8')
            substr = data_line.strip("\n").split(self.delimiter)
            file_name = substr[0]
            label = substr[1]
            # Build the image path
            img_path = os.path.join(self.data_dir, file_name)
            data = {'img_path': img_path, 'label': label}
            if not os.path.exists(img_path):
                raise Exception("{} does not exist!".format(img_path))
            # Read the image and run the preprocessing
            with open(data['img_path'], 'rb') as f:
                img = f.read()
                data['image'] = img
            outs = transform(data, mode=self.mode.lower())
        # If this sample fails to load, fall back to another one
        except Exception as e:
            outs = None
        if outs is None:
            rnd_idx = np.random.randint(self.__len__()) if self.mode == "train" else (idx + 1) % self.__len__()
            return self.__getitem__(rnd_idx)
        return outs

    def __len__(self):
        # Return the size of the dataset
        return len(self.data_idx_order_list)

def build_dataloader(mode, label_file, data_dir, batch_size, drop_last, shuffle, num_workers, seed=None):
    # Create the dataset
    dataset = SimpleDataSet(mode, label_file, data_dir, seed)
    # Define the batch_sampler
    batch_sampler = BatchSampler(dataset=dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)
    # Create the reader with paddle.io.DataLoader, setting batch size, num_workers, etc.
    data_loader = DataLoader(dataset=dataset, batch_sampler=batch_sampler, num_workers=num_workers, return_list=True, use_shared_memory=False)

    return data_loader

# Training data loader
train_dataloader = build_dataloader('Train', 'train_list.txt', 'Verification_code', batch_size=256, drop_last=True, shuffle=True, num_workers=8)
# Validation data loader
val_dataloader = build_dataloader('Val', 'test_list.txt', 'Verification_code', batch_size=256, drop_last=False, shuffle=False, num_workers=4)
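
A quick way to verify the loaders (shapes assume batch_size=256 and max_text_length=25):

for images, labels, lengths in train_dataloader:
    print(images.shape, labels.shape, lengths.shape)   # [256, 3, 32, 100] [256, 25] [256]
    break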

3. Post-processing

Because the labels were CTC-encoded during preprocessing, the network's output is also in the CTC-encoded form. After obtaining the predictions we therefore still need to decode them to get the final text.

class CTCLabelDecode(object):
    def __init__(self, character_dict_path=None):
        self.character_str = ""
        # Read the character dictionary into a list
        with open(character_dict_path, "rb") as fin:
            lines = fin.readlines()
            for line in lines:
                line = line.decode('utf-8').strip("\n").strip("\r\n")
                self.character_str += line
        dict_character = list(self.character_str)
        # Add the extra 'blank' class used by CTC
        dict_character = self.add_special_char(dict_character)
        # Map each class to its index
        self.dict_index = {}
        for i, char in enumerate(dict_character):
            self.dict_index[char] = i
        self.character = dict_character

    def __call__(self, preds, label=None):
        if isinstance(preds, paddle.Tensor):
            preds = preds.numpy()
        # Get the predicted indices and their probabilities
        preds_idx = preds.argmax(axis=2)
        preds_prob = preds.max(axis=2)
        # Decode the predicted labels
        text = self.decode(preds_idx, preds_prob, is_remove_duplicate=True)
        if label is None:
            return text
        # Decode the ground-truth labels
        label = self.decode(label)
        return text, label

    def add_special_char(self, dict_character):
        # Add the 'blank' class
        dict_character = ['blank'] + dict_character
        return dict_character

    def decode(self, text_index, text_prob=None, is_remove_duplicate=False):
        result_list = []
        batch_size = len(text_index)
        for batch_idx in range(batch_size):
            char_list = []
            conf_list = []
            for idx in range(len(text_index[batch_idx])):
                # Skip index 0, i.e. the 'blank' character
                if text_index[batch_idx][idx] == 0:
                    continue
                # Decode the predicted label
                if is_remove_duplicate:
                    # Skip a character that repeats the previous one
                    if idx > 0 and text_index[batch_idx][idx - 1] == text_index[batch_idx][idx]:
                        continue
                # Keep the character for this index
                char_list.append(self.character[int(text_index[batch_idx][idx])])
                # For predictions, keep the predicted confidence
                if text_prob is not None:
                    conf_list.append(text_prob[batch_idx][idx])
                # For ground-truth labels, use confidence 1
                else:
                    conf_list.append(1)
            # Join the characters; the confidence is the mean over all kept characters
            text = ''.join(char_list)
            result_list.append((text, np.mean(conf_list)))
        return result_list

# Instantiate the post-processing
post_process_class = CTCLabelDecode('new_dict.txt')
# Number of classes
char_num = len(getattr(post_process_class, 'character'))
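
To see the decoding rules (drop blanks, merge repeats) in action, here is a hypothetical one-hot prediction:

# Batch of 1, sequence length 4: indices 10, 10 (repeat), 0 (blank), 11
dummy = np.zeros((1, 4, char_num), dtype='float32')
for t, idx in enumerate([10, 10, 0, 11]):
    dummy[0, t, idx] = 1.0
print(post_process_class(paddle.to_tensor(dummy)))   # [('9A', 1.0)]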

4. Model Definition

Traditional text recognition methods first segment individual characters and then recognize them one by one. This project instead uses CRNN, a classic algorithm for image-based text recognition. CRNN was proposed in 2015 and is still widely used today. Its key idea is to treat text recognition as sequence prediction, so it adopts an RNN, the standard tool for predicting sequences. The algorithm extracts image features with a CNN, predicts the sequence with an RNN, and finally obtains the result with CTC. Its main advantages are:

  1. it can be trained end to end;
  2. it can recognize text of arbitrary length;
  3. the model is simple and works well.

The main structure of CRNN consists of a CNN-based image feature extraction module and a multi-layer bidirectional LSTM module for extracting character sequence features:

[Figure: CRNN model architecture diagram [1]]

  1. Module 1: a CNN extracts features from the input image to produce a feature map. Following PaddleOCR, a modified MobileNetV3 is used; the input height is fixed at 32 while the width can be arbitrary, and after the CNN the feature-map height is reduced to 1;
  2. Module 2: Im2Seq reshapes the CNN feature map into the sequence of feature vectors the RNN expects;
  3. Module 3: a bidirectional LSTM (BiLSTM) predicts over the feature sequence, learning each feature vector and emitting a distribution over the predicted labels. In effect, the width of the feature map is treated as the LSTM's time dimension;
  4. Module 4: a fully connected layer produces the model's predictions;
  5. Module 5: a CTC transcription layer decodes the model's predictions into the final output.

Note:

The transcription layer in module 5 corresponds to the label decoding process described above.

Further reading: An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition


class CRNN(nn.Layer):
    def __init__(self):
        super(CRNN, self).__init__()
        # Backbone: MobileNetV3
        self.backbone = MobileNetV3()
        in_channels = self.backbone.out_channels

        # Sequence modeling part, i.e. Im2Seq + BiLSTM
        self.neck = SequenceEncoder(in_channels, 96)
        in_channels = self.neck.out_channels

        # CTCHead; the number of output classes equals the dictionary size
        self.head = CTCHead(in_channels, char_num)

    def forward(self, x):
        x = self.backbone(x)
        x = self.neck(x)
        x = self.head(x)
        return x

The model above is split into three parts: backbone, neck, and head. Next we define each of them in turn.

4.1 Defining the backbone

Because text lines are short in height but long in width, the original MobileNetV3 structure is adjusted as follows:

  1. the stride of the 2nd, 4th, and 13th residual blocks is changed to (2, 1);
  2. the stride of the 7th residual block is changed to 1;
  3. the final adaptive average pooling is replaced by max pooling with stride 2.

This way, as an image passes through the network its height is downsampled by a factor of $2^{5}$, giving a final feature-map height of 1, while its width is downsampled by a factor of $2^{2}$, so the 100-pixel-wide input yields a final feature-map width of 25.

# Compute the channel count for each layer of MobileNetV3
def make_divisible(v, divisor=8, min_value=None):
    if min_value is None:
        min_value = divisor
    # Round the channel count to the nearest multiple of divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    # Make sure rounding down doesn't shrink the channels by more than 10%
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v
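
# A quick sanity check of the channel rounding, with scale=0.5 (the default used below):
assert make_divisible(16 * 0.5) == 8
assert make_divisible(960 * 0.5) == 480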
# Wrap convolution + batch normalization into ConvBNLayer for reuse
class ConvBNLayer(nn.Layer):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, groups=1, if_act=False, act=None):
        super(ConvBNLayer, self).__init__()
        self.if_act = if_act
        self.act = act
        # Convolution layer
        self.conv = nn.Conv2D(in_channels=in_channels, out_channels=out_channels,
            kernel_size=kernel_size, stride=stride, padding=padding, groups=groups, weight_attr=ParamAttr(), bias_attr=False)
        # Batch normalization layer
        self.bn = nn.BatchNorm2D(num_features=out_channels)

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        if self.if_act:
            if self.act == "relu":
                x = F.relu(x)
            elif self.act == "hardswish":
                x = F.hardswish(x)
            else:
                print("The activation function({}) is selected incorrectly.".format(self.act))
                exit()
        return x

# MobileNetV3 introduces the SE channel attention mechanism to improve the network, see https://arxiv.org/abs/1709.01507
# It learns the importance of each feature channel, strengthening useful channels and suppressing less useful ones
# Attention module
class SEModule(nn.Layer):
    def __init__(self, in_channels, reduction=4):
        super(SEModule, self).__init__()
        # Adaptive average pooling layer
        self.avg_pool = nn.AdaptiveAvgPool2D(1)
        # Squeeze and excitation convolutions
        self.conv1 = nn.Conv2D(in_channels=in_channels, out_channels=in_channels // reduction, 
            kernel_size=1, stride=1, padding=0, weight_attr=ParamAttr(), bias_attr=ParamAttr())
        self.conv2 = nn.Conv2D(in_channels=in_channels // reduction, out_channels=in_channels, 
            kernel_size=1, stride=1, padding=0, weight_attr=ParamAttr(), bias_attr=ParamAttr())

    def forward(self, inputs):
        outputs = self.avg_pool(inputs)
        outputs = self.conv1(outputs)
        outputs = F.relu(outputs)
        outputs = self.conv2(outputs)
        outputs = F.hardsigmoid(outputs)
        return inputs * outputs

# Residual block
class ResidualUnit(nn.Layer):
    def __init__(self, in_channels, mid_channels, out_channels, kernel_size, stride, use_se, act=None):
        super(ResidualUnit, self).__init__()
        self.if_shortcut = stride == 1 and in_channels == out_channels
        self.if_se = use_se
        # Expansion ConvBNLayer
        self.expand_conv = ConvBNLayer(in_channels=in_channels, out_channels=mid_channels,
            kernel_size=1, stride=1, padding=0, if_act=True, act=act)
        self.bottleneck_conv = ConvBNLayer(in_channels=mid_channels, out_channels=mid_channels, kernel_size=kernel_size, 
            stride=stride, padding=int((kernel_size - 1) // 2), groups=mid_channels, if_act=True, act=act)
        # If requested, add the SE attention module
        if self.if_se:
            self.mid_se = SEModule(mid_channels)
        # Projection ConvBNLayer
        self.linear_conv = ConvBNLayer(in_channels=mid_channels, out_channels=out_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, inputs):
        x = self.expand_conv(inputs)
        x = self.bottleneck_conv(x)
        if self.if_se:
            x = self.mid_se(x)
        x = self.linear_conv(x)
        if self.if_shortcut:
            x = paddle.add(inputs, x)
        return x
class MobileNetV3(nn.Layer):
    def __init__(self, in_channels=3, scale=0.5, large_stride=None):
        super(MobileNetV3, self).__init__()
        # Default stride list
        if large_stride is None:
            large_stride = [1, 2, 2, 2]
        # Per-block configuration: kernel size, expansion channels, output channels,
        # whether to use SE attention, activation, stride
        cfg = [
            [3, 16, 16, False, 'relu', large_stride[0]],
            [3, 64, 24, False, 'relu', (large_stride[1], 1)],
            [3, 72, 24, False, 'relu', 1],
            [5, 72, 40, True, 'relu', (large_stride[2], 1)],
            [5, 120, 40, True, 'relu', 1],
            [5, 120, 40, True, 'relu', 1],
            [3, 240, 80, False, 'hardswish', 1],
            [3, 200, 80, False, 'hardswish', 1],
            [3, 184, 80, False, 'hardswish', 1],
            [3, 184, 80, False, 'hardswish', 1],
            [3, 480, 112, True, 'hardswish', 1],
            [3, 672, 112, True, 'hardswish', 1],
            [5, 672, 160, True, 'hardswish', (large_stride[3], 1)],
            [5, 960, 160, True, 'hardswish', 1],
            [5, 960, 160, True, 'hardswish', 1],
        ]

        cls_ch_squeeze = 960

        inplanes = 16
        # Stem ConvBNLayer
        self.conv1 = ConvBNLayer(in_channels=in_channels, out_channels=make_divisible(inplanes * scale),
            kernel_size=3, stride=2, padding=1, if_act=True, act='hardswish')
        i = 0
        block_list = []
        inplanes = make_divisible(inplanes * scale)
        # Build the residual blocks
        for (k, exp, c, se, nl, s) in cfg:
            block_list.append(ResidualUnit(in_channels=inplanes, mid_channels=make_divisible(scale * exp),
                    out_channels=make_divisible(scale * c), kernel_size=k, stride=s, use_se=se, act=nl))
            inplanes = make_divisible(scale * c)
            i += 1
        self.blocks = nn.Sequential(*block_list)
        # Final ConvBNLayer
        self.conv2 = ConvBNLayer(in_channels=inplanes, out_channels=make_divisible(scale * cls_ch_squeeze),
            kernel_size=1, stride=1, padding=0, if_act=True, act='hardswish')
        # Max pooling layer (replaces the original adaptive average pooling)
        self.pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0)
        self.out_channels = make_divisible(scale * cls_ch_squeeze)

    def forward(self, x):
        x = self.conv1(x)
        x = self.blocks(x)
        x = self.conv2(x)
        x = self.pool(x)
        return x
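
A quick shape check of the backbone (the numbers follow the downsampling analysis above):

backbone = MobileNetV3()
feat = backbone(paddle.randn([1, 3, 32, 100]))
print(feat.shape)   # [1, 480, 1, 25]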

4.2 Defining the neck

Next is the neck. In CRNN the neck consists of two modules:

  1. Im2Seq, which reshapes the CNN feature map into the sequence of feature vectors the RNN expects;
  2. a multi-layer bidirectional LSTM (BiLSTM); here 2 layers are used.

class Im2Seq(nn.Layer):
    def __init__(self, in_channels):
        super().__init__()
        self.out_channels = in_channels

    def forward(self, x):
        B, C, H, W = x.shape
        assert H == 1
        # Drop the height dimension (axis 2)
        x = x.squeeze(axis=2)
        # Reshape to (batch, width, channels)
        x = x.transpose([0, 2, 1])
        return x

class EncoderWithRNN(nn.Layer):
    def __init__(self, in_channels, hidden_size):
        super(EncoderWithRNN, self).__init__()
        self.out_channels = hidden_size * 2
        # 2-layer BiLSTM
        self.lstm = nn.LSTM(in_channels, hidden_size, direction='bidirectional', num_layers=2)

    def forward(self, x):
        x, _ = self.lstm(x)
        return x

# Chain the Im2Seq and BiLSTM modules
class SequenceEncoder(nn.Layer):
    def __init__(self, in_channels, hidden_size):
        super(SequenceEncoder, self).__init__()
        self.encoder_reshape = Im2Seq(in_channels)
        self.encoder = EncoderWithRNN(self.encoder_reshape.out_channels, hidden_size)
        self.out_channels = self.encoder.out_channels

    def forward(self, x):
        x = self.encoder_reshape(x)
        x = self.encoder(x)
        return x
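
The neck thus maps the [1, 480, 1, 25] feature map to a sequence of 25 vectors of size 192 (2 × hidden_size); a minimal check:

neck = SequenceEncoder(in_channels=480, hidden_size=96)
seq = neck(paddle.randn([1, 480, 1, 25]))
print(seq.shape)   # [1, 25, 192]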

4.3 Defining the head

Finally, the head: a fully connected layer that produces the model's predictions.

class CTCHead(nn.Layer):
    def __init__(self, in_channels, out_channels):
        super(CTCHead, self).__init__()
        stdv = 1.0 / math.sqrt(in_channels * 1.0)
        # Fully connected layer with uniform initialization
        self.fc = nn.Linear(in_channels, out_channels, 
            weight_attr=ParamAttr(initializer=nn.initializer.Uniform(-stdv, stdv)),
            bias_attr=ParamAttr(initializer=nn.initializer.Uniform(-stdv, stdv)))
        self.out_channels = out_channels

    def forward(self, x):
        predicts = self.fc(x)
        # At inference time, apply softmax to get output probabilities
        if not self.training:
            predicts = F.softmax(predicts, axis=2)
        return predicts

5. Defining the Loss Function

To handle the misalignment between the predicted and ground-truth label sequences, CTC loss is used to train the model.

class CTCLoss(nn.Layer):
    def __init__(self):
        super(CTCLoss, self).__init__()
        self.loss_func = nn.CTCLoss(blank=0, reduction='none')

    def __call__(self, predicts, batch):
        # [B, W, num_classes] -> [W, B, num_classes]
        predicts = predicts.transpose((1, 0, 2))
        N, B, _ = predicts.shape
        preds_lengths = paddle.to_tensor([N] * B, dtype='int64')
        labels = batch[1].astype("int32")
        label_lengths = batch[2].astype('int64')
        loss = self.loss_func(predicts, labels, preds_lengths, label_lengths)
        loss = loss.mean()
        return {'loss': loss}

# Instantiate the loss
loss_class = CTCLoss()
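
A minimal sanity check with random predictions and hypothetical labels, shaped like the dataloader output (label values are made-up dictionary indices):

dummy_preds = paddle.randn([2, 25, char_num])
dummy_batch = [None,
               paddle.to_tensor([[10, 11, 0, 0], [12, 13, 14, 0]], dtype='int32'),
               paddle.to_tensor([2, 3], dtype='int64')]
print(loss_class(dummy_preds, dummy_batch))   # {'loss': Tensor(...)}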

6. Model Training

Here a model open-sourced by PaddlePaddle is used as the pretrained model to improve accuracy.

Instantiating the model

# Instantiate the CRNN model defined above
model = CRNN()

Defining the optimizer

The Adam optimizer is used, with learning rate 0.0005, beta1 = 0.9, beta2 = 0.999, and epsilon = 1e-08.

# Adam optimizer: learning rate 0.0005, beta1=0.9, beta2=0.999, epsilon=1e-08
learning_rate = 0.0005
optimizer = optim.Adam(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08, parameters=model.parameters())

Loading pretrained parameters and fine-tuning

# Initialize the network from an existing pretrained model
def init_model(model, checkpoints):
    # Make sure the pretrained parameter file exists
    assert os.path.exists(checkpoints + ".pdparams"), "Given dir {}.pdparams not exist.".format(checkpoints)
    # Load the model parameters
    para_dict = paddle.load(checkpoints + '.pdparams')
    model.set_state_dict(para_dict)

# Path prefix of the pretrained weights
checkpoints = './pretrain_models/rec_mv3_none_bilstm_ctc_v2.0_train/best_accuracy'
# Load the pretrained parameters
init_model(model, checkpoints)

Defining the evaluation metrics

Two metrics are used to evaluate the model:

  • acc: accuracy. A prediction counts as correct only when the predicted and ground-truth strings match exactly.
  • norm_edit_dis: $1 - \overline{distance_{Levenshtein}}$, i.e. one minus the average normalized Levenshtein distance.

Note:

The Levenshtein distance between two strings is the minimum number of single-character edits (replacements, insertions, or deletions) needed to turn one string into the other; the smaller the distance, the more similar the two strings.
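
For example, using the python-Levenshtein package that the metric class below relies on:

print(Levenshtein.distance('9APK', '9APX'))            # 1 (one substitution)
print(1 - Levenshtein.distance('9APK', '9APX') / 4)    # 0.75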


# Evaluation class
class RecMetric(object):
    def __init__(self, main_indicator='acc'):
        # Use 'acc' as the main metric
        self.main_indicator = main_indicator
        self.reset()

    def __call__(self, pred_label):
        # Unpack the predictions and ground-truth labels
        preds, labels = pred_label
        correct_num = 0
        all_num = 0
        norm_edit_dis = 0.0
        # Compute accuracy and norm_edit_dis
        for (pred, pred_conf), (target, _) in zip(preds, labels):
            pred = pred.replace(" ", "")
            target = target.replace(" ", "")
            norm_edit_dis += Levenshtein.distance(pred, target) / max(len(pred), len(target), 1)
            if pred == target:
                correct_num += 1
            all_num += 1
        self.correct_num += correct_num
        self.all_num += all_num
        self.norm_edit_dis += norm_edit_dis
        return {
            'acc': correct_num / all_num,
            'norm_edit_dis': 1 - norm_edit_dis / all_num
        }

    def get_metric(self):
        # Compute the accumulated accuracy and norm_edit_dis, then reset
        acc = 1.0 * self.correct_num / self.all_num
        norm_edit_dis = 1 - self.norm_edit_dis / self.all_num
        self.reset()
        return {'acc': acc, 'norm_edit_dis': norm_edit_dis}

    def reset(self):
        # Reset the accumulators
        self.correct_num = 0
        self.all_num = 0
        self.norm_edit_dis = 0

# Instantiate the metric class
eval_class = RecMetric()

Configuring global variables

# Whether to compute metrics during training
cal_metric_during_train = True
# Length of the log smoothing window
log_smooth_window = 20
# Number of training epochs
epoch_num = 40
# Interval (in iterations) between log prints
print_batch_step = 10
# Current iteration count
global_step = 0
# Directory for saving models
save_model_dir = './output/rec/ic15/'
if not os.path.exists(save_model_dir):
    os.makedirs(save_model_dir)
# Path prefix for the best model
best_prefix = os.path.join(save_model_dir, 'best_accuracy')
# Path prefix for the latest model
latest_prefix = os.path.join(save_model_dir, 'latest')

Model training

# Set the random seed
paddle.seed(2)
# Select the device; use GPU 0 if available
use_gpu = True
paddle.set_device('gpu:0') if use_gpu else paddle.set_device('cpu')
# Dict holding the metrics of the best model so far
best_model_dict = {'acc': 0, 'norm_edit_dis': 0, 'best_epoch': 0}
# Put the model in training mode
model.train()
# Training loop
for epoch in range(epoch_num):
    for idx, batch in enumerate(train_dataloader):
        if idx >= len(train_dataloader):
            break
        lr = optimizer.get_lr()
        # Get the images of the current batch
        images = batch[0]
        # Forward pass
        preds = model(images)
        # Compute the loss
        loss = loss_class(preds, batch)
        avg_loss = loss['loss']
        # Backward pass
        avg_loss.backward()
        optimizer.step()
        optimizer.clear_grad()
        step_loss = avg_loss.numpy().mean()
        # Compute metrics during training
        if cal_metric_during_train:
            batch = [item.numpy() for item in batch]
            post_result = post_process_class(preds, batch[1])
            eval_class(post_result)
            metric = eval_class.get_metric()
            acc = metric['acc']
            norm_edit_dis = metric['norm_edit_dis']

        if global_step > 0 and global_step % print_batch_step == 0:
            logs = 'loss: {:.6f}, acc: {:.6f}, norm_edit_dis: {:.6f}'.format(step_loss, acc, norm_edit_dis)
            print('epoch: [{}/{}], iter: {}, {}'.format(epoch, epoch_num, global_step, logs))
        global_step += 1

    # Evaluate the model every 5 epochs
    if (epoch + 1) % 5 == 0:
        # Put the model in evaluation mode
        model.eval()
        # No gradients are needed during evaluation
        with paddle.no_grad():
            for idx, batch in enumerate(val_dataloader):
                if idx >= len(val_dataloader):
                    break
                images = batch[0]
                # Forward pass
                preds = model(images)
                batch = [item.numpy() for item in batch]
                # Post-process the predictions
                post_result = post_process_class(preds, batch[1])
                # Accumulate the metrics
                eval_class(post_result)
            # Get the final metrics
            cur_metric = eval_class.get_metric()
        print('[validation] cur metric, {}'.format(', '.join(['{}: {}'.format(k, v) for k, v in cur_metric.items()])))
        # If the current model beats the best one so far, save it
        if cur_metric['acc'] >= best_model_dict['acc']:
            best_model_dict.update(cur_metric)
            best_model_dict['best_epoch'] = epoch
            # Save the model
            paddle.save(model.state_dict(), best_prefix + '.pdparams')
        print('[validation] best metric, {}'.format(', '.join(['{}: {}'.format(k, v) for k, v in best_model_dict.items()])))
        # Restore training mode
        model.train()

# Print the best metrics achieved during training
print('best metric, {}'.format(', '.join(['{}: {}'.format(k, v) for k, v in best_model_dict.items()])))

Model validation and prediction can be implemented along the same lines; a minimal inference sketch is shown below.
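
A minimal inference sketch, assuming the classes defined above and the best model saved during training (9APK.png is the sample from section 1.4; the decoded text and confidence depend on your trained weights):

# Load the best weights and switch to evaluation mode
model.set_state_dict(paddle.load(best_prefix + '.pdparams'))
model.eval()
# Preprocess a single image; mode='test' skips the label encoding
with open('9APK.png', 'rb') as f:
    raw = f.read()
image = transform({'image': raw}, mode='test')[0]
image = paddle.to_tensor(image[np.newaxis, :])   # add the batch dimension
# Predict and decode
with paddle.no_grad():
    preds = model(image)
print(post_process_class(preds))   # e.g. [('9APK', 0.99)]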

7. Implementing CRNN with PyTorch

Here the same model is re-implemented in PyTorch.
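
The PyTorch version assumes these imports:

import math

import torch
import torch.nn as nn
import torch.nn.functional as F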

class CRNN(nn.Module):
    def __init__(self):
        super(CRNN, self).__init__()
        # Backbone: MobileNetV3
        self.backbone = MobileNetV3()
        in_channels = self.backbone.out_channels

        # Neck: Im2Seq + BiLSTM
        self.neck = SequenceEncoder(in_channels, 96)
        in_channels = self.neck.out_channels

        # Head: 36 characters plus the CTC 'blank' class, matching char_num in the Paddle version
        self.head = CTCHead(in_channels, 37)

    def forward(self, x):
        x = self.backbone(x)
        x = self.neck(x)
        x = self.head(x)
        return x
    
    
def make_divisible(v, divisor=8, min_value=None):
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v


class ConvBNLayer(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, groups=1, if_act=False, act=None):
        super(ConvBNLayer, self).__init__()
        self.if_act = if_act
        self.act = act
        self.conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
            kernel_size=kernel_size, stride=stride, padding=padding, groups=groups, bias=False)
        self.bn = nn.BatchNorm2d(num_features=out_channels)

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        if self.if_act:
            if self.act == "relu":
                x = F.relu(x)
            elif self.act == "hardswish":
                x = F.hardswish(x)
            else:
                print("The activation function({}) is selected incorrectly.".format(self.act))
                exit()
        return x
    

class SEModule(nn.Module):
    def __init__(self, in_channels, reduction=4):
        super(SEModule, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=in_channels // reduction, 
            kernel_size=1, stride=1, padding=0)
        self.conv2 = nn.Conv2d(in_channels=in_channels // reduction, out_channels=in_channels, 
            kernel_size=1, stride=1, padding=0)

    def forward(self, inputs):
        outputs = self.avg_pool(inputs)
        outputs = self.conv1(outputs)
        outputs = F.relu(outputs)
        outputs = self.conv2(outputs)
        outputs = F.hardsigmoid(outputs)
        return inputs * outputs
    

class ResidualUnit(nn.Module):
    def __init__(self, in_channels, mid_channels, out_channels, kernel_size, stride, use_se, act=None):
        super(ResidualUnit, self).__init__()
        self.if_shortcut = stride == 1 and in_channels == out_channels
        self.if_se = use_se
        self.expand_conv = ConvBNLayer(in_channels=in_channels, out_channels=mid_channels,
            kernel_size=1, stride=1, padding=0, if_act=True, act=act)
        self.bottleneck_conv = ConvBNLayer(in_channels=mid_channels, out_channels=mid_channels, kernel_size=kernel_size, 
            stride=stride, padding=int((kernel_size - 1) // 2), groups=mid_channels, if_act=True, act=act)
        if self.if_se:
            self.mid_se = SEModule(mid_channels)
        self.linear_conv = ConvBNLayer(in_channels=mid_channels, out_channels=out_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, inputs):
        x = self.expand_conv(inputs)
        x = self.bottleneck_conv(x)
        if self.if_se:
            x = self.mid_se(x)
        x = self.linear_conv(x)
        if self.if_shortcut:
            x = torch.add(inputs, x)
        return x
    

class MobileNetV3(nn.Module):
    def __init__(self, in_channels=3, scale=0.5, large_stride=None):
        super(MobileNetV3, self).__init__()
        if large_stride is None:
            large_stride = [1, 2, 2, 2]
        cfg = [
            [3, 16, 16, False, 'relu', large_stride[0]],
            [3, 64, 24, False, 'relu', (large_stride[1], 1)],
            [3, 72, 24, False, 'relu', 1],
            [5, 72, 40, True, 'relu', (large_stride[2], 1)],
            [5, 120, 40, True, 'relu', 1],
            [5, 120, 40, True, 'relu', 1],
            [3, 240, 80, False, 'hardswish', 1],
            [3, 200, 80, False, 'hardswish', 1],
            [3, 184, 80, False, 'hardswish', 1],
            [3, 184, 80, False, 'hardswish', 1],
            [3, 480, 112, True, 'hardswish', 1],
            [3, 672, 112, True, 'hardswish', 1],
            [5, 672, 160, True, 'hardswish', (large_stride[3], 1)],
            [5, 960, 160, True, 'hardswish', 1],
            [5, 960, 160, True, 'hardswish', 1],
        ]
        cls_ch_squeeze = 960

        inplanes = 16
        self.conv1 = ConvBNLayer(in_channels=in_channels, out_channels=make_divisible(inplanes * scale),
            kernel_size=3, stride=2, padding=1, if_act=True, act='hardswish')
        i = 0
        block_list = []
        inplanes = make_divisible(inplanes * scale)
        for (k, exp, c, se, nl, s) in cfg:
            block_list.append(ResidualUnit(in_channels=inplanes, mid_channels=make_divisible(scale * exp),
                    out_channels=make_divisible(scale * c), kernel_size=k, stride=s, use_se=se, act=nl))
            inplanes = make_divisible(scale * c)
            i += 1
        self.blocks = nn.Sequential(*block_list)
        self.conv2 = ConvBNLayer(in_channels=inplanes, out_channels=make_divisible(scale * cls_ch_squeeze),
            kernel_size=1, stride=1, padding=0, if_act=True, act='hardswish')
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.out_channels = make_divisible(scale * cls_ch_squeeze)

    def forward(self, x):
        x = self.conv1(x)
        x = self.blocks(x)
        x = self.conv2(x)
        x = self.pool(x)
        return x
    
    
class Im2Seq(nn.Module):
    def __init__(self, in_channels):
        super().__init__()
        self.out_channels = in_channels

    def forward(self, x):
        B, C, H, W = x.shape
        assert H == 1
        # Drop the height dimension, then reshape to (batch, width, channels)
        x = x.squeeze(2)
        x = x.transpose(1, 2)
        return x
    
    
class EncoderWithRNN(nn.Module):
    def __init__(self, in_channels, hidden_size):
        super(EncoderWithRNN, self).__init__()
        self.out_channels = hidden_size * 2
        # 2-layer BiLSTM; batch_first matches the (batch, width, channels) layout.
        # Moving the model and inputs to the right device is left to the caller.
        self.lstm = nn.LSTM(in_channels, hidden_size, bidirectional=True, num_layers=2, batch_first=True)

    def forward(self, x):
        x, _ = self.lstm(x)
        return x
    

class SequenceEncoder(nn.Module):
    def __init__(self, in_channels, hidden_size):
        super(SequenceEncoder, self).__init__()
        self.encoder_reshape = Im2Seq(in_channels)
        self.encoder = EncoderWithRNN(self.encoder_reshape.out_channels, hidden_size)
        self.out_channels = self.encoder.out_channels

    def forward(self, x):
        x = self.encoder_reshape(x)
        x = self.encoder(x)
        return x
    
    
class CTCHead(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(CTCHead, self).__init__()
        # Match the Paddle version's uniform initialization
        stdv = 1.0 / math.sqrt(in_channels * 1.0)
        self.fc = nn.Linear(in_channels, out_channels)
        nn.init.uniform_(self.fc.weight, -stdv, stdv)
        nn.init.uniform_(self.fc.bias, -stdv, stdv)
        self.out_channels = out_channels

    def forward(self, x):
        predicts = self.fc(x)
        # At inference time, apply softmax to get output probabilities
        if not self.training:
            predicts = F.softmax(predicts, dim=2)
        return predicts
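
A quick shape check of the PyTorch model (37 outputs = 36 characters + the CTC blank):

model = CRNN()
model.eval()
with torch.no_grad():
    out = model(torch.randn(1, 3, 32, 100))
print(out.shape)   # torch.Size([1, 25, 37])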
    

Reprinted from blog.csdn.net/weixin_43273742/article/details/129907181