Python实现对简单验证码识别

本文参考链接：https://www.cnblogs.com/xuchunlin/p/9415620.html

首先收集原始的验证码图片：

这种验证码的特点是统一都是红色字体的四个字符组成，其中包括大写字母 A-Z，小写字母 a-z 和数字 0-9。我这儿需要的验证码识别对大小写不加以区分，所以大小写识别错误不算作识别错误，比如第一个验证码：DwP6，如果识别成：Dwp6，我们也把他算作识别正确。通过观察，我们可以提前想到，这种验证码识别的难度在于部分字符的难以区分，比如 S 和 5、I 和 1、 i 和 j、2 和 Z、0 和 o 、y 和 v 、9 和 g 等等之类。字符间的间距也会有一定的干扰，降低了单个字符的区分度，比如会把紧邻的两个字符 vv识别成一个字符 w。验证码中的部分字符连人本身都难以区分，对机器而言难度也就可想而知了。

首先进行灰度处理，将原来的彩色图片变成灰度图片，同时去除部分图片中的暗色条纹。这里选择合适的阈值参数比较重要，阈值太大过滤不掉太多杂质信息，阈值太小则剩下的有效信息又太少。下面是阈值 threshold 分别为 210， 180， 150 对应的灰度图片，我程序里选择的 threshold=180。

import os

import pytesseract
from PIL import Image

before = r"D:\MyProject\Python\ReturnVisit\auth_code\before\\"  # 原始验证码图片目录
after = r"D:\MyProject\Python\ReturnVisit\auth_code\after\\"  # 处理后验证码图片目录


def grey_processing():  # 灰度处理
    threshold = 180
    file_list = os.listdir(before)
    for file in file_list:
        path = before + file
        img = Image.open(path)
        w, h = img.size
        for x in range(w):
            for y in range(h):
                r, g, b = img.getpixel((x, y))
                if 190 <= r <= 255 and 170 <= g <= 255 and 0 <= b <= 140:
                    img.putpixel((x, y), (0, 0, 0))
                if 0 <= r <= 90 and 210 <= g <= 255 and 0 <= b <= 90:
                    img.putpixel((x, y), (0, 0, 0))
        img = img.convert('L').point([0] * threshold + [1] * (256 - threshold), '1')
        #  边缘噪点处理
        for x in range(0, img.size[0]):
            for y in range(0, img.size[1]):
                if x in [0, img.size[0] - 1] or y in [0, img.size[1] - 1]:
                    img.putpixel((x, y), 255)  # 将边缘全变为非黑的点(简单粗暴)
        path = path.replace('PNG', 'jpeg')
        path = path.replace('before', 'after')  # 更换新图片保存路径
        img.save(path)

灰度处理后的图片：

然后进行二值化处理和内部噪点处理：

t2val = {}  # 使用字典来保存二值化像素矩阵

def two_value(image, G):  # 二值化处理, G是图像二值化阀值
    for y in range(0, image.size[1]):
        for x in range(0, image.size[0]):
            g = image.getpixel((x, y))
            if g < G:  # 灰度处理以后，点的区分度已经比较明显了。
                t2val[(x, y)] = 1
            else:
                t2val[(x, y)] = 0


# 自定义孤立点为噪点                
def clear_noise(image):  # 清噪
    #  边缘噪点处理(本来想放到这块，但是这儿对像素的修改不能即使生效，对内部噪点处理会有影响，这不是我想要的结果!)
    # for x in range(0, image.size[0]):
    #     for y in range(0, image.size[1]):
    #         if x in [0, image.size[0] - 1] or y in [0, image.size[1] - 1]:
    #             image.putpixel((x, y), 255)  # 将边缘全变为非黑的点(简单粗暴)
    # 内部噪点处理
    for x in range(1, image.size[0] - 1):
        for y in range(1, image.size[1] - 1):
            nearDots = 0
            L = 1  # 黑点用1表示
            if L == t2val[(x - 1, y - 1)]:
                nearDots += 1
            if L == t2val[(x - 1, y)]:
                nearDots += 1
            if L == t2val[(x - 1, y + 1)]:
                nearDots += 1
            if L == t2val[(x, y - 1)]:
                nearDots += 1
            if L == t2val[(x, y + 1)]:
                nearDots += 1
            if L == t2val[(x + 1, y - 1)]:
                nearDots += 1
            if L == t2val[(x + 1, y)]:
                nearDots += 1
            if L == t2val[(x + 1, y + 1)]:
                nearDots += 1

            if nearDots == 0 and t2val[(x, y)] == 1:  # 如果当前是黑点且周围领域没有其他黑点
                image.putpixel((x, y), 255)  # 将当前点变为非黑的点
def denoise():  # 降噪
    file_list = os.listdir(after)
    for file in file_list:
        path = after + file
        image = Image.open(path)
        two_value(image, 100)
        clear_noise(image)
        path = path.replace('before', 'after')
        image.save(path)

降噪处理后的图片：

然后就是关于对降噪后的验证码图片进行识别了，其中会用到 pytesseract，这个包的安装和使用请自行参考链接https://blog.csdn.net/EB_NUM/article/details/77060009。需要下载 tesseract-ocr-setup-3.02.02.exe 进行安装，并对 pytesseract.py文件进行相应修改。

def recognize():  # 识别
    file_list = os.listdir(after)
    for file in file_list:
        img_path = after + file
        im = Image.open(img_path)
        text = pytesseract.image_to_string(im, lang='eng')
        exclude_char_list = ' ‘’.:\\|\'\"?![],()~@#$%^&*_+-={};<>/¥©“é€£'
        #  去除识别结果中的特殊字符
        text = ''.join([x for x in text if x not in exclude_char_list])[:4]
        #  对易混淆字符添加识别结果可选项，提高识别率
        #  S 和 5、l 和 1、 i 和 j、2 和 Z、0 和 o 、y 和 v 、9 和 g、h 和 n
        options = [text]
        if len(text) == 4:
            if 'i' in text:
                options.append(text.replace('i', 'j'))
            if 'j' in text:
                options.append(text.replace('j', 'i'))
            if 'l' in text:
                options.append(text.replace('l', '1'))
            if '1' in text:
                options.append(text.replace('1', 'l'))
            if 's' in text:
                options.append(text.replace('s', '5'))
            if '5' in text:
                options.append(text.replace('5', 's'))
            if '2' in text:
                options.append(text.replace('2', 'z'))
            if 'z' in text:
                options.append(text.replace('z', '2'))
            if '0' in text:
                options.append(text.replace('0', 'o'))
            if 'o' in text:
                options.append(text.replace('o', '0'))
            if 'y' in text:
                options.append(text.replace('y', 'v'))
            if 'v' in text:
                options.append(text.replace('v', 'y'))
            if '9' in text:
                options.append(text.replace('9', 'g'))
            if 'g' in text:
                options.append(text.replace('g', '9'))
            if 'h' in text:
                options.append(text.replace('h', 'n'))
            if 'n' in text:
                options.append(text.replace('n', 'h'))
            if '8' in text:
                options.append(text.replace('8', 'B'))
            if 'B' in text:
                options.append(text.replace('B', '8'))
        elif len(text) == 3 and "W" in text.upper():
            options.append(text.upper().replace('W', 'vv'))
            options.append(text.upper().replace('W', 'yy'))
        print(file, options)

识别结果如下：

"D:\Program Files\Python36\python3.exe" D:/MyProject/Python/ReturnVisit/verification_code_recognize.py
IMG_0326.jpeg ['DwP6']
IMG_0327.jpeg ['QXH']
IMG_0328.jpeg ['hzew', 'h2ew', 'nzew']
IMG_0329.jpeg ['']
IMG_0330.jpeg ['iQlg', 'jQlg', 'iQ1g', 'iQl9']
IMG_0331.jpeg ['apor', 'ap0r']
IMG_0332.jpeg ['apor', 'ap0r']
IMG_0333.jpeg ['wsn', 'vvSN', 'yySN']
IMG_0334.jpeg ['uSZ5', 'uSZs']
IMG_0335.jpeg ['nz']
IMG_0336.jpeg ['aNpW']
IMG_0337.jpeg ['RWU', 'RvvU', 'RyyU']
IMG_0338.jpeg ['LNX1', 'LNXl']
IMG_0339.jpeg ['Mzln', 'Mz1n', 'M2ln', 'Mzlh']
IMG_0340.jpeg ['Xo1v', 'Xolv', 'X01v', 'Xo1y']
IMG_0341.jpeg ['WBJ4']
IMG_0342.jpeg ['Qc1S', 'QclS']
IMG_0343.jpeg ['JTPB']
IMG_0344.jpeg ['rW7N']
IMG_0345.jpeg ['ZSLE']
IMG_0347.jpeg ['b3D']
IMG_0348.jpeg ['OY7']
IMG_0349.jpeg ['HQK7']
IMG_0350.jpeg ['X304', 'X3o4']
IMG_0351.jpeg ['2PQW', 'zPQW']
IMG_0352.jpeg ['psks', 'p5k5']
IMG_0353.jpeg ['NOE']
IMG_0354.jpeg ['WYOY']
IMG_0355.jpeg ['M3oA', 'M30A']
IMG_0356.jpeg ['AMV']
IMG_0357.jpeg ['VFSR']
IMG_0358.jpeg ['ZZIH']
IMG_0359.jpeg ['4RoS', '4R0S']
IMG_0360.jpeg ['mu']
IMG_0361.jpeg ['mm']
IMG_0362.jpeg ['FBV2', 'FBVz']
IMG_0363.jpeg ['2eEc', 'zeEc']
IMG_0364.jpeg ['Wm']
IMG_0365.jpeg ['EoFR', 'E0FR']
IMG_0366.jpeg ['CQU9', 'CQUg']

Process finished with exit code 0

其中列表第一个元素为原始识别结果，通过统计，识别正确率大概只有25%左右。如果把S 和 5、I 和 1、 i 和 j、2 和 Z、0 和 o 、y 和 v 、9 和 g这些容易混淆字符稍作处理给出多个可选项，识别正确率应该会有一定的提升。

如果你想自己训练模型来进行验证码的识别，请继续往下看：

自己训练模型的话，最重要的是训练集的收集了。为表诚意，小弟先奉上1056张带标记的验证码图片百度云链接（为了便于分享，已经将图片文件夹压缩成了rar文件，其中图片文件名为验证码生成的真实字符串，完全手工标记，因此也有部分出错的地方，不过出错的部分很少，不用过于担心！）：链接：https://pan.baidu.com/s/14pJdTXD9OarUSrkgeDGE5g 提取码：xfym，我自己使用了其中 1000 张作为训练集，剩下的 56 张作为验证集使用。

目录结构如图：其中 train 目录是训练集所在目录，before 为带标签的原始验证码图片目录；after 为二值化、降噪处理后的验证码图片目录；cutting 则为验证码处理后的图片再次切割后的单个字符图片目录。validation 目录是验证集所在目录，目录结构与 train 目录类似：

验证集有了以后，先进行灰度化处理（同上），然后对每四个字符的验证码图片进行切割标记：

import numpy as np
from sklearn.externals import joblib
from sklearn.neighbors import KNeighborsClassifier
before = r"D:\MyProject\Python\ReturnVisit\auth_code\train\before\\"  # 原始验证码图片目录
after = r"D:\MyProject\Python\ReturnVisit\auth_code\train\after\\"  # 处理后验证码图片目录
cutting = r"D:\MyProject\Python\ReturnVisit\auth_code\train\cutting\\"  # 切割后的图片目录


def smart_slice_image(img, outDir, file, count=4, p_w=3):
    """
    :param img: image对象
    :param outDir: 切割图片输出目录
    :param count: 图片中有多少个图片
    :param p_w: 对切割地方多少像素内进行判断
    """
    w, h = img.size
    pixdata = img.load()
    eachWidth = int(w / count)  # 每个切割图片的宽度
    beforeX = 0
    for i in range(count):
        # temp_dir = file[i]
        allBCount = []
        nextXOri = (i + 1) * eachWidth
        for x in range(nextXOri - p_w, nextXOri + p_w):
            if x >= w:
                x = w - 1
            if x < 0:
                x = 0
            b_count = 0
            for y in range(h):
                if pixdata[x, y] == 0:
                    b_count += 1
            allBCount.append({'x_pos': x, 'count': b_count})
        sort = sorted(allBCount, key=lambda e: e.get('count'))
        nextX = sort[0]['x_pos']
        box = (beforeX, 0, nextX, h)
        isExists = os.path.exists(outDir + file[i].upper() + "\\")
        # 判断结果
        if not isExists:
            os.makedirs(outDir + file[i].upper() + "\\")
        img.crop(box).save(outDir + file[i].upper() + "\\" + file[i] + "_" + file)
        beforeX = nextX


def cutting_image():
    file_list = os.listdir(after)
    for file in file_list:
        path = after + file
        img = Image.open(path)
        out_dir = cutting
        smart_slice_image(img, out_dir, file, count=4, p_w=3)

验证码图片切割好以后，开始进行模型训练，我这儿的原始验证码图片规格为 49×25：

def load_dataset():  # 将切割后的图片与字符标签对应起来保存成数组
    X = []
    Y = []
    for i in "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ":
        target_path = cutting + i  # 训练集
        for title in os.listdir(target_path):
            # pix = np.asarray(Image.open(os.path.join(target_path, title)).convert('L'))
            pix = np.asarray(Image.open(os.path.join(target_path, title)).resize((13, 25), Image.ANTIALIAS))
            pix = pix.reshape(13 * 25)
            X.append(pix)
            Y.append(i)
    X = np.asarray(X)
    Y = np.asarray(Y)
    return X, Y


def train_model():
    X, Y = load_dataset()
    knn = KNeighborsClassifier()
    knn.fit(X, Y)
    joblib.dump(knn, 'recognize.model')

模型训练完成以后，开始进行模型验证：

def verify_model(model):
    pre_list = []
    y_list = []
    for i in "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ":
        part_path = cutting.replace('train', 'validation') + i  # 验证集
        if os.path.exists(part_path):
            for title in os.listdir(part_path):
                pix = np.asarray(Image.open(os.path.join(part_path, title)).resize((13, 25), Image.ANTIALIAS))
                pix = pix.reshape(13 * 25)
                pre_list.append(pix)  # 待预测列表
                y_list.append(i)
    pre_list = np.asarray(pre_list)
    # y_list = np.asarray(y_list)
    predict_list = list(model.predict(pre_list))  # 预测结果
    acc = 0
    for i in range(len(y_list)):
        if y_list[i] == predict_list[i]:
            acc += 1
    print("原始字符：", list(y_list))
    print("预测字符：", list(predict_list))
    print("单个字符识别正确率：", "%.2f" % (100 * acc / len(y_list)) + "%")


if __name__ == '__main__':
    train_model()
    model = joblib.load('recognize.model')
    verify_model(model)

运行输出结果为（一张验证码图片如果有相同字符，按照我当前对分割图片的命名规则会覆盖掉，所以这儿的原始字符个数并不是 56*4=224张），验证码字符的粘连性导致切割后的图片与字符的对应关系出错：

原始字符： ['0', '0', '1', '2', '4', '5', '5', '5', '5', '6', '6', '6', '6', '7', '7', '7', 
'9', '9', '9', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 
'D', 'D', 'D', 'D', 'D', 'D', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'F', 'F', 
'F', 'G', 'G', 'G', 'G', 'G', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'I', 'I', 'I', 'I', 'I', 
'I', 'J', 'J', 'J', 'J', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'L', 'L', 'L', 'L', 'L', 'M', 
'M', 'M', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'O', 'O', 'O', 'O', 'O', 'P', 'P', 'P', 'P', 
'P', 'P', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 
'R', 'S', 'S', 'S', 'S', 'S', 'S', 'T', 'T', 'T', 'U', 'U', 'V', 'V', 'V', 'V', 'V', 'V', 
'V', 'V', 'V', 'V', 'V', 'V', 'W', 'W', 'W', 'W', 'X', 'X', 'X', 'X', 'Y', 'Y', 'Y', 'Y', 
'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 
'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 
'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 
'Z', 'Z', 'Z', 'Z']
预测字符： ['1', '0', '1', '2', '4', '5', '5', '5', '1', '5', '6', '6', '5', '7', '7', '7', 
'D', '9', '9', 'Q', 'A', 'A', '1', 'B', 'B', 'B', 'C', '1', 'C', 'O', 'C', '1', 'C', 'O', 
'L', 'D', 'D', 'Q', 'D', 'W', 'E', 'C', '5', 'E', 'E', 'E', 'E', 'E', 'E', 'C', 'F', 'F', 
'4', 'G', 'G', 'G', 'G', 'G', 'H', 'H', 'H', 'H', 'H', 'H', 'H', '5', 'J', 'I', 'I', 'I', 
'I', 'D', '1', 'B', '1', '1', 'K', 'K', 'K', 'K', 'K', 'I', 'L', '1', 'I', '1', 'L', 'M', 
'H', 'M', 'N', 'N', 'M', 'H', 'N', '1', 'K', 'H', 'O', 'O', 'C', '0', 'B', 'P', '1', 'C', 
'B', 'O', 'D', 'O', 'Q', 'O', 'O', 'Q', 'I', 'Q', '0', 'S', 'L', 'R', '1', 'R', 'R', 'R', 
'H', 'H', 'S', 'S', 'S', 'S', 'S', 'H', 'U', 'T', 'U', 'I', 'V', '1', '1', 'V', 'V', '5', 
'Y', 'V', 'V', 'V', 'V', 'V', 'W', '1', 'W', 'W', 'X', 'X', 'X', 'X', 'Y', 'Y', 'Y', 'Y', 
'H', 'Y', '4', '1', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'W', 
'Y', 'Y', '1', '1', 'Y', 'Y', 'Y', '1', 'I', 'Z', 'Z', '7', '7', 'Z', '2', 'Z', '7', 'Z', 
'7', 'Z', '7', 'Z', '2', 'Z', '7', '7', '7', 'Z', 'Z', '7', '1', '7', 'Z', '7', 'Z', '7', 
'7', 'Z', 'Z', 'Z']
单个字符识别正确率： 61.01%

单个字符识别率在60%左右的话，整体的识别率还是偏低的，肯定还有可以进行改进的地方，比如降噪这块，可以使用 8-连通区域范围黑点的个数来进行更加合理的降噪。为了便于查看整个识别过程的步骤，我将单张验证码图片的识别单独写了一个函数。

import os

import pytesseract
from PIL import Image
import numpy as np
from sklearn.externals import joblib
from sklearn.neighbors import KNeighborsClassifier

before = r"D:\MyProject\Python\ReturnVisit\auth_code\train\before\\"  # 原始验证码图片目录
after = r"D:\MyProject\Python\ReturnVisit\auth_code\train\after\\"  # 处理后验证码图片目录
cutting = r"D:\MyProject\Python\ReturnVisit\auth_code\train\cutting\\"  # 切割后的图片目录


def single_picture_recognize(model, image_path):  # 单张验证码图片识别
    # 灰度处理
    img = Image.open(image_path)
    threshold = 180  # 阈值
    w, h = img.size
    for x in range(w):
        for y in range(h):
            r, g, b = img.getpixel((x, y))
            if 190 <= r <= 255 and 170 <= g <= 255 and 0 <= b <= 140:
                img.putpixel((x, y), (0, 0, 0))
            if 0 <= r <= 90 and 210 <= g <= 255 and 0 <= b <= 90:
                img.putpixel((x, y), (0, 0, 0))
    img = img.convert('L').point([0] * threshold + [1] * (256 - threshold), '1')

    #  边缘噪点处理
    for x in range(0, img.size[0]):
        for y in range(0, img.size[1]):
            if x in [0, img.size[0] - 1] or y in [0, img.size[1] - 1]:
                img.putpixel((x, y), 255)  # 将边缘全变为非黑的点(简单粗暴)
    img.save(image_path)

    # 降噪处理
    image = Image.open(image_path)
    two_value(image, 100)
    clear_noise(image)
    image.save(image_path)

    # 图片切割
    """
    :param img: image对象
    :param outDir: 切割图片输出目录
    :param count: 图片中有多少个图片
    :param p_w: 对切割地方多少像素内进行判断
    """
    img = Image.open(image_path)
    cutting_picture = []
    count = 4
    p_w = 3
    w, h = img.size
    pixdata = img.load()
    eachWidth = int(w / count)  # 每个切割图片的宽度
    beforeX = 0
    for i in range(count):
        # temp_dir = file[i]
        allBCount = []
        nextXOri = (i + 1) * eachWidth
        for x in range(nextXOri - p_w, nextXOri + p_w):
            if x >= w:
                x = w - 1
            if x < 0:
                x = 0
            b_count = 0
            for y in range(h):
                if pixdata[x, y] == 0:
                    b_count += 1
            allBCount.append({'x_pos': x, 'count': b_count})
        sort = sorted(allBCount, key=lambda e: e.get('count'))
        nextX = sort[0]['x_pos']
        box = (beforeX, 0, nextX, h)
        pix = np.asarray(img.crop(box).resize((13, 25), Image.ANTIALIAS))
        pix = pix.reshape(13 * 25)
        cutting_picture.append(pix)
        beforeX = nextX

    # 开始识别
    pre_list = np.asarray(cutting_picture)
    predict_list = list(model.predict(pre_list))  # 预测结果
    return predict_list


if __name__ == '__main__':
    # train_model()
    model = joblib.load('recognize.model')
    image_path = r"D:\MyProject\Python\ReturnVisit\auth_code\test.png"
    predict_result = single_picture_recognize(model, image_path)
    print(predict_result)

识别输出结果如下：

"D:\Program Files\Python36\python3.exe" D:/MyProject/Python/ReturnVisit/auth_code/verification_code_recognize.py
['Y', 'E', 'U', 'V']

Process finished with exit code 0

原始图片为：

Python实现对简单验证码识别

猜你喜欢