基于python的验证码识别

在利用python对一些网站进行批量操作的时候，验证码是个绕不过去的东西。虽然现在网上有很多图像识别的api，但是可能不适用于你的项目。我为了批量爬取班上同学的学分绩点，写了个简单的图像识别程序，大家可以参考一下我的图像识别思路。

1. 验证码图片的获取

想要分析验证码，首先就需要获取验证码进行分析。在谷歌浏览器上打开验证码所在页面，打开开发者工具，选择Network，然后刷新一下验证码，就可以得到验证码的url地址，具体可以参考伯乐在线上的python计算学分绩点。

这是我的实验环境

有了验证码url以后，我们就可以利用python将验证码图片保存在本地，便于分析，代码如下。



import urllib  
import urllib.request  
import http.cookiejar  


# Captcha image URL, found by inspecting the page's network traffic.
yzmurl = 'http://run.hbut.edu.cn/Account/GetValidateCode?time=1488614613244'

# URL that handles the login POST request (taken from the captured packets).
posturl = 'http://run.hbut.edu.cn/Account/LogOn'

# URL of the login home page.
hosturl = 'http://run.hbut.edu.cn/Account/LogOn?ReturnUrl=%2f'

# Install a cookie-aware opener so every request shares one cookie jar.
cj = http.cookiejar.LWPCookieJar()
cookie_support = urllib.request.HTTPCookieProcessor(cj)
opener = urllib.request.build_opener(cookie_support, urllib.request.HTTPHandler)
urllib.request.install_opener(opener)

# Open the login page first: per the author's note this is done to pick up
# the site's cookies, without which the later POST is rejected.
h = urllib.request.urlopen(hosturl)

# Download the captcha image bytes; the with-block releases the response.
with opener.open(yzmurl) as response:
    picture = response.read()

# Save the image locally; the with-block closes the file even if the write
# fails.
with open('d://image.jpg', 'wb') as local:
    local.write(picture)

2.将彩色图片变为黑白

因为彩色图片比较复杂，图片识别时一般先把图片变为黑白。先把图片用 `convert("L")` 转化为256级灰度图像，再根据图片的阈值转化成黑白图像。阈值的合适与否直接影响到黑白图像的轮廓是否清晰，从而影响图像识别的准确度。彩色图片转化为黑白详情请参考不剃头的一休哥，阈值的作用及求法请参考阮一峰大牛的相似图片搜索。我的代码如下。

from PIL import Image
# Load the captcha image that was saved to disk.
im = Image.open("d:\\image.jpg")

# Collapse the colour image to 256-level grayscale.
imgry = im.convert("L")

# Pick the gray level that separates dark pixels from light ones.
threshold = thresholds(imgry)

# Lookup table for point(): levels below the threshold map to 0, the rest
# map to 1, which yields a two-level image.
table = [0 if level < threshold else 1 for level in range(256)]
out = imgry.point(table, '1')

#阈值计算函数，算法请看上面的链接
def thresholds(im):
    """Return the gray level that best separates dark from light pixels.

    Every candidate level ``i`` in 0..255 splits the pixels into those below
    ``i`` and those at or above ``i``. The level that maximises
    ``w1 * w2 * (u1 - u2) ** 2`` is returned, where ``w`` is each group's
    share of the pixel count and ``u`` is each group's mean value (the
    between-class variance). On ties the lowest level wins.

    Args:
        im: Object exposing ``size`` as ``(width, height)`` and ``getdata()``
            yielding one integer gray value per pixel (a PIL mode "L" image
            in this script).

    Returns:
        The selected level, or 0 when no level splits the pixels into two
        non-empty groups (uniform or empty image). The previous version
        raised UnboundLocalError in that case.
    """
    from collections import Counter

    width, height = im.size
    total = width * height

    # One pass builds a histogram, so the pixels are not rescanned for each
    # of the 256 candidate levels.
    histogram = Counter(im.getdata())
    levels = sorted(histogram)

    below_count = 0
    below_sum = 0
    above_count = sum(histogram.values())
    above_sum = sum(level * count for level, count in histogram.items())

    best_score = 0
    best_level = 0
    cursor = 0
    for candidate in range(256):
        # Move every gray value that now lies below the candidate from the
        # "above" group to the "below" group.
        while cursor < len(levels) and levels[cursor] < candidate:
            level = levels[cursor]
            count = histogram[level]
            below_count += count
            below_sum += level * count
            above_count -= count
            above_sum -= level * count
            cursor += 1
        if below_count == 0 or above_count == 0:
            # A one-sided split has no variance between groups.
            continue
        w1 = below_count / total
        w2 = above_count / total
        u1 = below_sum / below_count
        u2 = above_sum / above_count
        score = w1 * w2 * (u1 - u2) ** 2
        if score > best_score:
            best_score = score
            best_level = candidate
    return best_level

3.图片降噪

现在的验证码为了防止很轻易的被代码识别出来，往往会在制作验证码的时候加入很多噪声用来干扰，想要识别图片的话，就必须要降噪。
参考网上的文章之后，我用单位面积内点的密度来进行计算。于是首先计算单位面积内点的个数，将单位面积内点个数少于某一指定数的面积去除，剩余的部分基本上就是验证码字符的部分。详情请参照小五义的验证码降噪。

我的降噪代码

# Convert the two-level image to palette mode and denoise that copy;
# pointmidu() edits the image it receives in place and then saves it.
change = out.convert("P")
pointmidu(change)

#计算规定面积内黑色像素的个数
def numpoint(im):
    """Count the pixels of ``im`` whose value is not 255 (white).

    Args:
        im: Object exposing ``size`` as ``(width, height)`` and ``getdata()``
            returning the pixel values as a flat sequence.

    Returns:
        The number of values different from 255 among the first
        ``width * height`` entries of the pixel data.
    """
    width, height = im.size
    pixels = list(im.getdata())
    return sum(1 for index in range(width * height) if pixels[index] != 255)

#计算3*3范围内点的密度,具体大小根据自己情况而定
def pointmidu(im, block=3, min_points=2, save_path=r'd:\image1.gif'):
    """Blank out sparsely populated tiles of ``im`` and save the result.

    The image is walked in ``block`` x ``block`` tiles (tiles on the right
    and bottom edges are clipped to the image). A tile holding fewer than
    ``min_points`` non-white pixels, as counted by ``numpoint``, is treated
    as noise and filled with 255 (white). The image is modified in place and
    then written to ``save_path``.

    Args:
        im: Image exposing ``size``, ``crop``, ``putpixel`` and ``save``.
        block: Tile edge length in pixels; must be positive. Defaults to 3.
        min_points: Minimum number of non-white pixels a tile needs in order
            to be kept. Defaults to 2.
        save_path: Where the denoised image is written. The default is the
            same file the original hard-coded.

    Returns:
        None.
    """
    width, height = im.size
    for top in range(0, height, block):
        bottom = min(top + block, height)
        for left in range(0, width, block):
            right = min(left + block, width)
            tile = im.crop((left, top, right, bottom))
            if numpoint(tile) < min_points:
                # Too few dark pixels: wipe the whole tile to white.
                for px in range(left, right):
                    for py in range(top, bottom):
                        im.putpixel((px, py), 255)
    # The default path is a raw string because the original literal relied on
    # the invalid escape sequence "\i".
    im.save(save_path)
4.图片的分割

验证码识别是一个一个的识别，所以需要把验证码图片上的数字或字母分割出来，用windows自带的画板打开验证码图片，就可以粗略的找到分割点，之后再根据分割情况调整分割点。可以参考不剃头的一休哥。分割完之后，就可以进行识别了。

5.图像识别

图像分割完之后就可以进行识别了，要识别首先得有样本，所以先建立一个文件夹，然后在里面建立0-9的文件夹，用来存放样本，由于之前没有样本所以需要人工的识别几张分割好的图片，并把它放在对应的文件夹中，确保每个文件夹里至少有一张就可以了。
然后开始识别流程，识别用的是比较2个图片向量的余弦夹角，具体请参考wusuopuBUPT的专栏。
将要识别的图片与样本空间里所有的样本求余弦夹角，然后将结果进行排序，找出相似度最高的一个。如果相似度大于95%就抛弃，小于95%就保存在一个地方。写一个循环让这段代码执行一段时间，然后再人工地把保存的图片放在相应的文件夹中，这样样本就比较丰富，确保了识别的准确率。我这个的识别率是100%（当然我们学校的验证码全部由数字组成，比较简单）。

图像识别代码如下（样本空间已经生成完毕）

from PIL import Image
import os
import time
import re  
import math
import urllib  
import urllib.request  
import http.cookiejar  
import string
from html.parser import HTMLParser  
from urllib.parse import urlparse 
from time import sleep
#import pytesseract

#计算阈值
def thresholds(im):
    """Return the gray level that best separates dark from light pixels.

    Every candidate level ``i`` in 0..255 splits the pixels into those below
    ``i`` and those at or above ``i``. The level that maximises
    ``w1 * w2 * (u1 - u2) ** 2`` is returned, where ``w`` is each group's
    share of the pixel count and ``u`` is each group's mean value (the
    between-class variance). On ties the lowest level wins.

    Args:
        im: Object exposing ``size`` as ``(width, height)`` and ``getdata()``
            yielding one integer gray value per pixel (a PIL mode "L" image
            in this script).

    Returns:
        The selected level, or 0 when no level splits the pixels into two
        non-empty groups (uniform or empty image). The previous version
        raised UnboundLocalError in that case.
    """
    from collections import Counter

    width, height = im.size
    total = width * height

    # One pass builds a histogram, so the pixels are not rescanned for each
    # of the 256 candidate levels.
    histogram = Counter(im.getdata())
    levels = sorted(histogram)

    below_count = 0
    below_sum = 0
    above_count = sum(histogram.values())
    above_sum = sum(level * count for level, count in histogram.items())

    best_score = 0
    best_level = 0
    cursor = 0
    for candidate in range(256):
        # Move every gray value that now lies below the candidate from the
        # "above" group to the "below" group.
        while cursor < len(levels) and levels[cursor] < candidate:
            level = levels[cursor]
            count = histogram[level]
            below_count += count
            below_sum += level * count
            above_count -= count
            above_sum -= level * count
            cursor += 1
        if below_count == 0 or above_count == 0:
            # A one-sided split has no variance between groups.
            continue
        w1 = below_count / total
        w2 = above_count / total
        u1 = below_sum / below_count
        u2 = above_sum / above_count
        score = w1 * w2 * (u1 - u2) ** 2
        if score > best_score:
            best_score = score
            best_level = candidate
    return best_level

def numpoint(im):
    """Count the pixels of ``im`` whose value is not 255 (white).

    Args:
        im: Object exposing ``size`` as ``(width, height)`` and ``getdata()``
            returning the pixel values as a flat sequence.

    Returns:
        The number of values different from 255 among the first
        ``width * height`` entries of the pixel data.
    """
    width, height = im.size
    pixels = list(im.getdata())
    return sum(1 for index in range(width * height) if pixels[index] != 255)

#计算3*3范围内点的密度
def pointmidu(im, block=3, min_points=2, save_path=r'd:\image1.gif'):
    """Blank out sparsely populated tiles of ``im`` and save the result.

    The image is walked in ``block`` x ``block`` tiles (tiles on the right
    and bottom edges are clipped to the image). A tile holding fewer than
    ``min_points`` non-white pixels, as counted by ``numpoint``, is treated
    as noise and filled with 255 (white). The image is modified in place and
    then written to ``save_path``.

    Args:
        im: Image exposing ``size``, ``crop``, ``putpixel`` and ``save``.
        block: Tile edge length in pixels; must be positive. Defaults to 3.
        min_points: Minimum number of non-white pixels a tile needs in order
            to be kept. Defaults to 2.
        save_path: Where the denoised image is written. The default is the
            same file the original hard-coded.

    Returns:
        None.
    """
    width, height = im.size
    for top in range(0, height, block):
        bottom = min(top + block, height)
        for left in range(0, width, block):
            right = min(left + block, width)
            tile = im.crop((left, top, right, bottom))
            if numpoint(tile) < min_points:
                # Too few dark pixels: wipe the whole tile to white.
                for px in range(left, right):
                    for py in range(top, bottom):
                        im.putpixel((px, py), 255)
    # The default path is a raw string because the original literal relied on
    # the invalid escape sequence "\i".
    im.save(save_path)

def magnitude(concordance):
    """Return the Euclidean length of a vector stored as a mapping.

    Args:
        concordance: Mapping whose values are the numeric components; the
            keys are not used.

    Returns:
        The square root of the sum of the squared values, as a float.
    """
    squares = (value ** 2 for value in concordance.values())
    return math.sqrt(sum(squares))

#计算矢量之间的 cos 值
def relation(concordance1, concordance2):
    """Return the cosine of the angle between two vectors stored as mappings.

    Args:
        concordance1: Mapping of position -> numeric component.
        concordance2: Mapping of position -> numeric component.

    Returns:
        The dot product over the keys present in both mappings divided by
        the product of the two magnitudes. Returns 0.0 when either vector
        has zero magnitude; the previous version raised ZeroDivisionError
        in that case.
    """
    denominator = magnitude(concordance1) * magnitude(concordance2)
    if denominator == 0:
        # An all-zero vector (e.g. a crop whose pixels are all 0) has no
        # direction, so report no similarity rather than dividing by zero.
        return 0.0
    topvalue = sum(
        count * concordance2[word]
        for word, count in concordance1.items()
        if word in concordance2
    )
    return topvalue / denominator

#把图片转化为向量
def buildvector(im):
    """Turn an image into a vector keyed by pixel position.

    Args:
        im: Object whose ``getdata()`` yields the pixel values in order.

    Returns:
        A dict mapping each 0-based position in ``getdata()`` to the value
        found there.
    """
    return dict(enumerate(im.getdata()))



# Captcha image URL.
yzmurl = 'http://run.hbut.edu.cn/Account/GetValidateCode?time=1488614613244'

# URL that handles the login POST request (taken from the captured packets).
posturl = 'http://run.hbut.edu.cn/Account/LogOn'

# URL of the login home page.
hosturl = 'http://run.hbut.edu.cn/Account/LogOn?ReturnUrl=%2f'

# Install a cookie-aware opener so every request shares one cookie jar.
cj = http.cookiejar.LWPCookieJar()
cookie_support = urllib.request.HTTPCookieProcessor(cj)
opener = urllib.request.build_opener(cookie_support, urllib.request.HTTPHandler)
urllib.request.install_opener(opener)

# Open the login page first: per the author's note this is done to pick up
# the site's cookies, without which the later POST is rejected.
h = urllib.request.urlopen(hosturl)

# Download the captcha; the with-block releases the response when done.
with opener.open(yzmurl) as response:
    picture = response.read()

# Save the image locally; the with-block closes the file even if the write
# fails.
with open('d://image.jpg', 'wb') as local:
    local.write(picture)

# Load the training set: for each digit folder under ./yzm/, vectorise every
# sample image and store the list as a one-entry {digit: vectors} dict.
iconset = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
imageset = []
for letter in iconset:
    sample_dir = './yzm/%s/' % (letter)
    temp = [
        buildvector(Image.open("./yzm/%s/%s" % (letter, img)))
        for img in os.listdir(sample_dir)
    ]
    imageset.append({letter: temp})

# Binarize the downloaded captcha: grayscale first, then map every level
# below the computed threshold to 0 and the rest to 1.
im = Image.open("d:\\image.jpg")
imgry = im.convert("L")
threshold = thresholds(imgry)
table = [0 if level < threshold else 1 for level in range(256)]
out = imgry.point(table, '1')
change = out.convert("P")

# Denoise: pointmidu() edits the palette-mode copy in place and saves it.
pointmidu(change)

# Split the denoised image into one crop per character. Each pair in
# `letters` is the (left, right) column range of one character.
im2 = Image.open(r"d:\image1.gif")
letters = [(7, 15), (16, 24), (25, 33), (35, 43)]
ocr = []
for letter in letters:
    im3 = im2.crop((letter[0], 0, letter[1], im2.size[1]))
    # Vectorise the crop once; it is the same for every sample comparison.
    target = buildvector(im3)

    # Score the crop against every sample of every digit.
    guess = []
    for image in imageset:
        for x, y in image.items():
            for sample in y:
                guess.append((relation(sample, target), x))
    if not guess:
        # Without samples there is nothing to compare against; fail with a
        # clear message instead of an IndexError on an empty list.
        raise ValueError("no training samples loaded; cannot recognise the captcha")
    # The highest (score, digit) tuple is the best match.
    ocr.append(max(guess)[1])

# Join the recognised characters into the captcha text and print it.
yzm = ''.join(ocr)
print(yzm.strip())

第一次写技术类文章，写的不好请多多包涵