第二十六节：scrapy爬虫识别验证码（四）手绘验证码识别

一、介绍

今天主要介绍的是微博客户端在登录时出现的四宫格手绘验证码，不多说直接看看验证码长成什么样。

二、思路

1、由于微博上的手绘验证码只有四个宫格，且相邻经过的宫格之间都有有向线段连接，所以四个宫格按不同顺序连接的验证码一共有 4! = 24 种。

我们将四个宫格进行标号，得到的结果如下：

则我们可以排列出24种不同手绘方向的验证码，分别为以下24种：

1234	2134	3124	4321
1243	2143	3142	4312
1342	2314	3214	4123
1324	2341	3241	4132
1423	2413	3412	4213
1432	2431	3421	4231

2、获取到微博客户端的全部24种手绘验证码后，将它们作为模板；识别时把待识别的验证码与每个模板进行全图逐像素比对，匹配成功后按该模板对应的顺序依次拖动。

三、代码实现

1、首先是要通过微博移动端（https://passport.weibo.cn/signin/login）批量获取手绘验证码，但是这个验证码不一定出现，

只有在账号存在风险或者频繁登录的时候才会出现。获取手绘验证码的代码如下：

 1 # -*- coding:utf-8 -*-
 2 import time
 3 from io import BytesIO
 4 from PIL import Image
 5 from selenium import webdriver
 6 from selenium.webdriver.common.by import By
 7 from selenium.common.exceptions import TimeoutException
 8 from selenium.webdriver.support.ui import WebDriverWait
 9 from selenium.webdriver.support import expected_conditions as EC
10 
11 
class CrackWeiboSlide():
    """Collect sample images of the Weibo mobile-login pattern captcha.

    The login form is submitted repeatedly with Chrome; whenever the
    four-dot pattern captcha appears it is cropped out of a full-window
    screenshot and saved as a numbered PNG (``1.png``, ``2.png``, ...).
    """

    def __init__(self):
        """Start Chrome, maximize the window and create a 5-second explicit wait."""
        self.url = "https://passport.weibo.cn/signin/login?entry=mweibo&r=https://m.weibo.cn/"
        self.browser = webdriver.Chrome(r"D:\chromedriver.exe")
        self.browser.maximize_window()
        self.wait = WebDriverWait(self.browser, 5)

    def __del__(self):
        """Shut the browser session down when the object is destroyed."""
        # __init__ may have failed before the browser was created.
        browser = getattr(self, "browser", None)
        if browser is not None:
            # quit() ends the whole driver session; close() only closes the
            # current window.
            browser.quit()

    def open(self):
        """Load the login page, fill in the credentials and submit the form."""
        self.browser.get(self.url)
        # Username input
        username = self.wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="loginName"]')))
        # Password input
        password = self.wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="loginPassword"]')))
        # Login button
        submit = self.wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="loginAction"]')))
        # Submit the credentials
        username.send_keys("15612345678")
        password.send_keys("xxxxxxxxxxxx")
        submit.click()

    def get_image(self, name="captcha.png"):
        """Crop the captcha out of a screenshot and save it.

        :param name: file name the cropped captcha is saved under
        :return: the cropped image, or None when no captcha showed up
                 before the wait timed out
        """
        try:
            # Wait for the captcha element
            img = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, "patt-shadow")))
        except TimeoutException:
            print("没有出现验证码！！")
            # The caller decides whether to reload the page and try again;
            # reopening here would only load a page nobody inspects.
            return None
        time.sleep(1)
        # Bounding box of the captcha element
        location = img.location
        size = img.size
        top = location["y"]
        bottom = location["y"] + size["height"]
        left = location["x"]
        right = location["x"] + size["width"]
        print("验证码的位置：", left, top, right, bottom)
        # Take a screenshot of the current window and read it with PIL
        screenshot = self.browser.get_screenshot_as_png()
        screenshot = Image.open(BytesIO(screenshot))
        # Cut the captcha out of the screenshot and save it
        captcha = screenshot.crop((left, top, right, bottom))
        captcha.save(name)
        print("微博登录验证码保存完成！！！")
        return captcha

    def main(self, max_count=None):
        """Keep logging in and saving every captcha that appears.

        :param max_count: stop after this many captchas have been saved;
                          None (the default) keeps collecting forever
        """
        count = 1
        while max_count is None or count <= max_count:
            self.open()
            # Only advance the file number when an image was actually saved,
            # so the saved files stay numbered without gaps.
            if self.get_image(str(count) + ".png") is not None:
                count += 1
76 
77 
if __name__ == '__main__':
    # Script entry point: build the collector and start its capture loop.
    collector = CrackWeiboSlide()
    collector.main()

批量获取手绘验证码

得到24种手绘验证码后，需要根据上面的编号对这些图片进行命名（例如连接顺序为 1→2→3→4 的验证码命名为 1234.png），并放入 templates/ 目录。

上图就是我们需要的模板，接下来我们进行遍历模板匹配即可

2、模板匹配

通过遍历手绘验证码模板进行匹配

  1 import os
  2 import time
  3 from io import BytesIO
  4 from PIL import Image
  5 from selenium import webdriver
  6 from selenium.common.exceptions import TimeoutException
  7 from selenium.webdriver import ActionChains
  8 from selenium.webdriver.common.by import By
  9 from selenium.webdriver.support.ui import WebDriverWait
 10 from selenium.webdriver.support import expected_conditions as EC
 11 from os import listdir
 12  
 13 USERNAME = '13389185673'
 14 PASSWORD = ''
 15  
 16 TEMPLATES_FOLDER = 'templates/'
 17  
 18  
 19 class CrackWeiboSlide():
 20     def __init__(self):
 21         self.url = 'https://passport.weibo.cn/signin/login?entry=mweibo&r=https://m.weibo.cn/'
 22         self.browser = webdriver.Chrome()
 23         self.wait = WebDriverWait(self.browser, 20)
 24         self.username = USERNAME
 25         self.password = PASSWORD
 26  
 27     def __del__(self):
 28         self.browser.close()
 29  
 30     def open(self):
 31         """
 32         打开网页输入用户名密码并点击
 33         :return: None
 34         """
 35         self.browser.get(self.url)
 36         username = self.wait.until(EC.presence_of_element_located((By.ID, 'loginName')))
 37         password = self.wait.until(EC.presence_of_element_located((By.ID, 'loginPassword')))
 38         submit = self.wait.until(EC.element_to_be_clickable((By.ID, 'loginAction')))
 39         username.send_keys(self.username)
 40         password.send_keys(self.password)
 41         submit.click()
 42  
 43     def get_position(self):
 44         """
 45         获取验证码位置
 46         :return: 验证码位置元组
 47         """
 48         try:
 49             img = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'patt-shadow')))
 50         except TimeoutException:
 51             print('未出现验证码')
 52             self.open()
 53         time.sleep(2)
 54         location = img.location
 55         size = img.size
 56         top, bottom, left, right = location['y'], location['y'] + size['height'], location['x'], location['x'] + size[
 57             'width']
 58         return (top, bottom, left, right)
 59  
 60     def get_screenshot(self):
 61         """
 62         获取网页截图
 63         :return: 截图对象
 64         """
 65         screenshot = self.browser.get_screenshot_as_png()
 66         screenshot = Image.open(BytesIO(screenshot))
 67         return screenshot
 68  
 69     def get_image(self, name='captcha.png'):
 70         """
 71         获取验证码图片
 72         :return: 图片对象
 73         """
 74         top, bottom, left, right = self.get_position()
 75         print('验证码位置', top, bottom, left, right)
 76         screenshot = self.get_screenshot()
 77         captcha = screenshot.crop((left, top, right, bottom))
 78         captcha.save(name)
 79         return captcha
 80  
 81     def is_pixel_equal(self, image1, image2, x, y):
 82         """
 83         判断两个像素是否相同
 84         :param image1: 图片1
 85         :param image2: 图片2
 86         :param x: 位置x
 87         :param y: 位置y
 88         :return: 像素是否相同
 89         """
 90         # 取两个图片的像素点
 91         pixel1 = image1.load()[x, y]
 92         pixel2 = image2.load()[x, y]
 93         threshold = 20
 94         if abs(pixel1[0] - pixel2[0]) < threshold and abs(pixel1[1] - pixel2[1]) < threshold and abs(
 95                 pixel1[2] - pixel2[2]) < threshold:
 96             return True
 97         else:
 98             return False
 99  
100     def same_image(self, image, template):
101         """
102         识别相似验证码
103         :param image: 待识别验证码
104         :param template: 模板
105         :return:
106         """
107         # 相似度阈值
108         threshold = 0.99
109         count = 0
110         for x in range(image.width):
111             for y in range(image.height):
112                 # 判断像素是否相同
113                 if self.is_pixel_equal(image, template, x, y):
114                     count += 1
115         result = float(count) / (image.width * image.height)
116         if result > threshold:
117             print('成功匹配')
118             return True
119         return False
120  
121     def detect_image(self, image):
122         """
123         匹配图片
124         :param image: 图片
125         :return: 拖动顺序
126         """
127         for template_name in listdir(TEMPLATES_FOLDER):
128             print('正在匹配', template_name)
129             template = Image.open(TEMPLATES_FOLDER + template_name)
130             if self.same_image(image, template):
131                 # 返回顺序
132                 numbers = [int(number) for number in list(template_name.split('.')[0])]
133                 print('拖动顺序', numbers)
134                 return numbers
135  
136     def move(self, numbers):
137         """
138         根据顺序拖动
139         :param numbers:
140         :return:
141         """
142         # 获得四个按点
143         circles = self.browser.find_elements_by_css_selector('.patt-wrap .patt-circ')
144         dx = dy = 0
145         for index in range(4):
146             circle = circles[numbers[index] - 1]
147             # 如果是第一次循环
148             if index == 0:
149                 # 点击第一个按点
150                 ActionChains(self.browser) \
151                     .move_to_element_with_offset(circle, circle.size['width'] / 2, circle.size['height'] / 2) \
152                     .click_and_hold().perform()
153             else:
154                 # 小幅移动次数
155                 times = 30
156                 # 拖动
157                 for i in range(times):
158                     ActionChains(self.browser).move_by_offset(dx / times, dy / times).perform()
159                     time.sleep(1 / times)
160             # 如果是最后一次循环
161             if index == 3:
162                 # 松开鼠标
163                 ActionChains(self.browser).release().perform()
164             else:
165                 # 计算下一次偏移
166                 dx = circles[numbers[index + 1] - 1].location['x'] - circle.location['x']
167                 dy = circles[numbers[index + 1] - 1].location['y'] - circle.location['y']
168  
169     def crack(self):
170         """
171         破解入口
172         :return:
173         """
174         self.open()
175         # 获取验证码图片
176         image = self.get_image('captcha.png')
177         numbers = self.detect_image(image)
178         self.move(numbers)
179         time.sleep(10)
180         print('识别结束')
181  
182  
if __name__ == '__main__':
    # Script entry point: build the solver and run one recognition pass.
    cracker = CrackWeiboSlide()
    cracker.crack()

匹配验证识别

四、识别结果

程序先按住第一个点，再依次拖动到其余三个点，四个点之间绘出三条有向线段，最终得到效果图

第二十六节：scrapy爬虫识别验证码（四）手绘验证码识别

猜你喜欢