selenium最新模拟登录知乎

selenium模拟登录,保存cookies

使用selenium模拟登录,并把cookies保存到本地
现在还没有遇到大家说的倒立文字验证码
等后面遇到了,再做验证码处理(只要正确登录,很大程度上不会弹出验证码。验证码处理代码已更新)

由于知乎对selenium做了反爬处理,故此处需要接管本地已打开的浏览器,具体参考这篇文章:https://blog.csdn.net/qq_42206477/article/details/86477446

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

import os
import pickle
import time

chrome_options = Options()
chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")#知乎对selenium做了反爬处理,故此处接管本地已打开的浏览器
browser = webdriver.Chrome(executable_path = 'D:\Documents\Downloads\chromedriver_win32\chromedriver.exe', options=chrome_options)
# 初次建立连接,随后方可修改cookie
browser.get("https://www.zhihu.com/signin")
browser.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper input").send_keys("用户名")
time.sleep(3)
browser.find_element_by_css_selector(".SignFlow-password input").send_keys("密码")
browser.find_element_by_css_selector(
    ".Button.SignFlow-submitButton").click()
time.sleep(10)
Cookies = browser.get_cookies()
cookie_dict = {}

for cookie in Cookies:
    #将cookie写入文件
    f = open('zhihucookie'+cookie['name']+'.zhihu','wb')
    #将对象序列化成文件
    pickle.dump(cookie,f)
    f.close()
    cookie_dict[cookie['name']] = cookie['value']
browser.close()
使用cookies访问页面

使用我们保存在本地的cookies访问 https://www.zhihu.com
不再需要登录

chrome_options = Options()
chrome_options.add_argument('--headless')
browser = webdriver.Chrome(executable_path = 'D:\Documents\Downloads\chromedriver_win32\chromedriver.exe',options= chrome_options)
browser.get('https://www.zhihu.com') #必须要打开 不然set cookies时会出问题
# 删除第一次建立连接时的cookie
browser.delete_all_cookies()

# 读取登录时存储到本地的cookie
cookie_dict = {}
dirs = os.listdir('存储cookies的本地地址')                   
for i in dirs:     # 循环读取路径下的文件并筛选后缀名为zhihu的文件
   if os.path.splitext(i)[1] == ".zhihu":
       f = open(i,'rb')
       cookie= pickle.load(f)
       cookie_dict['name'] = cookie['name']
       cookie_dict['value'] = cookie['value'].replace('"', '')
       browser.add_cookie(cookie_dict)
       
# 再次访问页面,便可实现免登陆访问
browser.get('https://www.zhihu.com')
response = browser.page_source
print(response)
browser.close()
代码更新

上面的代码只是简单的介绍模拟登录保存cookies的原理。
今天重新整理了一版完整代码

# coding:utf-8
#selenium模拟登录知乎 保存cookie到本地 验证码识别
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import selenium.common.exceptions as ex
from selenium.webdriver import ActionChains

import requests
import time
import pickle
import os,base64
import re

# os.popen('chrome.exe --remote-debugging-port=9222 --user-data-dir="C:\selenum\AutomationProfile"')

class Zhihu:
    def __init__(self,home_url):
        self.home_url = home_url
        self.header = {
            'User-Agent': 'user-agentMozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
        }

	#保存session,下次可直接使用,避免再次登录
    def save_session(self,session): 
        with open('zhihu_session.txt','wb') as f:
            pickle.dump(session, f)
            print("Cookies have been writed.")

	#加载session
    def load_session(self):     
        with open('zhihu_session.txt', 'rb') as f:
            s = pickle.load(f)
        return s

	#判断是否登录
    def is_login(self,browser): 
        try:
            return bool(
                browser.find_element_by_css_selector(".GlobalSideBar-navText")
            )
        except ex.NoSuchElementException:
            return False

	#判断是否有验证码 并返回验证码类型
    def is_captch(self,browser): 
        ca_type = {}
        result = browser.find_element_by_css_selector(".SignFlow-captchaContainer img").get_attribute("src")
        if result != 'data:image/jpg;base64,null':
            type = browser.find_element_by_css_selector(".SignFlow-captchaContainer img").get_attribute("class")
            ca_type['url'] = result
            if type =='Captcha-chineseImg':
                ca_type['type'] = 'chinese' #中文验证码
            else:
                ca_type['type'] = 'english' #英文验证码
        else:
            ca_type['url'] = 'none'
            ca_type['type'] = 'none'#没有验证码

        return ca_type

	#获取验证码图片保存到当前目录并识别
    def get_captcha(self,type,url,browser): 
        content = re.search('data:image/jpg;base64,(.*)',url.replace('%0A',''))
        hq = content.group(1)
        img = base64.b64decode(hq)
        with open('captcha.jpg', 'wb') as f:
            f.write(img)
            f.close()
            # 自动打开刚获取的验证码
            from PIL import Image
            try:
                img = Image.open('captcha.jpg')
                img.show()
                if type == 'chinese':
                    self.disti_captcha_chinese(browser)
                img.close()
                if type == 'english':
                    self.disti_captcha_english(browser)
            except:
                pass

	# 识别英文验证码
    def disti_captcha_english(self, browser):  
        seq = input('请输入验证码\n>')
        browser.find_element_by_xpath("//input[@name='captcha']").send_keys(seq)
        browser.find_element_by_css_selector(
            ".Button.SignFlow-submitButton").click()

	#识别中文验证码
    def disti_captcha_chinese(self,browser):
        points = [[22.796875, 22], [42.796875, 22], [63.796875, 21], [84.796875, 20], [107.796875, 20],
                  [129.796875, 22], [150.796875, 22]]
        input_points = []
        seq = input('请输入倒立文字位置\n>')
        for i in seq:
            input_points.append(points[int(i) - 1])
        img = browser.find_element_by_css_selector(".Captcha-chineseImg")
        location = img.location
        size = img.size
        x1,y1 = input_points[0][0]+location['x'],input_points[0][1]+location['y']
        x2, y2 = input_points[1][0] + location['x'], input_points[1][1] + location['y']
        ActionChains(browser).move_by_offset(x1, y1).click().perform()
        ActionChains(browser).move_by_offset(x2, y2).click().perform()
        browser.find_element_by_css_selector(
            ".Button.SignFlow-submitButton").click()

	#初次登录用selenium模拟,并获得cookies
    def GetCookies(self):       
        chrome_options = Options()
        chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
        browser = webdriver.Chrome(executable_path='D:\Documents\Downloads\chromedriver_win32\chromedriver.exe',
                                   options=chrome_options)
        # 初次建立连接
        browser.get("https://www.zhihu.com/signin")
        if not self.is_login(browser):
            browser.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper input").send_keys("手机号")
            browser.find_element_by_css_selector(".SignFlow-password input").send_keys("密码")
            browser.find_element_by_css_selector(
                ".Button.SignFlow-submitButton").click()
            ca_type = self.is_captch(browser)
            if ca_type['type'] !="none":
                self.get_captcha(ca_type['type'],ca_type['url'],browser)
            time.sleep(10)
        if  self.is_login(browser):
            cookies = browser.get_cookies()
            browser.quit()
            return cookies

	#获取session
    def get_session(self):  
        s = requests.Session()
        if not os.path.exists('zhihu_session.txt'):   #如果没有session,则创建一个,并且保存到文件中
            s.headers.clear()
            for cookie in self.GetCookies():
                s.cookies.set(cookie['name'], cookie['value'])
            self.save_session(s)
        else:                                   #如果已存在session,则直接加载使用
            s = self.load_session()
        return s

	#开始爬取
    def Crawl(self):    
        s = self.get_session()
        html = s.get(self.home_url).text
        return html

zhihu = Zhihu('https://www.zhihu.com/')
zhihu.Crawl()

中文验证码(倒立文字)似乎不能成功识别,而且这种方式效率也不高,后续再修改与优化。
倒立文字验证码,大家可以参考这位大神的作品 https://github.com/muchrooms/zheye

猜你喜欢

转载自blog.csdn.net/qq_42206477/article/details/86512085