Simulated login for Python web crawlers

When crawling, some webpages only show their content to logged-in users. In those cases the crawler has to simulate a login.
When I first started with simulated login, I tried to use selenium to simulate clicks, but it was too troublesome. First, each webpage needs its own specific rules. Second, writing those rules takes too long and is too inefficient. Third, there is the verification code (CAPTCHA) to deal with.
So it is simpler and more reliable to log in by sending the POST request directly, as the code below does.

# coding=UTF-8
import re
import requests
from bs4 import BeautifulSoup
import time
import lxml
from PIL import Image
import json
import time
import cookielib
from mycptcha import APIClient
# import http.cookiejar
class Zhihu(object):
    """Log in to zhihu.com through a ``requests`` session and persist cookies.

    The session's cookie jar is an LWP cookie jar backed by the file
    ``cookie`` in the current working directory; it is loaded on
    construction and written back at the end of :meth:`login`.
    """

    def __init__(self):
        """Set up the request headers and session, then load saved cookies."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
            "Host": "www.zhihu.com",
            "Referer": "https://www.zhihu.com/",
        }

        self.session = requests.Session()
        self.session.cookies = cookielib.LWPCookieJar("cookie")
        try:
            self.session.cookies.load(ignore_discard=True)
        except IOError:
            # No usable cookie file yet (e.g. first run): continue logged out.
            print('Cookie未加载!')

    def get_xsrf(self):
        """Fetch the home page and return the ``_xsrf`` token found in it.

        The token has to be sent back as part of the login form data.

        Raises:
            ValueError: if the page contains no ``<input>`` element at all.
        """
        html = self.session.get('https://www.zhihu.com', headers=self.headers).text
        soup = BeautifulSoup(html, 'lxml')
        # Prefer the field actually named "_xsrf"; fall back to the first
        # <input> on the page, which is what this method used to rely on.
        token_input = soup.find('input', attrs={'name': '_xsrf'}) or soup.find('input')
        if token_input is None:
            raise ValueError('no <input> element found on the zhihu home page')
        return token_input.get('value')

    def get_captcha(self):
        """Download the login captcha image and return the recognised text.

        The image is written to ``cptcha.gif`` in the current working
        directory and the answer is obtained from ``APIClient.result()``.
        """
        cli = APIClient()
        # The site expects a millisecond timestamp in the ``r`` parameter.
        timestamp = str(int(time.time() * 1000))
        # HTTPS, consistent with the other endpoints used by this class.
        captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + timestamp + "&type=login"
        response = self.session.get(captcha_url, headers=self.headers)
        with open('cptcha.gif', 'wb') as image_file:
            image_file.write(response.content)

        print('正在为您自动识别并输入验证码...')
        # NOTE(review): presumably APIClient reads cptcha.gif written above --
        # confirm against the mycptcha module.
        return cli.result()

    def login(self, username, password):
        """Log in with a phone number or an e-mail address and save cookies.

        Args:
            username: an 11-digit phone number, or otherwise an e-mail address.
            password: the account password.

        If the first reply has ``r == 1`` (treated here as "captcha
        required"), the request is repeated once with a solved captcha and
        the ``msg`` field of the second reply is printed. The cookie jar is
        saved afterwards in every case.
        """
        # An 11-digit username is sent as ``phone_num``, anything else as
        # ``email``. Both endpoints use HTTPS so the password is never posted
        # in clear text.
        if re.match(r'\d{11}$', username):
            url = 'https://www.zhihu.com/login/phone_num'
            account_field = 'phone_num'
        else:
            url = 'https://www.zhihu.com/login/email'
            account_field = 'email'

        data = {
            '_xsrf': self.get_xsrf(),
            'password': password,
            'remember_me': 'true',
            account_field: username,
        }

        # First attempt, without a captcha.
        response = self.session.post(url, data=data, headers=self.headers)
        result = json.loads(response.text)
        if result['r'] == 1:
            # Retry once with the captcha answer attached.
            data['captcha'] = self.get_captcha()
            response2 = self.session.post(url, data=data, headers=self.headers)
            # Report the outcome of the second attempt.
            print((json.loads(response2.text))['msg'])

        # Persist the cookies so a later run can skip the login.
        self.session.cookies.save(ignore_discard=True, ignore_expires=True)

    def is_login(self):
        """Return True if the stored cookies still give a logged-in session.

        The profile settings page is requested with redirects disabled: a
        logged-out visitor is redirected to the login page, and following
        that redirect would also end in a 200 response.
        """
        url = "https://www.zhihu.com/settings/profile"
        login_code = self.session.get(url, headers=self.headers, allow_redirects=False)
        return login_code.status_code == 200

    def get_session(self):
        """Return the underlying session object."""
        return self.session

if __name__=='__main__':
    # Script entry point: reuse the saved login if it is still valid,
    # otherwise ask for credentials, then print the profile settings page.

    # Local import: only this interactive entry point needs it.
    import getpass

    zhihu = Zhihu()
    if zhihu.is_login():
        print('已经登陆过的')
    else:
        # raw_input is the Python 2 spelling, matching the rest of this file.
        username = raw_input('请输入用户名:')
        # getpass keeps the password from being echoed to the terminal.
        password = getpass.getpass('请输入密码:')
        zhihu.login(username, password)

    # Either way, the session now carries whatever cookies are available.
    session = zhihu.get_session()
    url = "https://www.zhihu.com/settings/profile"
    info = session.get(url, headers=zhihu.headers)
    print(info.text)

Guess you like

Origin blog.csdn.net/mrliqifeng/article/details/78651341