Scrapy 登录知乎

参考 https://github.com/zkqiang/Zhihu-Login
# -*- coding: utf-8 -*-
import scrapy

import time
import re
import base64
import hmac
import hashlib
import json
import matplotlib.pyplot as plt
from PIL import Image


class ZhihuSpider(scrapy.Spider):
    """Spider that signs in to Zhihu through its OAuth API before crawling.

    Request flow:

    1. ``start_requests`` asks the captcha endpoint whether a captcha is
       required.
    2. If it is, ``_is_need_captcha`` re-requests the same URL with PUT,
       ``_get_captcha`` shows the image to the operator and POSTs the answer,
       and ``captcha_login`` submits the sign-in form with that answer.
       Otherwise the sign-in form is submitted with an empty captcha.
    3. ``check_login`` then requests ``login_url`` and ``parse`` prints the
       body of that page.
    """

    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']

    login_url = 'https://www.zhihu.com/signup'
    login_api = 'https://www.zhihu.com/api/v3/oauth/sign_in'
    # Captcha endpoint; ``%s`` is replaced by the captcha language.
    captcha_api = 'https://www.zhihu.com/api/v3/oauth/captcha?lang=%s'
    # Base sign-in form. It is treated as read-only: per-request fields
    # (captcha, timestamp, signature) are added to a copy, never to this
    # class-level dict, so state does not leak between requests/instances.
    login_data = {
        'client_id': 'c3cef7c66a1843f8b3a9e6a1e3160e20',
        'grant_type': 'password',
        'source': 'com.zhihu.web',
        'username': "+86xxxxxx",
        'password': "xxxxxx",
        # Passing 'cn' selects the upside-down Chinese character captcha.
        'lang': 'en',
        'ref_source': 'homepage'
    }
    headers = {
        'Connection': 'keep-alive',
        'Host': 'www.zhihu.com',
        'Referer': 'https://www.zhihu.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                        'AppleWebKit/537.36 (KHTML, like Gecko) '
                        'Chrome/69.0.3497.100 Safari/537.36'
    }

    def start_requests(self):
        """Begin the crawl by querying the captcha endpoint.

        Any ``lang`` value other than ``'cn'`` is requested as ``'en'``.
        """
        lang = 'cn' if self.login_data["lang"] == 'cn' else 'en'
        yield scrapy.Request(url=self.captcha_api % lang,
                             headers=self.headers,
                             callback=self._is_need_captcha)

    @staticmethod
    def _captcha_required(body):
        """Return True when the captcha endpoint body asks for a captcha.

        :param body: response text of the captcha endpoint
        :return: bool
        """
        try:
            payload = json.loads(body)
        except ValueError:
            payload = None
        # NOTE(review): presumably the endpoint answers with
        # {"show_captcha": <bool>} -- confirm against a live response.
        if isinstance(payload, dict) and 'show_captcha' in payload:
            return bool(payload['show_captcha'])
        # Unknown payload shape: keep the historical heuristic of looking
        # for the word "true" anywhere in the body.
        return re.search(r'true', body) is not None

    def _is_need_captcha(self, response):
        """Fetch the captcha image if one is required, else sign in directly."""
        if self._captcha_required(response.text):
            # PUT on the same URL returns the captcha image payload that
            # ``_get_captcha`` decodes.
            yield scrapy.Request(url=response.url,
                                 headers=self.headers,
                                 method="PUT",
                                 callback=self._get_captcha)
        else:
            yield self._build_login_request("")

    def _get_captcha(self, response):
        """Show the captcha to the operator and submit the answer.

        The base64 image from the response is written to ``./captcha.jpg``.
        For ``lang == 'cn'`` the operator clicks points on a matplotlib
        window; otherwise the image is opened in a viewer and the text is
        read from stdin. Both interactions block until the operator answers.
        """
        json_data = json.loads(response.text)
        # Drop literal backslash-n sequences (two characters) from the
        # base64 payload before decoding.
        img_base64 = json_data['img_base64'].replace(r'\n', '')
        with open('./captcha.jpg', 'wb') as f:
            f.write(base64.b64decode(img_base64))
        # Context manager so the image file handle is released afterwards.
        with Image.open('./captcha.jpg') as img:
            if self.login_data["lang"] == 'cn':
                plt.imshow(img)
                print('点击所有倒立的汉字，按回车提交')
                points = plt.ginput(7)
                # Clicked coordinates are halved and reported together with
                # a fixed image size of 200x44.
                capt = json.dumps({'img_size': [200, 44],
                                   'input_points': [[i[0] / 2, i[1] / 2] for i in points]})
            else:
                img.show()
                capt = input('请输入图片里的验证码：')
        # The answer must be POSTed to the captcha endpoint first; the same
        # value is carried in ``meta`` for the sign-in request that follows.
        yield scrapy.FormRequest(url=response.url,
                                 formdata={'input_text': capt},
                                 headers=self.headers,
                                 callback=self.captcha_login,
                                 meta={"captcha": capt})

    def _build_login_request(self, captcha):
        """Build the signed sign-in POST without mutating ``login_data``.

        :param captcha: captcha answer, or an empty string when none is needed
        :return: ``scrapy.FormRequest`` whose callback is ``check_login``
        """
        timestamp = str(int(time.time() * 1000))
        formdata = dict(self.login_data)
        formdata.update({
            'captcha': captcha,
            'timestamp': timestamp,
            'signature': self._get_signature(timestamp)
        })
        return scrapy.FormRequest(
            url=self.login_api,
            formdata=formdata,
            headers=self.headers,
            callback=self.check_login
        )

    def captcha_login(self, response):
        """Submit the sign-in form using the captcha answer stored in meta."""
        yield self._build_login_request(response.meta['captcha'])

    def check_login(self, response):
        """Request ``login_url`` after the sign-in call returns.

        NOTE(review): the sign-in response itself is not inspected here, so
        a rejected login is not detected at this step.
        """
        yield scrapy.Request(
            url=self.login_url,
            headers=self.headers,
            callback=self.parse
        )

    def _get_signature(self, timestamp):
        """
        Compute the request signature with HMAC-SHA1.

        The signed message is the concatenation of ``grant_type``,
        ``client_id`` and ``source`` from ``login_data`` followed by the
        timestamp, keyed with a fixed secret.

        :param timestamp: timestamp string (milliseconds, as sent in the form)
        :return: hexadecimal signature string
        """
        ha = hmac.new(b'd1b964811afb40118a12068ff74a12f4', digestmod=hashlib.sha1)
        message = ''.join((
            self.login_data['grant_type'],
            self.login_data['client_id'],
            self.login_data['source'],
            timestamp,
        ))
        ha.update(message.encode('utf-8'))
        return ha.hexdigest()

    def parse(self, response):
        """Print the body of the page fetched after signing in."""
        print(response.text)
猜你喜欢