python 爬虫学习笔记（2）

目标：爬取知乎
代码：
# -*- coding: utf-8 -*-
__author__ = 'beauty'
# import sys
# reload(sys)
# sys.setdefaultencoding("utf-8")
import sys
# NOTE(review): this rebinds the builtin name `type` at module level, and the
# value is not read anywhere in the visible code — confirm it is still needed.
type = sys.getfilesystemencoding()
'''
作者：liuzhijun
微信： lzjun567
公众号：Python之禅（id：VTtalk）
'''
import time
from http import cookiejar
import requests
from BeautifulSoup import BeautifulSoup
#
# HTTP headers passed to every request made through the session; the
# User-Agent is a desktop Chrome string.
headers = {
    'Host': 'www.zhihu.com',
    'Referer': 'https://www.zhihu.com/',
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/56.0.2924.87'
    ),
}

# Reuse login cookies stored in cookies.txt by an earlier run, if there are any.
session = requests.session()
session.cookies = cookiejar.LWPCookieJar(filename='cookies.txt')
try:
    session.cookies.load(ignore_discard=True)
except (IOError, cookiejar.LoadError):
    # The cookie file is missing or unreadable (e.g. first run): continue with
    # an empty jar. Only I/O-type failures are caught so that unrelated errors
    # are no longer hidden by a bare ``except``.
    # The u-prefixed literal prints correctly on Python 2 and 3; the former
    # ``print(...).decode('utf-8')`` fails on Python 3, where print() returns
    # None.
    print(u"还没有cookie信息")
def get_xsrf():
    """Fetch the Zhihu home page and return its ``_xsrf`` form token.

    Returns:
        The ``value`` attribute of the ``<input name="_xsrf">`` element.

    Raises:
        RuntimeError: If the page contains no ``_xsrf`` input. Previously this
            surfaced as an opaque ``AttributeError`` on ``None``.
    """
    response = session.get("https://www.zhihu.com", headers=headers)
    soup = BeautifulSoup(response.content, smartQuotesTo="html")
    token_input = soup.find('input', attrs={"name": "_xsrf"})
    if token_input is None:
        # find() yields None when nothing matches; fail with a message that
        # says what was actually missing.
        raise RuntimeError("_xsrf input not found in https://www.zhihu.com response")
    return token_input.get("value")


def get_captcha():
    """Download the login captcha image and ask the user to type it in.

    The image is written to ``captcha.jpg`` in the current working directory
    so that a human can open it and read the code.

    Returns:
        The captcha text entered at the prompt.
    """
    # Millisecond timestamp sent as the ``r`` query parameter.
    timestamp_ms = str(int(time.time() * 1000))
    captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + timestamp_ms + "&type=login"
    image_response = session.get(captcha_url, headers=headers)
    # NOTE: the endpoint is named captcha.gif, but the bytes are stored under
    # a .jpg file name.
    with open('captcha.jpg', 'wb') as image_file:
        image_file.write(image_response.content)
    # raw_input exists only on Python 2; on Python 3 the equivalent is input.
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    return read_line(u"验证码：")


def login(account, password):
    """Post the login form to Zhihu and persist the resulting cookies.

    Prints the ``msg`` field of the JSON reply and every cookie held by the
    session, then writes the cookie jar to disk.

    Args:
        account: Value sent as the ``account`` form field of the e-mail login
            endpoint.
        password: Value sent as the ``password`` form field.
    """
    login_url = 'https://www.zhihu.com/login/email'
    # The token is fetched before the captcha, as in the original dict literal.
    form = {
        'account': account,
        'password': password,
        '_xsrf': get_xsrf(),
        "captcha": get_captcha(),
        'remember_me': 'true',
    }
    response = session.post(login_url, data=form, headers=headers)
    result = response.json()
    print(result['msg'])
    for cookie in session.cookies:
        print(cookie)
    # save() skips cookies marked to be discarded unless told otherwise; keep
    # them so that the module-level load(ignore_discard=True) can restore the
    # same set on the next run.
    session.cookies.save(ignore_discard=True)


if __name__ == '__main__':
    # Placeholder credentials; replace them with a real account before running.
    account, password = "youraccout", "yourpassword"
    login(account, password)
结果：登录过于频繁，请稍后重试
PS：不知道是哪里出了问题，日后再试
python 爬虫学习笔记（2）

猜你喜欢