scrapy爬取知乎

版权声明:本文为 [onefine] 原创文章,转载请注明出处: https://blog.csdn.net/jiduochou963/article/details/88360604

1、新建

> scrapy startproject spider_pjt2_zhihu

> cd spider_pjt2_zhihu

> scrapy genspider zhihu www.zhihu.com

2、spider_pjt2_zhihu/utils/ZhihuAccount.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File    : ZhihuAccount.py


from selenium import webdriver
import requests
from time import sleep

try:
    import http.cookiejar as cookielib
except Exception as e:
    print("兼容Py2.x", e)
    import cookielib  # 兼容Py2.x

import os


class ZhihuAccount(object):
    """Manage a logged-in Zhihu session persisted through an LWP cookie file.

    Entry point: ``check_login()``.
        Returns True  -> the saved/obtained cookies represent a valid login.
        Returns False -> login could not be established.

    The instance owns a ``requests`` session whose cookie jar is loaded from
    (and saved back to) ``<project>/cookies/zhihu_cookie.text``.  When the
    cookie file is missing or stale, an interactive Selenium login is used
    to (re)create it.
    """

    def __init__(self):
        # NOTE(review): "brower" is a typo for "browser"; kept as-is so any
        # external code referencing the attribute keeps working.
        self.brower = None
        self.session = requests.session()
        # Cookie file lives one directory above this module, under cookies/.
        self.filename = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
                                     'cookies/zhihu_cookie.text')
        # Fix: make sure the cookies/ directory exists, otherwise
        # LWPCookieJar.save() raises on the very first run.
        os.makedirs(os.path.dirname(self.filename), exist_ok=True)
        self.session.cookies = cookielib.LWPCookieJar(filename=self.filename)
        self.headers = {
            'Referer': 'https://www.zhihu.com/signup?next=%2F',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/72.0.3626.121 Safari/537.36',
        }
        # Try to load cookies; falls back to an interactive login on failure.
        self.load_cookies()

    def login(self, username='', password='', driver_path='D:/selenium/chromedriver.exe'):
        """Log in to Zhihu interactively via Selenium and persist the cookies.

        :param username: account name; prompted on stdin when empty.
        :param password: account password; prompted on stdin when empty.
        :param driver_path: path to the chromedriver executable (new,
            backward-compatible parameter replacing the hard-coded path).
        :return: True on success, False on any failure during the flow.
        """
        if username == '' or password == '':
            username = input('输入名称:')
            password = input('输入密码:')
        self.brower = webdriver.Chrome(executable_path=driver_path)
        self.brower.get('https://www.zhihu.com/signup?next=%2F')
        try:
            # Switch the signup page to the password-login form, then fill it.
            self.brower.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div/div[2]/div[2]/span').click()
            self.brower.find_element_by_xpath('//*[@id="root"]//input[@name="username"]').send_keys(username)
            sleep(2)
            self.brower.find_element_by_xpath('//*[@id="root"]//input[@name="password"]').send_keys(password)

            # Mask navigator.webdriver so basic automation detection passes.
            self.brower.execute_script('Object.defineProperties(navigator,{webdriver:{get:() => false}});')
            self.brower.execute_script('window.navigator.webdriver')

            self.brower.find_element_by_xpath('//*/form/button').click()  # submit the form

            sleep(1)
            # Copy every Selenium cookie into the requests session's jar so it
            # can be saved to disk and reused without a browser.
            for cookie in self.brower.get_cookies():
                self.session.cookies.set_cookie(
                    cookielib.Cookie(version=0, name=cookie['name'], value=cookie['value'],
                                     port='80', port_specified=False, domain=cookie['domain'],
                                     domain_specified=True, domain_initial_dot=False,
                                     path=cookie['path'], path_specified=True,
                                     secure=cookie['secure'], rest={},
                                     expires=cookie['expiry'] if "expiry" in cookie else None,
                                     discard=False, comment=None, comment_url=None, rfc2109=False))

            self.session.cookies.save()
            return True
        except Exception as e_login:
            print("登录失败", e_login)
            return False

    def load_cookies(self):
        """Load cookies from disk; on failure fall back to ``login()``.

        :return: True when cookies were loaded or a fresh login succeeded,
            False otherwise.
        """
        try:
            self.session.cookies.load(ignore_discard=True)
            return True
        except Exception as e_load:
            print("zhihu_cookie未能加载", e_load)
            print("正在重新登录...")
            # First login attempt (cookie file missing or unreadable).
            if self.login():
                print("cookie成功加载")
                return True
            else:
                print("加载cookie失败")
                return False

    def check_login(self):
        """Verify login by probing a page that requires authentication.

        Requests the account-settings page without following redirects; a
        non-200 status means the cookies are stale, in which case one more
        interactive login is attempted.  Always releases the browser and the
        HTTP session before returning.

        :return: True when a valid login is established, False otherwise.
        """
        inbox_url = 'https://www.zhihu.com/settings/account'
        response = self.session.get(inbox_url, headers=self.headers, allow_redirects=False)
        status = True
        if response.status_code != 200:
            # Second login attempt: saved cookies exist but are expired.
            status = self.login()

        # Clean up resources regardless of the outcome.
        if self.brower:
            self.brower.quit()
        self.session.close()

        return status

spider_pjt2_zhihu/middlewares.py

from scrapy.downloadermiddlewares.cookies import CookiesMiddleware
from spider_pjt2_zhihu.utils.ZhihuAccount import ZhihuAccount
import requests
from collections import defaultdict
from scrapy.http.cookies import CookieJar
try:
    import http.cookiejar as cookielib
except Exception as e:
    print("兼容Py2.x", e)
    import cookielib  # 兼容Py2.x

class ZhihuCookiesMiddleware(CookiesMiddleware):
    """Cookies middleware whose 'zhihu' jar is pre-seeded from the saved
    Zhihu login cookies before the first request goes out."""

    def __init__(self, debug=False):
        super().__init__(debug)
        self.load_zhihu_cookies()

    def load_zhihu_cookies(self):
        """Verify the login state, then copy the persisted cookies into the
        Scrapy-side 'zhihu' cookie jar."""
        account = ZhihuAccount()
        if not account.check_login():
            print("登录失败")
            return
        print("登录成功")

        # Build a fresh session backed by the same cookie file the account
        # object just validated, so no exception handling is needed here.
        session = requests.session()
        jar = cookielib.LWPCookieJar(filename=account.filename)
        session.cookies = jar
        jar.load(ignore_discard=True)

        response = session.get('https://www.zhihu.com',
                               headers=account.headers, allow_redirects=False)
        if response.status_code != 200:
            return
        # Pitfall: iterate the SESSION's jar, not the response's cookies.
        for cookie in jar:
            self.jars['zhihu'].set_cookie(cookie)

spider_pjt2_zhihu/settings.py

COOKIES_ENABLED = True

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,  # 关闭
    'spider_pjt2_zhihu.middlewares.ZhihuCookiesMiddleware': 1,
}

spider_pjt2_zhihu/spiders/zhihu.py

# -*- coding: utf-8 -*-
import scrapy

class ZhihuSpider(scrapy.Spider):
    """Fetches the Zhihu front page using the shared 'zhihu' cookie jar
    populated by ZhihuCookiesMiddleware."""

    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    # Not consulted directly: start_requests() below is overridden, but
    # Scrapy still expects the attribute to exist.
    start_urls = ['http://www.zhihu.com/']

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/72.0.3626.121 Safari/537.36',
        }

        super(ZhihuSpider, self).__init__()

    def start_requests(self):
        """Issue the initial requests, tagging each with the 'zhihu' jar."""
        yield from (
            scrapy.Request(url, headers=self.headers, meta={'cookiejar': 'zhihu'})
            for url in self.start_urls
        )

    def parse(self, response):
        """Dump the raw response body for manual inspection."""
        print("body:\n", response.body)

测试

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File    : main.py

from scrapy.cmdline import execute

import sys
import os

# Make the project root importable before handing control to Scrapy.
project_root = os.path.dirname(os.path.abspath(__file__))
sys.path.append(project_root)

# Equivalent to running `scrapy crawl zhihu` on the command line;
# execute() takes the command as a list of arguments.
execute(["scrapy", "crawl", "zhihu"])

进一步操作:设计数据库表

table: zhihu_question
question_id     # 主键,问题编号
question_title  # 问题的标题
created_time    # 创建时间
updated_time    # 更新时间

question_url        # 问题链接
question_topics     # 关键词
question_content    # 问题描述
answer_num          # 回答数量
comments_num        # 评论数,评论问题
watch_user_num      # 关注者数量
click_num           # 被浏览数量
crawl_time          # 初始爬取时间
crawl_update_time   # 最后爬取时间
table: zhihu_answer
answer_id       # 回答id, 作为主键
question_id     # 问题id,作为外键
answer_url      # 回答链接
author_id       # zhihu ID,由于可以匿名回答,可为空
author_name     # 回答者名称,显示名称
answer_content  # 回答内容
voteup_count    # 赞同数
comment_count   # 评论数
create_time     # 回答时间
updated_time    # 最后更新时间
crawl_time          # 初始爬取时间
crawl_update_time   # 最后爬取时间

正则表达式分析

(.*question\/(\d+))(\/|$).*

(原文此处有配图,展示上述正则表达式对知乎问题 URL 的匹配效果,转载时图片未能保留)


参考:

scrapy添加cookie的三种方式 https://blog.csdn.net/qq_40655579/article/details/85126064
scrapy中如何设置应用cookies https://blog.csdn.net/fuck487/article/details/84617194
scrapy在中间件携带cookie发送请求 https://blog.csdn.net/qq_42336549/article/details/80991814
Scrapy源码注解–CookiesMiddleware https://www.cnblogs.com/thunderLL/p/8060279.html

猜你喜欢

转载自blog.csdn.net/jiduochou963/article/details/88360604