Scrapy Framework: Crawling Zhihu


Key points covered in this article

  1. The Scrapy framework
  2. Scraping static and dynamic pages
  3. Storing the results with pymongo

Approach

(The original post illustrates the workflow with a diagram.) In short: start from a seed user's followee/follower API, parse the returned JSON to collect profile URLs, scrape each profile page for basic info, then queue that user's own followee/follower APIs, storing everything in MongoDB.

Code

Note: the source code in the book is too old to run at all; the code below is my own rewrite based on my reading of the author's intent.
Run the following in a terminal:

scrapy startproject zhihuCrawl
cd zhihuCrawl
scrapy genspider -t crawl zhihu.com zhihu.com
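
These commands should leave a project skeleton roughly like the following (a sketch; the exact files depend on your Scrapy version, and genspider turns the spider name zhihu.com into the module zhihu_com.py):

zhihuCrawl/
    scrapy.cfg
    zhihuCrawl/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            zhihu_com.py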

zhihu_com.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider
import json
from zhihuCrawl.items import StaticStateItem, DynamicStateItem
from urllib.parse import urlparse


class ZhihuComSpider(CrawlSpider):
    name = 'zhihu.com'
    allowed_domains = ['zhihu.com']
    headers = {
        'User-Agent': r'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.90 Safari/537.36 2345Explorer/9.3.2.17331',
        'Referer': 'https://www.zhihu.com',
        'Connection': 'keep-alive'
    }
    #start_url='https://www.zhihu.com/people/de-chuan-guang-guo-81/activities'

    def start_requests(self):
        # Seed the crawl with the followee and follower APIs of the user
        # 'mei-ri-ren-wu'; the include parameter requests extra profile fields.
        start_urls = ['https://www.zhihu.com/api/v4/members/mei-ri-ren-wu/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20',
            'https://www.zhihu.com/api/v4/members/mei-ri-ren-wu/followers?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20']
        for start_url in start_urls:
            yield scrapy.Request(url=start_url, headers=self.headers, callback=self.parse_dynamic_state)

    def parse_static_state(self, response):
        # Profile URLs look like /people/<url_token>/activities, so the
        # second-to-last path segment is the user's id.
        user_id = urlparse(response.url).path.split('/')[-2]
        user_image_url = response.xpath('//*[@id="ProfileHeader"]/div/div[2]/div/div[1]/img/@src').extract_first()
        name = response.xpath('//*[@id="ProfileHeader"]/div/div[2]/div/div[2]/div[1]/h1/span[1]/text()').extract_first()
        follows_num = response.xpath('//*[@id="root"]/div/main/div/div[2]/div[2]/div[2]/div/a[1]/div/strong/text()').extract_first()
        followers_num = response.xpath('//*[@id="root"]/div/main/div/div[2]/div[2]/div[2]/div/a[2]/div/strong/text()').extract_first()

        static_state_item = StaticStateItem(user_id=user_id, user_image_url=user_image_url, name=name,
            follows_num=follows_num, followers_num=followers_num)
        yield static_state_item
        # Queue this user's own followee/follower APIs to keep expanding the crawl.
        next_urls = ['https://www.zhihu.com/api/v4/members/'+user_id+'/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20',
            'https://www.zhihu.com/api/v4/members/'+user_id+'/followers?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20']
        for next_url in next_urls:
            yield scrapy.Request(url=next_url, headers=self.headers, callback=self.parse_dynamic_state)

    def parse_dynamic_state(self, response):
        # The followee/follower APIs return JSON: 'data' holds up to 20 users
        # per page, 'paging' holds the pagination info.
        follow = json.loads(response.text)
        for entry in follow.get('data', []):
            try:
                url_token = entry['url_token']
                user_type = entry['user_type']
            except KeyError:
                continue
            if user_type == 'people':
                user_url = 'https://www.zhihu.com/people/' + url_token + '/activities'
            else:
                user_url = 'https://www.zhihu.com/org/' + url_token + '/activities'
            yield DynamicStateItem(user_url=user_url)
            yield scrapy.Request(url=user_url, headers=self.headers, callback=self.parse_static_state)
        # Follow the pagination link until the API reports the end of the list.
        paging = follow.get('paging', {})
        if not paging.get('is_end', True):
            yield scrapy.Request(url=paging['next'], headers=self.headers, callback=self.parse_dynamic_state)
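
For context, parse_dynamic_state expects the followee/follower API to return JSON shaped roughly like this (a trimmed sketch with invented values, showing only the fields the spider actually reads):

# Sketch of the API response after json.loads(); values are invented for illustration.
follow = {
    'data': [
        {'url_token': 'example-user', 'user_type': 'people'},
        {'url_token': 'example-org', 'user_type': 'organization'},
    ],
    'paging': {
        'is_end': False,
        'next': 'https://www.zhihu.com/api/v4/members/mei-ri-ren-wu/followees?offset=20&limit=20',
    },
}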

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class StaticStateItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    user_id = scrapy.Field()
    user_image_url = scrapy.Field()
    name = scrapy.Field()
    follows_num = scrapy.Field()
    followers_num = scrapy.Field()



class DynamicStateItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    user_url = scrapy.Field()

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
from zhihuCrawl.items import StaticStateItem,DynamicStateItem

class ZhihucrawlPipeline(object):
    def __init__(self,mongo_uri,mongo_db):
        self.mongo_uri=mongo_uri
        self.mongo_db=mongo_db

    @classmethod
    def from_crawler(cls,crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE','zhihu')
        )

    def open_spider(self,spider):
        self.client=pymongo.MongoClient(self.mongo_uri)
        self.db=self.client[self.mongo_db]

    def close_spider(self,spider):
        self.client.close()

    def process_item(self, item, spider):
        if isinstance(item,StaticStateItem):
            self._process_static_item(item)
        else:
            self._process_dynamic_item(item)
        return item

    def _process_static_item(self,item):
        self.db.StaticStateInfo.insert_one(dict(item))

    def _process_dynamic_item(self,item):
        self.db.DynamicStateInfo.insert_one(dict(item))
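
To verify that the pipeline is actually writing data, a few lines of pymongo are enough (a minimal sketch assuming the MONGO_URI and MONGO_DATABASE values from settings.py below; the script name check_db.py is hypothetical):

# check_db.py - print how many profiles were stored and show a few of them.
import pymongo

client = pymongo.MongoClient('mongodb://127.0.0.1:27017/')
db = client['zhihu']
print(db.StaticStateInfo.count_documents({}))
for doc in db.StaticStateInfo.find().limit(5):
    print(doc)
client.close()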

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for zhihuCrawl project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'zhihuCrawl'

SPIDER_MODULES = ['zhihuCrawl.spiders']
NEWSPIDER_MODULE = 'zhihuCrawl.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'zhihuCrawl (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.35
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'zhihuCrawl.middlewares.ZhihucrawlSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'zhihuCrawl.middlewares.ZhihucrawlDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
MONGO_URI='mongodb://127.0.0.1:27017/'
MONGO_DATABASE='zhihu'
ITEM_PIPELINES = {
    'zhihuCrawl.pipelines.ZhihucrawlPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
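
With the settings above in place, start the crawl from the project root (the spider name is the one given to genspider earlier):

scrapy crawl zhihu.com

Scraped StaticStateItem and DynamicStateItem records end up in the StaticStateInfo and DynamicStateInfo collections of the zhihu database.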

Limitations:

After crawling for a while Zhihu starts serving CAPTCHAs; if they are not handled promptly the crawler stops making progress.
(Temporary workarounds:
Option 1: when the crawler starts receiving 3XX/4XX status codes, open Zhihu in a browser (no login needed) and complete the verification manually; a sketch of an alert middleware for spotting this follows below.
Option 2: keep Zhihu open in a browser (no login needed), refresh it from time to time, and solve CAPTCHAs whenever they appear.)
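
If you want to be alerted to the 3XX/4XX responses mentioned in Option 1 without staring at the console, a small downloader middleware can log a loud warning (a hypothetical sketch, not part of the original project; add it to middlewares.py and register it in DOWNLOADER_MIDDLEWARES):

# middlewares.py (sketch) - warn loudly when Zhihu stops returning 200s,
# which usually means it is time to solve a CAPTCHA in the browser.
import logging

logger = logging.getLogger(__name__)

class CaptchaAlertMiddleware(object):
    def process_response(self, request, response, spider):
        if response.status >= 300:
            logger.warning('Got HTTP %s for %s - Zhihu may be asking for a CAPTCHA',
                           response.status, response.url)
        return response

# settings.py addition (sketch); a value above 600 lets process_response run
# before the built-in RedirectMiddleware, so 3XX responses are still visible here.
# DOWNLOADER_MIDDLEWARES = {
#     'zhihuCrawl.middlewares.CaptchaAlertMiddleware': 650,
# }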
