Key points covered in this post:
- The Scrapy framework
- Scraping static pages + dynamic (API) pages
- Storing results with pymongo
Approach
Start from one seed user and hit the Zhihu v4 members API for that user's followee and follower lists (dynamic JSON). For every user returned, request the profile page (static HTML), extract basic profile fields, store both kinds of records in MongoDB via pymongo, and then recurse into each new user's own followee/follower lists, spreading outward across the social graph.
Code
Note: the source code in the book is so old that it no longer runs at all; the code below is my own reconstruction of what I take the author's intent to be.
Run in a terminal:
scrapy startproject zhihuCrawl
cd zhihuCrawl
scrapy genspider -t crawl zhihu.com zhihu.com
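The crawl template drops the spider into zhihu_com.py; the generated project looks roughly like this (abridged; the exact files vary slightly by Scrapy version):
zhihuCrawl/
    scrapy.cfg
    zhihuCrawl/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            zhihu_com.py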
zhihu_com.py
# -*- coding: utf-8 -*-
import json
from urllib.parse import urlparse

import scrapy
from scrapy.spiders import CrawlSpider

from zhihuCrawl.items import StaticStateItem, DynamicStateItem


class ZhihuComSpider(CrawlSpider):
    name = 'zhihu.com'
    allowed_domains = ['zhihu.com']
    headers = {
        'User-Agent': r'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.90 Safari/537.36 2345Explorer/9.3.2.17331',
        'Referer': 'https://www.zhihu.com',
        'Connection': 'keep-alive'
    }
    # start_url = 'https://www.zhihu.com/people/de-chuan-guang-guo-81/activities'

    def start_requests(self):
        # Seed the crawl with one user's followee and follower lists
        # (the Zhihu v4 members API, 20 users per page).
        start_urls = [
            'https://www.zhihu.com/api/v4/members/mei-ri-ren-wu/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20',
            'https://www.zhihu.com/api/v4/members/mei-ri-ren-wu/followers?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20',
        ]
        for start_url in start_urls:
            yield scrapy.Request(url=start_url, headers=self.headers,
                                 callback=self.parse_dynamic_state)

    def parse_static_state(self, response):
        # Static page: extract profile fields from the rendered HTML.
        user_id = urlparse(response.url).path.split('/')[-2]
        user_image_url = response.xpath('//*[@id="ProfileHeader"]/div/div[2]/div/div[1]/img/@src').extract_first()
        name = response.xpath('//*[@id="ProfileHeader"]/div/div[2]/div/div[2]/div[1]/h1/span[1]/text()').extract_first()
        follows_num = response.xpath('//*[@id="root"]/div/main/div/div[2]/div[2]/div[2]/div/a[1]/div/strong/text()').extract_first()
        followers_num = response.xpath('//*[@id="root"]/div/main/div/div[2]/div[2]/div[2]/div/a[2]/div/strong/text()').extract_first()
        yield StaticStateItem(user_id=user_id, user_image_url=user_image_url, name=name,
                              follows_num=follows_num, followers_num=followers_num)
        # Recurse into this user's own followee/follower lists.
        next_urls = [
            'https://www.zhihu.com/api/v4/members/' + user_id + '/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20',
            'https://www.zhihu.com/api/v4/members/' + user_id + '/followers?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20',
        ]
        for next_url in next_urls:
            yield scrapy.Request(url=next_url, headers=self.headers,
                                 callback=self.parse_dynamic_state)

    def parse_dynamic_state(self, response):
        # Dynamic page: the API answers with JSON, one page of up to 20 users.
        follow = json.loads(response.body)
        for user in follow.get('data', []):
            url_token = user['url_token']
            if user['user_type'] == 'people':
                user_url = 'https://www.zhihu.com/people/' + url_token + '/activities'
            else:
                user_url = 'https://www.zhihu.com/org/' + url_token + '/activities'
            yield DynamicStateItem(user_url=user_url)
            yield scrapy.Request(url=user_url, headers=self.headers,
                                 callback=self.parse_static_state)
        # Follow the API's own pagination link until the listing is exhausted.
        paging = follow.get('paging', {})
        if not paging.get('is_end', True):
            yield scrapy.Request(url=paging['next'], headers=self.headers,
                                 callback=self.parse_dynamic_state)
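For orientation, the JSON handled by parse_dynamic_state looks roughly like this. This is a sketch from memory, trimmed to the fields the spider actually reads; real responses carry many more fields per user because of the include query parameter, and the values here are purely illustrative:

# Trimmed sketch of one followees/followers API response.
follow = {
    'data': [
        {'url_token': 'mei-ri-ren-wu', 'user_type': 'people'},
        {'url_token': 'zhihuadmin', 'user_type': 'organization'},
    ],
    'paging': {
        'is_end': False,
        'next': 'https://www.zhihu.com/api/v4/members/mei-ri-ren-wu/followees?offset=20&limit=20',
    },
}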
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class StaticStateItem(scrapy.Item):
    # Profile fields scraped from a user's (static) profile page.
    user_id = scrapy.Field()
    user_image_url = scrapy.Field()
    name = scrapy.Field()
    follows_num = scrapy.Field()
    followers_num = scrapy.Field()


class DynamicStateItem(scrapy.Item):
    # Profile URL discovered via the (dynamic) followee/follower API.
    user_url = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo

from zhihuCrawl.items import StaticStateItem


class ZhihucrawlPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Pull the MongoDB connection settings from settings.py.
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'zhihu')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        # Close the connection opened in open_spider.
        self.client.close()

    def process_item(self, item, spider):
        # Route each item type to its own collection.
        if isinstance(item, StaticStateItem):
            self._process_static_item(item)
        else:
            self._process_dynamic_item(item)
        return item

    def _process_static_item(self, item):
        self.db.StaticStateInfo.insert_one(dict(item))

    def _process_dynamic_item(self, item):
        self.db.DynamicStateInfo.insert_one(dict(item))
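To sanity-check what the pipeline wrote, something like the following works from a Python shell (count_documents needs pymongo 3.7+; URI, database, and collection names as configured above and in settings.py below):

import pymongo

client = pymongo.MongoClient('mongodb://127.0.0.1:27017/')
db = client['zhihu']

# How many profiles and discovered URLs have been stored so far?
print(db.StaticStateInfo.count_documents({}))
print(db.DynamicStateInfo.count_documents({}))

# Peek at a few stored profiles.
for doc in db.StaticStateInfo.find().limit(5):
    print(doc.get('name'), doc.get('followers_num'))

client.close()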
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for zhihuCrawl project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'zhihuCrawl'
SPIDER_MODULES = ['zhihuCrawl.spiders']
NEWSPIDER_MODULE = 'zhihuCrawl.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'zhihuCrawl (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.35
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = True
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'zhihuCrawl.middlewares.ZhihucrawlSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'zhihuCrawl.middlewares.ZhihucrawlDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
MONGO_URI='mongodb://127.0.0.1:27017/'
MONGO_DATABASE='zhihu'
ITEM_PIPELINES = {
    'zhihuCrawl.pipelines.ZhihucrawlPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
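With MongoDB running locally, start the crawl from the project root:
scrapy crawl zhihu.com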
Known issue:
After the crawler has been running for a while, Zhihu starts serving a captcha; if it is not dealt with promptly, the crawl grinds to a halt.
(Stop-gap workarounds:
Option 1: when the crawler starts seeing 3XX/4XX status codes, open Zhihu in a browser (no login needed) and pass the verification manually.
Option 2: keep Zhihu open in a browser (no login needed), refresh it now and then, and solve captchas whenever they appear.)
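One small way to automate noticing the captcha, rather than watching the log by hand, is my own sketch below (not from the book): let suspicious status codes reach the callback via Scrapy's handle_httpstatus_list and log a loud warning. The specific codes in the list are an assumption about what Zhihu answers with:

from scrapy.spiders import CrawlSpider


class ZhihuComSpider(CrawlSpider):
    name = 'zhihu.com'
    # Hand these responses to the callback instead of letting the
    # redirect/httperror middlewares drop them silently. The exact
    # codes Zhihu uses for the captcha are an assumption here.
    handle_httpstatus_list = [302, 400, 403, 429]

    def parse_dynamic_state(self, response):
        if response.status != 200:
            self.logger.warning(
                'HTTP %s at %s - possible captcha; open zhihu.com '
                'in a browser and verify manually',
                response.status, response.url)
            return
        # ... normal JSON handling as in the spider above ...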