使用Scrapy递归爬取知乎用户信息
分析知乎网页的具体过程就不一一道来了。
1,因为知乎会检查User-Agent,所以首先在scrapy的settings.py 中,设置ROBOTSTXT_OBEY = False,然后取消DEFAULT_REQUEST_HEADERS的注释,在其中加入User-Agent头部信息;测试后发现仍然不行,还需要加入authorization参数。这个值可以在浏览器的请求头中复制,目测要很长一段时间才会过期,所以复制过来可以直接用。
# Default headers sent with every request (settings.py).
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    # Zhihu rejects Scrapy's default UA, so impersonate a real browser.
    "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    # OAuth token copied from a browser request; required by the v4 API.
    # NOTE(review): presumably long-lived, but it will eventually expire.
    "authorization":"oauth c3cef7c66a1843f8b3a9e6a1e3160e20",
}
2,分析:
既然要爬取信息,首先我们选择一位知乎关注者跟粉丝比较多的用户,然后写第一个爬虫parse_user去爬取他的信息同时回调给第二个爬虫parse_follows and 第三个爬虫parse_followers来递归爬取对应的用户。
再然后写第二个爬虫parse_follows去爬取他的关注的用户列表,同时回调给第一个爬虫parse_user去抓取他的信息。然后写第三个爬虫parse_followers去爬取他的粉丝用户列表,同时回调给第一个爬虫parse_user去抓取他的信息
总共三个解析函数,每一个包含两个功能 :
- parse_user:抓取信息,同时通过回调来遍历该用户的 关注 跟 粉丝
- parse_follows:获取一个关注列表,并迭代来抓取每一个用户的信息;实现关注列表的翻页。
- parse_followers:获取粉丝列表,并迭代来抓取每一个用户的信息;实现粉丝列表的翻页。
分析url可知,所有的链接都基于用户参数中的 url_token ,每一个用户的这个参数不一样,可以在data中看到。
这里我选用知乎 路人甲 来作为例子。他的 url_token 是 sgai。
分析粉丝链接,关注者链接,个人信息的链接得出一个规律,可以实现对应url 的拼接。
如:
# 个人详细信息url
user_url = "https://www.zhihu.com/api/v4/members/{user_url_token}?include={include}"
user_query = "allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics"
# 他所关注的用户
follows_url = "https://www.zhihu.com/api/v4/members/{user_url_token}/followees?include={include}&offset={offset}&limit={limit}"
follows_query = "data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics"
# 他的粉丝
followers_url = "https://www.zhihu.com/api/v4/members/{user_url_token}/followers?include={include}&offset={offset}&limit={limit}"
followers_query = "data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics"
对应的query 就是请求的 Form Data 中的params参数,因此我们可以对其用format()进行拼接。
下面是爬虫部分代码:
#zhihu.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request,Spider
import json
from ..items import *
class ZhihuSpider(scrapy.Spider):
    """Recursively crawl Zhihu user profiles through the v4 members API.

    Starting from one seed user, the spider fetches the profile, the
    followee list and the follower list, then recurses into every user
    found in either list. All API urls are keyed on a user's ``url_token``.
    """
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']
    # Seed user ("路人甲").
    start_user_url_token = "sgai"
    # Profile detail endpoint.
    user_url = "https://www.zhihu.com/api/v4/members/{user_url_token}?include={include}"
    user_query = "allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics"
    # Users this user follows (followees).
    follows_url = "https://www.zhihu.com/api/v4/members/{user_url_token}/followees?include={include}&offset={offset}&limit={limit}"
    follows_query = "data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics"
    # Users following this user (followers).
    followers_url = "https://www.zhihu.com/api/v4/members/{user_url_token}/followers?include={include}&offset={offset}&limit={limit}"
    followers_query = "data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics"

    def start_requests(self):
        """Kick off with the seed user's profile, followees and followers."""
        token = self.start_user_url_token
        yield Request(url=self.user_url.format(user_url_token=token, include=self.user_query),
                      callback=self.parse_user)
        yield Request(url=self.follows_url.format(user_url_token=token, include=self.follows_query,
                                                  offset=0, limit=20),
                      callback=self.parse_follows)
        yield Request(url=self.followers_url.format(user_url_token=token, include=self.followers_query,
                                                    offset=0, limit=20),
                      callback=self.parse_followers)

    def parse_user(self, response):
        """Yield a UserItem for one profile, then recurse into both lists."""
        result = json.loads(response.text)
        item = UserItem()
        for field in item.fields:
            if field in result:
                item[field] = result[field]
        yield item
        token = result.get("url_token")
        # Recurse into this user's followee list ...
        yield Request(url=self.follows_url.format(user_url_token=token, include=self.follows_query,
                                                  offset=0, limit=20),
                      callback=self.parse_follows)
        # ... and into their follower list.
        yield Request(url=self.followers_url.format(user_url_token=token, include=self.followers_query,
                                                    offset=0, limit=20),
                      callback=self.parse_followers)

    def _parse_member_list(self, response, callback):
        """Shared logic for one page of a followee/follower list.

        Yields a profile request per listed user and, when the page is not
        the last one, a request for the next page handled by *callback*.
        """
        result = json.loads(response.text)
        for user in result.get("data", []):
            yield Request(url=self.user_url.format(user_url_token=user.get("url_token"),
                                                   include=self.user_query),
                          callback=self.parse_user)
        # BUG FIX: the original reused the name ``result`` as the loop
        # variable above, so its pagination check inspected the last user
        # dict (which has no "paging" key) and never followed the next page.
        paging = result.get("paging")
        if paging and not paging.get("is_end"):
            yield Request(url=paging.get("next"), callback=callback)

    def parse_follows(self, response):
        """Handle one page of the followee list."""
        yield from self._parse_member_list(response, self.parse_follows)

    def parse_followers(self, response):
        """Handle one page of the follower list."""
        yield from self._parse_member_list(response, self.parse_followers)
在items.py中加入自己想爬取的字段,这里我只爬了部分信息
# -*- coding: utf-8 -*-
from scrapy import Field,Item
class UserItem(Item):
    """Subset of Zhihu profile fields we scrape.

    Field names match the keys of the v4 members API response, so
    ZhihuSpider.parse_user can copy values across by name.
    """
    allow_message = Field()
    answer_count = Field()
    articles_count = Field()
    avatar_url = Field()
    badge = Field()
    employments = Field()
    company = Field()
    follower_count = Field()
    gender = Field()
    headline = Field()
    id = Field()
    is_advertiser = Field()
    is_blocking = Field()
    is_followed = Field()
    is_following = Field()
    is_org = Field()
    name = Field()
    type = Field()
    url = Field()
    # Unique slug used to build every API url for this user.
    url_token = Field()
    user_type = Field()
在pipeline 中对item 的数据进行处理,保存为文件或者数据库
# -*- coding: utf-8 -*-
# 保存为json文件
import json
class UserPipeline(object):
    """Append each scraped item to ``items.json``, one JSON object per line."""

    def open_spider(self, spider):
        # One shared handle for the spider's lifetime, closed in close_spider.
        self.file = open('items.json', 'w', encoding="utf8")

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        """Serialize *item* as a JSON line and pass it on unchanged."""
        # BUG FIX: ensure_ascii=False keeps Chinese names/headlines readable
        # in the file; the default escaped them to \uXXXX even though the
        # file is deliberately opened as utf-8.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item
# 保存到数据库
import pymongo
class MongoPipeline(object):
    """Upsert each scraped item into the ``user`` collection in MongoDB."""

    def __init__(self, mongo_url, mongo_db):
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from MONGO_URL / MONGO_DATABASE in settings.py."""
        return cls(
            mongo_url=crawler.settings.get('MONGO_URL'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_url)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        """Upsert *item* keyed on url_token so re-crawls update in place."""
        # BUG FIX: Collection.update() is deprecated (removed in pymongo 4);
        # use update_one(..., upsert=True). dict(item) ensures a plain dict
        # is sent, since a scrapy Item is not BSON-encodable as-is.
        self.db["user"].update_one(
            {"url_token": item["url_token"]},
            {"$set": dict(item)},
            upsert=True,
        )
        return item
pipelines.py 的相关参数,在settings.py 中给出,便于程序管理:
# settings.py
# MongoDB connection settings consumed by MongoPipeline.from_crawler.
MONGO_URL = "localhost"
MONGO_DATABASE = "zhihu"
最后,在settings.py 中配置相应的 pipeline,运行程序即可