使用Scrapy递归爬取知乎用户信息
分析知乎网页的具体过程就不一一道来了。
1,因为知乎会检查User-Agent,所以首先在scrapy的settings.py 中,设置ROBOTSTXT_OBEY = False,然后取消DEFAULT_REQUEST_HEADERS的注释,在其中加入User-Agent头部信息;测试后发现仍然不行,还需要加入authorization参数。这个值可以在浏览器的请求头中复制,目测要很长一段时间才会过期,所以复制过来可以直接用。
# Default headers sent with every request (settings.py).
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    # Zhihu rejects Scrapy's default UA, so impersonate a real browser.
    "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    # OAuth token copied from a browser request; required by the v4 API.
    # NOTE(review): presumably long-lived, but it will eventually expire.
    "authorization":"oauth c3cef7c66a1843f8b3a9e6a1e3160e20",
}
2,分析:
既然要爬取信息,首先我们选择一位知乎关注者跟粉丝比较多的用户,然后写第一个爬虫parse_user去爬取他的信息同时回调给第二个爬虫parse_follows and 第三个爬虫parse_followers来递归爬取对应的用户。
再然后写第二个爬虫parse_follows去爬取他的关注的用户列表,同时回调给第一个爬虫parse_user去抓取他的信息。然后写第三个爬虫parse_followers去爬取他的粉丝用户列表,同时回调给第一个爬虫parse_user去抓取他的信息
总共三个解析函数,每一个包含两个功能 :
- parse_user:抓取信息,同时通过回调来遍历该用户的 关注 跟 粉丝
- parse_follows:获取一个关注列表,并迭代来抓取每一个用户的信息;实现关注列表的翻页。
- parse_followers:获取粉丝列表,并迭代来抓取每一个用户的信息;实现粉丝列表的翻页。
分析url可知,所有的链接都基于用户参数中的 url_token ,每一个用户的这个参数不一样,可以在data中看到。
这里我选用知乎 路人甲 来作为例子。他的 url_token 是 sgai。
分析粉丝链接,关注者链接,个人信息的链接得出一个规律,可以实现对应url 的拼接。
如:
# 个人详细信息url
user_url = "https://www.zhihu.com/api/v4/members/{user_url_token}?include={include}"
user_query = "allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics"
# 他所关注的用户
follows_url = "https://www.zhihu.com/api/v4/members/{user_url_token}/followees?include={include}&offset={offset}&limit={limit}"
follows_query = "data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics"
# 他的粉丝
followers_url = "https://www.zhihu.com/api/v4/members/{user_url_token}/followers?include={include}&offset={offset}&limit={limit}"
followers_query = "data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics"
对应的query 就是请求的 Form Data 中的params参数,因此我们可以对其用format()进行拼接。
下面是爬虫部分代码:
#zhihu.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request,Spider
import json
from ..items import *
class ZhihuSpider(scrapy.Spider):
    """Recursively crawl Zhihu user profiles through the v4 members API.

    Starting from one seed user, the spider fetches the profile, the
    followee list and the follower list, then recurses into every user
    found in either list. All API urls are keyed on a user's ``url_token``.
    """
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']
    # Seed user ("路人甲").
    start_user_url_token = "sgai"
    # Profile detail endpoint.
    user_url = "https://www.zhihu.com/api/v4/members/{user_url_token}?include={include}"
    user_query = "allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics"
    # Users this user follows (followees).
    follows_url = "https://www.zhihu.com/api/v4/members/{user_url_token}/followees?include={include}&offset={offset}&limit={limit}"
    follows_query = "data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics"
    # Users following this user (followers).
    followers_url = "https://www.zhihu.com/api/v4/members/{user_url_token}/followers?include={include}&offset={offset}&limit={limit}"
    followers_query = "data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics"

    def start_requests(self):
        """Kick off with the seed user's profile, followees and followers."""
        token = self.start_user_url_token
        yield Request(url=self.user_url.format(user_url_token=token, include=self.user_query),
                      callback=self.parse_user)
        yield Request(url=self.follows_url.format(user_url_token=token, include=self.follows_query,
                                                  offset=0, limit=20),
                      callback=self.parse_follows)
        yield Request(url=self.followers_url.format(user_url_token=token, include=self.followers_query,
                                                    offset=0, limit=20),
                      callback=self.parse_followers)

    def parse_user(self, response):
        """Yield a UserItem for one profile, then recurse into both lists."""
        result = json.loads(response.text)
        item = UserItem()
        for field in item.fields:
            if field in result:
                item[field] = result[field]
        yield item
        token = result.get("url_token")
        # Recurse into this user's followee list ...
        yield Request(url=self.follows_url.format(user_url_token=token, include=self.follows_query,
                                                  offset=0, limit=20),
                      callback=self.parse_follows)
        # ... and into their follower list.
        yield Request(url=self.followers_url.format(user_url_token=token, include=self.followers_query,
                                                    offset=0, limit=20),
                      callback=self.parse_followers)

    def _parse_member_list(self, response, callback):
        """Shared logic for one page of a followee/follower list.

        Yields a profile request per listed user and, when the page is not
        the last one, a request for the next page handled by *callback*.
        """
        result = json.loads(response.text)
        for user in result.get("data", []):
            yield Request(url=self.user_url.format(user_url_token=user.get("url_token"),
                                                   include=self.user_query),
                          callback=self.parse_user)
        # BUG FIX: the original reused the name ``result`` as the loop
        # variable above, so its pagination check inspected the last user
        # dict (which has no "paging" key) and never followed the next page.
        paging = result.get("paging")
        if paging and not paging.get("is_end"):
            yield Request(url=paging.get("next"), callback=callback)

    def parse_follows(self, response):
        """Handle one page of the followee list."""
        yield from self._parse_member_list(response, self.parse_follows)

    def parse_followers(self, response):
        """Handle one page of the follower list."""
        yield from self._parse_member_list(response, self.parse_followers)
在items.py中加入自己想爬取的字段,这里我只爬了部分信息
# -*- coding: utf-8 -*-
from scrapy import Field,Item
class UserItem(Item):
    """Subset of Zhihu profile fields we scrape.

    Field names match the keys of the v4 members API response, so
    ZhihuSpider.parse_user can copy values across by name.
    """
    allow_message = Field()
    answer_count = Field()
    articles_count = Field()
    avatar_url = Field()
    badge = Field()
    employments = Field()
    company = Field()
    follower_count = Field()
    gender = Field()
    headline = Field()
    id = Field()
    is_advertiser = Field()
    is_blocking = Field()
    is_followed = Field()
    is_following = Field()
    is_org = Field()
    name = Field()
    type = Field()
    url = Field()
    # Unique slug used to build every API url for this user.
    url_token = Field()
    user_type = Field()
在pipeline 中对item 的数据进行处理,保存为文件或者数据库
# -*- coding: utf-8 -*-
# 保存为json文件
import json
class UserPipeline(object):
    """Append each scraped item to ``items.json``, one JSON object per line."""

    def open_spider(self, spider):
        # One shared handle for the spider's lifetime, closed in close_spider.
        self.file = open('items.json', 'w', encoding="utf8")

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        """Serialize *item* as a JSON line and pass it on unchanged."""
        # BUG FIX: ensure_ascii=False keeps Chinese names/headlines readable
        # in the file; the default escaped them to \uXXXX even though the
        # file is deliberately opened as utf-8.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item
# 保存到数据库
import pymongo
class MongoPipeline(object):
    """Upsert each scraped item into the ``user`` collection in MongoDB."""

    def __init__(self, mongo_url, mongo_db):
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from MONGO_URL / MONGO_DATABASE in settings.py."""
        return cls(
            mongo_url=crawler.settings.get('MONGO_URL'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_url)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        """Upsert *item* keyed on url_token so re-crawls update in place."""
        # BUG FIX: Collection.update() is deprecated (removed in pymongo 4);
        # use update_one(..., upsert=True). dict(item) ensures a plain dict
        # is sent, since a scrapy Item is not BSON-encodable as-is.
        self.db["user"].update_one(
            {"url_token": item["url_token"]},
            {"$set": dict(item)},
            upsert=True,
        )
        return item
pipelines.py 的相关参数,在settings.py 中给出,便于程序管理:
# settings.py
# MongoDB connection settings consumed by MongoPipeline.from_crawler.
MONGO_URL = "localhost"
MONGO_DATABASE = "zhihu"
最后,在settings.py 中配置相应的 pipeline,运行程序即可