Crawling Zhihu user info with Scrapy

Create a virtual environment

mkvirtualenv zhihu

Create a project

pip install scrapy 
scrapy startproject Zhihu
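
startproject generates the standard Scrapy layout (per Scrapy's default template):

Zhihu/
    scrapy.cfg
    Zhihu/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py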

Initialize git repository

git init 

Add .gitignore file:

# .gitignore
/.idea
*.pyc
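
Then stage and commit the initial project:

git add .
git commit -m "init scrapy project"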

Create a spider

scrapy genspider zhihu zhihu.com
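
genspider writes Zhihu/spiders/zhihu.py from Scrapy's default basic template; depending on your Scrapy version it looks roughly like this:

# -*- coding: utf-8 -*-
import scrapy


class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['zhihu.com']
    start_urls = ['http://zhihu.com/']

    def parse(self, response):
        pass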

Coding

First, try running the spider:

scrapy crawl zhihu

It should run through without errors.

Next, tell Scrapy not to obey the robots protocol:

# Zhihu/settings.py
# Obey robots.txt rules
ROBOTSTXT_OBEY = False

A quick test shows that requesting the Zhihu API directly returns a 400 response, meaning the request is rejected:

import requests as req
from fake_useragent import UserAgent

ua = UserAgent()

headers = {"User-Agent": ua.random}

url = "https://www.zhihu.com/api/v4/members/traderusingpython/activities?limit=7&session_id=1204109748246929408&after_id=1581262642&desktop=True"

ret = req.get(url, headers=headers)

print(ret)  # <Response [400]>

Set default request headers for the whole project; the authorization header is what gets the API requests accepted:

# Zhihu/settings.py
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
}
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}
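
With these headers the same API request should go through; a quick sanity check, assuming Zhihu still accepts the public oauth token above:

import requests as req

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
    "authorization": "oauth c3cef7c66a1843f8b3a9e6a1e3160e20",
}

url = "https://www.zhihu.com/api/v4/members/traderusingpython?include=follower_count"

ret = req.get(url, headers=headers)
print(ret)  # expect <Response [200]> instead of the 400 seen earlier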

Crawling user information

First fetch the current user's profile, then parse out the lists of people they follow and of their followers; every user found there gets the same treatment in turn, so profiles, followees, and followers are crawled recursively. Start with a minimal spider that fetches and parses a single profile:

# -*- coding: utf-8 -*-
import json

import scrapy
from scrapy import Request
from Zhihu.items import UserItem


class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['zhihu.com']
    user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
    start_user = 'traderusingpython'
    user_query = 'locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,answer_count,articles_count,pins_count,question_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_force_renamed,is_bind_sina,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics'

    def start_requests(self):
        yield Request(self.user_url.format(user=self.start_user, include=self.user_query), self.parse_user)

    def parse_user(self, response):
        result = json.loads(response.text)
        item = UserItem()

        for field in item.fields:
            if field in result:
                item[field] = result.get(field)
        yield item

Then extend the spider to also request each user's followees and followers, and to page through both lists:

# -*- coding: utf-8 -*-
import json

import scrapy
from scrapy import Request
from Zhihu.items import UserItem


class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['zhihu.com']
    user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
    start_user = 'traderusingpython'
    user_query = 'locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,answer_count,articles_count,pins_count,question_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_force_renamed,is_bind_sina,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics'
    follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
    followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
    follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
    followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'

    def start_requests(self):
        yield Request(self.user_url.format(user=self.start_user, include=self.user_query), self.parse_user)

    def parse_user(self, response):
        result = json.loads(response.text)
        item = UserItem()

        for field in item.fields:
            if field in result:
                item[field] = result.get(field)
        yield item

        current_user = result.get("url_token")

        # Request the list of users the current user follows
        yield Request(
            self.follows_url.format(user=current_user, include=self.follows_query, limit=20, offset=0),
            self.parse_follows)

        # Request the current user's followers
        yield Request(
            self.followers_url.format(user=current_user, include=self.followers_query, limit=20, offset=0),
            self.parse_followers)

    def parse_follows(self, response):
        # Parse the followees of the current user
        results = json.loads(response.text)

        if 'data' in results:
            for result in results.get('data'):
                yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
                              self.parse_user)

        # Page through the list of followees
        if 'paging' in results and not results.get('paging').get('is_end'):
            next_page = results.get('paging').get('next')
            yield Request(next_page, self.parse_follows)

    def parse_followers(self, response):
        # Parse the followers of the current user
        results = json.loads(response.text)

        if 'data' in results:
            for result in results.get('data'):
                yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
                              self.parse_user)

        # Page through the list of followers
        if 'paging' in results and not results.get('paging').get('is_end'):
            next_page = results.get('paging').get('next')
            yield Request(next_page, self.parse_followers)
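
The spider imports UserItem from Zhihu/items.py, which the post doesn't show. A minimal sketch declaring url_token plus a few of the fields requested in user_query (extend it with whichever fields you want to keep):

# Zhihu/items.py
# -*- coding: utf-8 -*-
import scrapy


class UserItem(scrapy.Item):
    # parse_user only copies fields that are declared here
    url_token = scrapy.Field()
    gender = scrapy.Field()
    description = scrapy.Field()
    follower_count = scrapy.Field()
    following_count = scrapy.Field()
    answer_count = scrapy.Field()
    articles_count = scrapy.Field()
    voteup_count = scrapy.Field()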

Store the data in MongoDB

# Zhihu/pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo


class ZhihuPipeline(object):
    def process_item(self, item, spider):
        return item


class MongoPipeline(object):
    collection_name = 'users'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Upsert the user record into MongoDB, keyed by url_token
        self.db[self.collection_name].update_one(
            {'url_token': item['url_token']}, {'$set': dict(item)}, upsert=True)
        return item

Enable the pipeline and configure the storage

# Zhihu/settings.py
ITEM_PIPELINES = {
    'Zhihu.pipelines.MongoPipeline': 300,
    # 'scrapy_redis.pipelines.RedisPipeline': 301
}

MONGO_URI = 'localhost'
MONGO_DATABASE = 'zhihu'
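
After running scrapy crawl zhihu for a while, the users collection fills up. A quick way to inspect it from a Python shell, assuming MongoDB is running locally as configured above:

import pymongo

client = pymongo.MongoClient('localhost')
db = client['zhihu']

# How many users have been stored so far
print(db['users'].count_documents({}))

# Peek at a few records
for user in db['users'].find().limit(3):
    print(user.get('url_token'), user.get('follower_count'))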

Git address

Origin blog.csdn.net/Enjolras_fuu/article/details/104287756