Crawling Zhihu user information with Scrapy

Create a virtual environment

mkvirtualenv zhihu

Create a project

pip install scrapy 
scrapy startproject Zhihu

Initialize a git repository

git init 

Add a .gitignore file:

# .gitignore
/.idea
*.pyc

Create a spider

scrapy genspider zhihu zhihu.com
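
This command creates Zhihu/spiders/zhihu.py with a skeleton roughly like the following (the exact template varies by Scrapy version):

# Zhihu/spiders/zhihu.py (generated skeleton)
# -*- coding: utf-8 -*-
import scrapy


class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['zhihu.com']
    start_urls = ['http://zhihu.com/']

    def parse(self, response):
        pass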

Coding

First, try running the spider:

scrapy crawl zhihu

It should run through without errors.

Next, change the setting so the crawler does not obey the robots.txt protocol:

# Zhihu/settings.py
# Obey robots.txt rules
ROBOTSTXT_OBEY = False

A quick test shows that requesting the Zhihu API directly returns a 400, which means the request is treated as invalid when the expected headers are missing:

import requests as req
from fake_useragent import UserAgent

ua = UserAgent()

headers = {"User-Agent": ua.random}

url = "https://www.zhihu.com/api/v4/members/traderusingpython/activities?limit=7&session_id=1204109748246929408&after_id=1581262642&desktop=True"

ret = req.get(url, headers=headers)

print(ret)

Set default request headers for the whole project:

# Zhihu/settings.py
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
}
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}
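
With these headers in place, the same direct request from the earlier test should be accepted. Here is a quick sanity check outside Scrapy, reusing the previous script but sending the spider's default headers (assuming the API endpoint and the oauth token still behave as they did when this was written):

import requests as req

# The same headers configured in DEFAULT_REQUEST_HEADERS
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
    "authorization": "oauth c3cef7c66a1843f8b3a9e6a1e3160e20",
}

url = "https://www.zhihu.com/api/v4/members/traderusingpython/activities?limit=7&session_id=1204109748246929408&after_id=1581262642&desktop=True"

ret = req.get(url, headers=headers)
print(ret.status_code)  # should now be 200 instead of 400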

Crawling user information

First, fetch the starting user's profile, then parse the users they follow (followees) and their followers, and then fetch each of those users in turn to parse their profile, followees, and followers.

# -*- coding: utf-8 -*-
import json

import scrapy
from scrapy import Request
from Zhihu.items import UserItem


class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['zhihu.com']
    user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
    start_user = 'traderusingpython'
    user_query = 'locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,answer_count,articles_count,pins_count,question_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_force_renamed,is_bind_sina,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics'

    def start_requests(self):
        yield Request(self.user_url.format(user=self.start_user, include=self.user_query), self.parse_user)

    def parse_user(self, response):
        result = json.loads(response.text)
        item = UserItem()

        for field in item.fields:
            if field in result.keys():
                item[field] = result.get(field)
        yield item
Then extend the spider to also request each user's followees and followers and recurse over them:

# -*- coding: utf-8 -*-
import json

import scrapy
from scrapy import Request
from Zhihu.items import UserItem


class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['zhihu.com']
    user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
    start_user = 'traderusingpython'
    user_query = 'locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,answer_count,articles_count,pins_count,question_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_force_renamed,is_bind_sina,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics'
    follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
    followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
    follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
    followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'

    def start_requests(self):
        yield Request(self.user_url.format(user=self.start_user, include=self.user_query), self.parse_user)

    def parse_user(self, response):
        result = json.loads(response.text)
        item = UserItem()

        for field in item.fields:
            if field in result.keys():
                item[field] = result.get(field)
        yield item

        current_user = result.get("url_token")

        # Request the users the current user follows (followees)
        yield Request(
            self.follows_url.format(user=current_user, include=self.follows_query, limit=20, offset=0),
            self.parse_follows)

        # Request the current user's followers
        yield Request(
            self.followers_url.format(user=current_user, include=self.followers_query, limit=20, offset=0),
            self.parse_followers)

    def parse_follows(self, response):
        # Parse the followee list of the current user
        results = json.loads(response.text)

        if 'data' in results.keys():
            for result in results.get('data'):
                yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
                              self.parse_user)

        # Page through the followee list
        if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
            next_page = results.get('paging').get('next')
            yield Request(next_page,
                          self.parse_follows)

    def parse_followers(self, response):
        # Parse the follower list of the current user
        results = json.loads(response.text)

        if 'data' in results.keys():
            for result in results.get('data'):
                yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
                              self.parse_user)

        # Page through the follower list
        if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
            next_page = results.get('paging').get('next')
            yield Request(next_page,
                          self.parse_followers)
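
The spider imports UserItem from Zhihu/items.py, which is not shown above. A minimal sketch could look like the following, declaring only a subset of the fields requested in user_query (url_token is needed because the MongoDB pipeline below uses it as the unique key; add further fields as required):

# Zhihu/items.py (minimal sketch)
# -*- coding: utf-8 -*-
import scrapy


class UserItem(scrapy.Item):
    # url_token is used as the unique key when storing users
    url_token = scrapy.Field()
    name = scrapy.Field()
    gender = scrapy.Field()
    headline = scrapy.Field()
    description = scrapy.Field()
    answer_count = scrapy.Field()
    articles_count = scrapy.Field()
    follower_count = scrapy.Field()
    following_count = scrapy.Field()
    voteup_count = scrapy.Field()
    locations = scrapy.Field()
    employments = scrapy.Field()
    educations = scrapy.Field()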

Store the data in MongoDB

# Zhihu/pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo


class ZhihuPipeline(object):
    def process_item(self, item, spider):
        return item


class MongoPipeline(object):
    collection_name = 'users'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Upsert the user record into MongoDB, keyed by url_token
        self.db[self.collection_name].update_one(
            {'url_token': item['url_token']}, {'$set': dict(item)}, upsert=True)
        return item

Modify the storage settings

# Zhihu/settings.py
ITEM_PIPELINES = {
    'Zhihu.pipelines.MongoPipeline': 300,
    # 'scrapy_redis.pipelines.RedisPipeline': 301
}

MONGO_URI = 'localhost'
MONGO_DATABASE = 'zhihu'
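
After the crawl has been running for a while, the collection can be inspected to confirm that users are being upserted (a small check script, assuming MongoDB is running locally with the settings above):

import pymongo

client = pymongo.MongoClient('localhost')
db = client['zhihu']

# Number of distinct users stored so far
print(db['users'].count_documents({}))

# Inspect one stored document
print(db['users'].find_one({}, {'url_token': 1, 'name': 1, 'follower_count': 1}))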

Git repository address



Source: blog.csdn.net/Enjolras_fuu/article/details/104287756