Create a virtual environment:
mkvirtualenv zhihu
Create the project:
pip install scrapy
scrapy startproject Zhihu
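startproject lays down the standard Scrapy skeleton; the layout (from the stock template, so the exact files may vary slightly with your Scrapy version) looks like:
Zhihu/
    scrapy.cfg
    Zhihu/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py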
Initialize the git repository:
git init
Add a .gitignore file:
# .gitignore
/.idea
*.pyc
Generate the spider file:
scrapy genspider zhihu zhihu.com
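genspider drops a skeleton into Zhihu/spiders/zhihu.py, roughly like this (the exact template depends on the Scrapy version):
# -*- coding: utf-8 -*-
import scrapy

class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['zhihu.com']
    start_urls = ['http://zhihu.com/']

    def parse(self, response):
        pass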
Coding
First, let's try running the spider:
scrapy crawl zhihu
It will not get anywhere yet: Scrapy obeys robots.txt by default, and Zhihu's robots rules block this crawl. Change the setting so the spider no longer obeys the robots protocol:
# Zhihu/settings.py
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
Next, test the API directly: a plain request to a Zhihu endpoint comes back with a 400, which means the request is rejected:
import requests as req
from fake_useragent import UserAgent
ua = UserAgent()
headers = {"User-Agent": ua.random}
url = "https://www.zhihu.com/api/v4/members/traderusingpython/activities?limit=7&session_id=1204109748246929408&after_id=1581262642&desktop=True"
ret = req.get(url, headers=headers)
print(ret)
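For comparison, attaching Zhihu's public OAuth token (the same value we are about to put into the project settings) should turn the 400 into a 200; a quick sketch, using the simpler profile endpoint purely for illustration:
import requests as req
from fake_useragent import UserAgent

ua = UserAgent()
headers = {
    "User-Agent": ua.random,
    # Public web-client token, same value as in DEFAULT_REQUEST_HEADERS below
    "authorization": "oauth c3cef7c66a1843f8b3a9e6a1e3160e20",
}
url = "https://www.zhihu.com/api/v4/members/traderusingpython?include=follower_count"
ret = req.get(url, headers=headers)
print(ret.status_code)  # expect 200 once the token is accepted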
Set the project's default request headers, including the authorization token:
# Zhihu/settings.py
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
}
Crawling user information
First, fetch the current user's profile; then parse the people the user follows and the user's followers, fetch each of their profiles in turn, and keep resolving their followees and followers recursively.
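The spider imports UserItem from Zhihu/items.py, which this walkthrough never shows; a minimal sketch, declaring one Field per key from user_query that we actually want to keep (url_token is required later by the Mongo pipeline; extend the list as needed):
# Zhihu/items.py
import scrapy

class UserItem(scrapy.Item):
    url_token = scrapy.Field()
    name = scrapy.Field()
    gender = scrapy.Field()
    description = scrapy.Field()
    follower_count = scrapy.Field()
    following_count = scrapy.Field()
    answer_count = scrapy.Field()
    articles_count = scrapy.Field()
    locations = scrapy.Field()
    employments = scrapy.Field()
    educations = scrapy.Field()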
# -*- coding: utf-8 -*-
import json
import scrapy
from scrapy import Request
from Zhihu.items import UserItem
class ZhihuSpider(scrapy.Spider):
name = 'zhihu'
allowed_domains = ['zhihu.com']
user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
start_user = 'traderusingpython'
user_query = 'locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,answer_count,articles_count,pins_count,question_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_force_renamed,is_bind_sina,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics'
def start_requests(self):
yield Request(self.user_url.format(user=self.start_user, include=self.user_query), self.parse_user)
def parse_user(self, response):
result = json.loads(response.text)
item = UserItem()
for field in item.fields:
if field in result.keys():
item[field] = result.get(field)
yield item
Next, extend the spider: after parsing a user, it also requests that user's followees and followers and pages through both lists. The full spider:
# -*- coding: utf-8 -*-
import json
import scrapy
from scrapy import Request
from Zhihu.items import UserItem
class ZhihuSpider(scrapy.Spider):
name = 'zhihu'
allowed_domains = ['zhihu.com']
user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
start_user = 'traderusingpython'
user_query = 'locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,answer_count,articles_count,pins_count,question_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_force_renamed,is_bind_sina,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics'
follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
def start_requests(self):
yield Request(self.user_url.format(user=self.start_user, include=self.user_query), self.parse_user)
def parse_user(self, response):
result = json.loads(response.text)
item = UserItem()
for field in item.fields:
if field in result.keys():
item[field] = result.get(field)
yield item
current_user = result.get("url_token")
        # Request the users the current user is following (followees)
yield Request(
self.follows_url.format(user=current_user, include=self.follows_query, limit=20, offset=0),
self.parse_follows)
        # Request the current user's followers
yield Request(
self.followers_url.format(user=current_user, include=self.followers_query, limit=20, offset=0),
self.parse_followers)
def parse_follows(self, response):
        # Parse the current user's followee list
results = json.loads(response.text)
if 'data' in results.keys():
for result in results.get('data'):
yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
self.parse_user)
        # Page through the followee list
        if 'paging' in results.keys() and not results.get('paging').get('is_end'):
next_page = results.get('paging').get('next')
yield Request(next_page,
self.parse_follows)
def parse_followers(self, response):
        # Parse the current user's follower list
results = json.loads(response.text)
if 'data' in results.keys():
for result in results.get('data'):
yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
self.parse_user)
        # Page through the follower list
        if 'paging' in results.keys() and not results.get('paging').get('is_end'):
next_page = results.get('paging').get('next')
yield Request(next_page,
self.parse_followers)
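With the recursion in place, start the crawl again; Scrapy's built-in duplicate filter keeps the spider from re-requesting profile URLs it has already scheduled, so the walk over the follow graph does not loop:
scrapy crawl zhihu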
Storing the data in MongoDB
# Zhihu/pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
class ZhihuPipeline(object):
def process_item(self, item, spider):
return item
class MongoPipeline(object):
collection_name = 'users'
def __init__(self, mongo_uri, mongo_db):
self.mongo_uri = mongo_uri
self.mongo_db = mongo_db
@classmethod
def from_crawler(cls, crawler):
return cls(
mongo_uri=crawler.settings.get('MONGO_URI'),
mongo_db=crawler.settings.get('MONGO_DATABASE')
)
def open_spider(self, spider):
self.client = pymongo.MongoClient(self.mongo_uri)
self.db = self.client[self.mongo_db]
def close_spider(self, spider):
self.client.close()
def process_item(self, item, spider):
        # Upsert the user record into MongoDB, keyed on url_token
        self.db[self.collection_name].update_one(
            {'url_token': item['url_token']}, {'$set': dict(item)}, upsert=True)
return item
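Since process_item looks up every user by url_token, it can be worth creating a unique index once; a one-off sketch with pymongo, assuming the local database configured below:
import pymongo

client = pymongo.MongoClient('localhost')
# A unique index keeps the per-item upsert lookups fast
client['zhihu']['users'].create_index('url_token', unique=True)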
Update the storage configuration:
# Zhihu/settings.py
ITEM_PIPELINES = {
    'Zhihu.pipelines.MongoPipeline': 300,
# 'scrapy_redis.pipelines.RedisPipeline': 301
}
MONGO_URI = 'localhost'
MONGO_DATABASE = 'zhihu'
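To verify that documents are landing in MongoDB, a quick count from a Python shell (assuming the MONGO_URI and MONGO_DATABASE values above):
import pymongo

client = pymongo.MongoClient('localhost')
print(client['zhihu']['users'].count_documents({}))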