Create a virtual environment
mkvirtualenv zhihu
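The mkvirtualenv command comes from virtualenvwrapper; if it is not available, install it first (the path to the activation script varies by system):

pip install virtualenvwrapper
source virtualenvwrapper.sh  # often located via `which virtualenvwrapper.sh`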
Create a project
pip install scrapy
scrapy startproject Zhihu
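startproject generates the standard Scrapy skeleton (the exact files vary slightly by Scrapy version); the files we will touch are settings.py, items.py, and pipelines.py:

Zhihu/
├── scrapy.cfg
└── Zhihu/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── __init__.py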
Initialize a git repository
git init
Add a .gitignore file:
# .gitignore
/.idea
*.pyc
Create a spider
scrapy genspider zhihu zhihu.com
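genspider writes a stub to Zhihu/spiders/zhihu.py that looks roughly like this (the template varies slightly between Scrapy versions):

# Zhihu/spiders/zhihu.py
# -*- coding: utf-8 -*-
import scrapy


class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['zhihu.com']
    start_urls = ['http://zhihu.com/']

    def parse(self, response):
        pass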
Coding
First, we try to run:
scrapy crawl zhihu
It should run to completion without errors.
Next, configure the spider to ignore the robots.txt protocol, since Zhihu's robots.txt would otherwise block these API requests:
# Zhihu/settings.py
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
A quick test shows that requesting the Zhihu API directly returns a 400, meaning the request is rejected as anomalous:
import requests as req
from fake_useragent import UserAgent

ua = UserAgent()
# A random User-Agent alone is not enough -- the API still rejects the request
headers = {"User-Agent": ua.random}
url = "https://www.zhihu.com/api/v4/members/traderusingpython/activities?limit=7&session_id=1204109748246929408&after_id=1581262642&desktop=True"
ret = req.get(url, headers=headers)
print(ret)
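Running this prints <Response [400]>: the interface rejects requests that lack Zhihu's authorization header, which is what we set globally next.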
Set default request headers for the whole project so every request carries the authorization token:
# Zhihu/settings.py
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
}
Crawling user information
First, fetch the starting user's profile and parse it. Then extract the lists of users they follow (followees) and users who follow them (followers), fetch each of those users' profiles, and recurse, so the crawl spreads out across the follow graph.
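The spider imports UserItem from Zhihu/items.py, which this walkthrough never shows. A minimal sketch, assuming the fields mirror the keys we care about from the API response (the field subset is an assumption; extend it with any key from user_query):

# Zhihu/items.py
import scrapy


class UserItem(scrapy.Item):
    # Field names must match keys in the JSON the API returns;
    # url_token is required because the pipeline keys on it.
    id = scrapy.Field()
    url_token = scrapy.Field()
    name = scrapy.Field()
    gender = scrapy.Field()
    headline = scrapy.Field()
    description = scrapy.Field()
    follower_count = scrapy.Field()
    following_count = scrapy.Field()
    answer_count = scrapy.Field()
    articles_count = scrapy.Field()
    voteup_count = scrapy.Field()
    employments = scrapy.Field()
    educations = scrapy.Field()
    locations = scrapy.Field()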
# -*- coding: utf-8 -*-
import json

import scrapy
from scrapy import Request

from Zhihu.items import UserItem


class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['zhihu.com']
    user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
    start_user = 'traderusingpython'
    user_query = 'locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,answer_count,articles_count,pins_count,question_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_force_renamed,is_bind_sina,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics'

    def start_requests(self):
        yield Request(self.user_url.format(user=self.start_user, include=self.user_query), self.parse_user)

    def parse_user(self, response):
        result = json.loads(response.text)
        item = UserItem()
        # Copy every field the item declares from the JSON response
        for field in item.fields:
            if field in result.keys():
                item[field] = result.get(field)
        yield item
Now extend the spider: after yielding the user item, also request that user's followees and followers, parse each page of results, follow the pagination links, and feed every discovered user back into parse_user:

# -*- coding: utf-8 -*-
import json

import scrapy
from scrapy import Request

from Zhihu.items import UserItem


class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['zhihu.com']
    user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
    start_user = 'traderusingpython'
    user_query = 'locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,answer_count,articles_count,pins_count,question_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_force_renamed,is_bind_sina,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics'
    follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
    followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
    follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
    followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'

    def start_requests(self):
        yield Request(self.user_url.format(user=self.start_user, include=self.user_query), self.parse_user)

    def parse_user(self, response):
        result = json.loads(response.text)
        item = UserItem()
        for field in item.fields:
            if field in result.keys():
                item[field] = result.get(field)
        yield item
        current_user = result.get("url_token")
        # Request the users the current user follows
        yield Request(
            self.follows_url.format(user=current_user, include=self.follows_query, limit=20, offset=0),
            self.parse_follows)
        # Request the current user's followers
        yield Request(
            self.followers_url.format(user=current_user, include=self.followers_query, limit=20, offset=0),
            self.parse_followers)

    def parse_follows(self, response):
        # Parse the current user's followees
        results = json.loads(response.text)
        if 'data' in results:
            for result in results.get('data'):
                yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
                              self.parse_user)
        # Page through the followee list
        if 'paging' in results and not results['paging'].get('is_end'):
            next_page = results['paging'].get('next')
            yield Request(next_page, self.parse_follows)

    def parse_followers(self, response):
        # Parse the current user's followers
        results = json.loads(response.text)
        if 'data' in results:
            for result in results.get('data'):
                yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
                              self.parse_user)
        # Page through the follower list
        if 'paging' in results and not results['paging'].get('is_end'):
            next_page = results['paging'].get('next')
            yield Request(next_page, self.parse_followers)
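For reference, each followee/follower response pairs a data list with a paging object along these lines (an illustrative shape based on the keys the parsers read, not a verbatim capture):

# Shape of one followees/followers response page:
response_shape = {
    "data": [
        {"url_token": "some-user", "answer_count": 0, "follower_count": 0},
    ],
    "paging": {
        "is_end": False,  # True on the last page, which stops the recursion
        "next": "https://www.zhihu.com/api/v4/members/<user>/followees?offset=20&limit=20",
    },
}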
Store the data in MongoDB
# Zhihu/pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo


class ZhihuPipeline(object):
    def process_item(self, item, spider):
        return item


class MongoPipeline(object):
    collection_name = 'users'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection settings from Zhihu/settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Upsert the user record keyed on url_token (replace_one is the
        # non-deprecated equivalent of the old update(..., upsert) call)
        self.db[self.collection_name].replace_one(
            {'url_token': item['url_token']}, dict(item), upsert=True)
        return item
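Keying the upsert on url_token means a user reached through multiple follow chains is stored once and simply refreshed on each re-crawl, rather than duplicated.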
Modify the storage configuration
# Zhihu/settings.py
ITEM_PIPELINES = {
    'Zhihu.pipelines.MongoPipeline': 300,
    # 'scrapy_redis.pipelines.RedisPipeline': 301
}
MONGO_URI = 'localhost'
MONGO_DATABASE = 'zhihu'
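Finally, run the spider again:

scrapy crawl zhihu

Crawled users should now accumulate in the users collection of the zhihu database; with a local MongoDB instance you can verify with db.users.count() in the mongo shell.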