Create the project (steps omitted)
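For reference, a sketch of the standard Scrapy commands for this step, assuming the project name fbsPro and the spider name fbs that appear in the code below:

scrapy startproject fbsPro
cd fbsPro
scrapy genspider -t crawl fbs www.xxx.com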
Add the following in settings.py
# Deduplication filter configuration: use scrapy_redis's filter, which stores request fingerprints in a Redis set so the deduplication data is persisted.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scheduler that comes with the scrapy_redis component.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether the scheduler is persistent, i.e. whether to keep (rather than clear) the request queue and the fingerprint set in Redis when the crawler finishes.
# True means a URL crawled today will not be crawled again tomorrow.
SCHEDULER_PERSIST = True
# Pipeline that saves the scraped items to Redis.
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
# Redis connection used for storage.
REDIS_HOST = "127.0.0.1"
REDIS_PORT = 6379
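The spider below imports FbsproItem from the project's items.py; a minimal sketch of that class, assuming only the title field collected in parse_item:

import scrapy

class FbsproItem(scrapy.Item):
    title = scrapy.Field()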
Write the crawler file
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider
from fbsPro.items import FbsproItem
class FbsSpider(RedisCrawlSpider):
    name = "fbs"
    # allowed_domains = ["www.xxx.com"]
    # start_urls = ["https://www.xxx.com"]
    # Name of the shared Redis list that the scheduler reads start URLs from.
    redis_key = 'sanQuqu'
    rules = (
        Rule(LinkExtractor(allow=r"id=1&page=\d+"), callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        li_list = response.xpath('/html/body/div[2]/div[3]/ul[2]/li')
        for li in li_list:
            title = li.xpath('./span[3]/a/text()').extract_first()
            item = FbsproItem()
            item['title'] = title
            yield item

    # This method needs to be overridden so that URLs taken from Redis are turned into requests.
    def make_requests_from_url(self, url):
        yield scrapy.Request(url=url)
Run the crawler file
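A sketch of this step, assuming the command is run from inside the fbsPro project directory:

scrapy crawl fbs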
After it starts, the crawler does not fetch anything yet; it waits until a start URL is entered manually in the Redis client shell window (see below).
Then open the Redis installation directory.
Open a shell window there (for example, right-click and open a terminal in that folder).
Start the server with the command redis-server.
Open the client with redis-cli.
The key name used here is the redis_key = 'sanQuqu' defined in the spider code above.
In redis-cli, enter lpush sanQuqu followed by the URL to be crawled.
Press Enter, and the crawled data can be seen in PyCharm.
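Because RedisPipeline is enabled in settings.py, scraped items are also stored in Redis. A minimal sketch for inspecting them with the redis-py package, assuming the pipeline's default key name of <spider name>:items (here fbs:items):

import redis

r = redis.Redis(host="127.0.0.1", port=6379)
# RedisPipeline pushes serialized items onto a Redis list; print every stored entry.
for raw_item in r.lrange("fbs:items", 0, -1):
    print(raw_item)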