Crawling: http://quotes.toscrape.com
Single Page
# -*- coding: utf-8 -*-
import scrapy


class QuoteSpider(scrapy.Spider):
    name = 'quote'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    """
    Knowledge points:
    1. text()          gets an element's text content
    2. @attribute      gets the value of an attribute
    3. extract()       returns every match; extract_first() returns only the first
    """

    def parse(self, response):
        # Each quote sits in its own div.quote inside the main content column.
        quote_nodes = response.xpath('//div[@class="col-md-8"]/div[@class="quote"]')
        for node in quote_nodes:
            print('=' * 20)
            # extract_first(): take just the first matching node.
            text = node.xpath('.//span[@class="text"]/text()').extract_first()
            print(text)
            author = node.xpath('.//span/small[@class="author"]/text()').extract_first()
            print(author)
            # extract(): take every matching node (here: the tag link hrefs).
            tags = node.xpath('.//div[@class="tags"]/a[@class="tag"]/@href').extract()
            print(tags)
All Pages
# -*- coding: utf-8 -*-
import scrapy


class QuoteSpider(scrapy.Spider):
    """Crawl every page of quotes.toscrape.com, printing each quote's text,
    author and tag-link hrefs, then following the "Next" pagination link.

    Knowledge points:
    1. text()          gets an element's text content
    2. @attribute      gets the value of an attribute
    3. extract()       returns every match; extract_first() returns only the first
    4. response.urljoin() joins a relative href onto the response URL
    5. scrapy.Request(url=..., callback=self.parse) schedules the next page
    """

    name = 'quote'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        quotes = response.xpath('//div[@class="col-md-8"]/div[@class="quote"]')
        for quote in quotes:
            print('=' * 20)
            # extract_first(): take just the first matching node.
            text = quote.xpath('.//span[@class="text"]/text()').extract_first()
            print(text)
            author = quote.xpath('.//span/small[@class="author"]/text()').extract_first()
            print(author)
            # extract(): take every matching node (here: the tag link hrefs).
            tags = quote.xpath('.//div[@class="tags"]/a[@class="tag"]/@href').extract()
            print(tags)

        print('>' * 40)
        next_url = response.xpath(
            '//div[@class="col-md-8"]/nav/ul[@class="pager"]/li[@class="next"]/a/@href'
        ).extract_first()
        print(next_url)
        # BUG FIX: the last page has no "Next" link, so next_url is None there.
        # The original code passed None to urljoin()/Request() unconditionally,
        # which fails at the end of the crawl. Only follow a link that exists.
        if next_url:
            _next = response.urljoin(next_url)
            print(_next)
            # callback: re-enter parse() for the next page of results.
            yield scrapy.Request(url=_next, callback=self.parse)