Scrapy爬取豆瓣电影Top250信息

爬取多页电影信息及简介

  爬取标题、评分、排名、摘要、剧情简介

  使用 Feed exports 保存爬取的数据

代码:

 1 # -*- coding: utf-8 -*-
 2 import scrapy
 3 import re
 4 from ..items import ScrapyDoubanItem
 5 
 6 
 7 class DoubanSpider(scrapy.Spider):
 8     name = 'douban'
 9     # allowed_domains = ['douban.com']
10     start_urls = ['https://movie.douban.com/top250']
11 
12     def parse(self, response):
13         # print(response.url)
14         abstract = response.xpath('//*[@class="inq"]/text()').extract()
15         detail_pages = response.xpath('//div[@class="hd"]/a/@href').extract()
16         # print(detail_pages)
17         for ind,detail_page in enumerate(detail_pages):
18             # print(detail_page)
19             abstract_detail = abstract[ind]
20             yield scrapy.Request(detail_page,callback=self.parse_detail,meta={'abstract_detail':abstract_detail})
21         next_page = response.xpath('//span[@class="next"]/a/@href').extract_first()
22         base_url = 'https://movie.douban.com/top250'
23         # print(title)
24         if next_page:
25             yield scrapy.Request(url=base_url+next_page,callback=self.parse)
26 
27     def parse_detail(self, response):
28         # print(response.url)
29         title = response.xpath('//*[@property="v:itemreviewed"]/text()').extract_first()
30         score = response.xpath('//*[@class="ll rating_num"]/text()').extract_first()
31         rank = response.xpath('//*[@class="top250-no"]/text()').extract()
32         describe = response.xpath('//*[@property="v:summary"]/text()').extract_first()
33         abstract_detail = response.meta['abstract_detail']
34         item = ScrapyDoubanItem()
35         item['title'] = title
36         item['score'] = score
37         item['rank'] = rank
38         item['abstract_detail'] = abstract_detail
39         item['describe'] = describe
40         yield item
41         # print(title, abstract_detail, score, rank)
42         # print(score, abstract_detail)

猜你喜欢

转载自www.cnblogs.com/JinZL/p/11720171.html