1. Crawling Movie Paradise (ygdy8.net) Data
1.1 File structure
1.2 Code
movie.py
import scrapy
from scrapy_movie_099.items import ScrapyMovie099Item
class MoiveSpider(scrapy.Spider):
    """Crawl the "china" movie listing on ygdy8.net.

    For each movie link on the listing page, follow its detail page and
    yield a ScrapyMovie099Item carrying the movie name (from the listing)
    and the poster image URL (from the detail page).

    NOTE(review): the spider/class name keeps the original 'moive'
    misspelling so existing `scrapy crawl moive` invocations still work.
    """

    name = 'moive'
    allowed_domains = ['www.ygdy8.net']
    start_urls = ['https://www.ygdy8.net/html/gndy/china/index.html']

    def parse(self, response):
        """Parse the listing page and request each movie's detail page."""
        # Second <a> in the second cell of each listing row is the movie link.
        a_list = response.xpath('//div[@class="co_content8"]//td[2]//a[2]')
        for a in a_list:
            # Movie title (link text) and site-relative detail-page link.
            name = a.xpath('./text()').extract_first()
            href = a.xpath('./@href').extract_first()
            if href is None:
                # Skip anchors without an href instead of crashing on
                # the string concatenation below.
                continue
            # Detail links are site-relative; build the absolute URL.
            url = 'https://www.ygdy8.net' + href
            # Follow the detail page, forwarding the title via meta.
            yield scrapy.Request(url=url, callback=self.parse_second,
                                 meta={'name': name})

    def parse_second(self, response):
        """Parse a movie detail page and yield the finished item."""
        # If this comes back empty, re-check the XPath expression first.
        src = response.xpath('//div[@id="Zoom"]//img/@src').extract_first()
        # Title forwarded from parse() through the request meta.
        name = response.meta['name']
        yield ScrapyMovie099Item(src=src, name=name)
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class ScrapyMovie099Item(scrapy.Item):
    """Container for one crawled movie.

    Fields:
        name: movie title, taken from the listing-page link text.
        src:  URL of the poster image found on the detail page.
    """

    name = scrapy.Field()
    src = scrapy.Field()
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
import json

from itemadapter import ItemAdapter
class ScrapyMovie099Pipeline:
    """Write every scraped item to movie.json, one JSON object per line.

    Must be enabled via ITEM_PIPELINES in settings.py.
    """

    def open_spider(self, spider):
        # One file handle for the whole crawl; closed in close_spider().
        self.fp = open('movie.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # BUG FIX: the original wrote str(item) — a Python repr with no
        # record separators — so movie.json was not valid JSON. Emit one
        # JSON object per line (JSON Lines) instead; ensure_ascii=False
        # keeps Chinese titles readable.
        self.fp.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.fp.close()
settings.py
Open settings.py and enable the item pipeline:
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Enable the JSON-writing pipeline. The value (300) is the pipeline's
# priority in the 0-1000 range; lower-valued pipelines run first.
ITEM_PIPELINES = {
'scrapy_movie_099.pipelines.ScrapyMovie099Pipeline': 300,
}