Python crawler mini-cases (6): Scrapy, part 2

1. Crawling Movie Paradise data

1.1 File structure

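For reference, this is the standard layout generated by scrapy startproject scrapy_movie_099 (a reconstruction, since the original screenshot did not survive; the spider file movie.py lives under spiders/):

scrapy_movie_099/
    scrapy.cfg
    scrapy_movie_099/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            movie.py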

1.2 Code

movie.py

import scrapy

from scrapy_movie_099.items import ScrapyMovie099Item


class MoiveSpider(scrapy.Spider):
    name = 'moive'
    allowed_domains = ['www.ygdy8.net']
    start_urls = ['https://www.ygdy8.net/html/gndy/china/index.html']

    def parse(self, response):
        # List-page XPath: //div[@class="co_content8"]//td[2]//a[2]
        a_list = response.xpath('//div[@class="co_content8"]//td[2]//a[2]')
        # a_list.extract() returns the whole list of matches;
        # a_list.extract_first() returns only the first element.
        for a in a_list:
            # Get each movie's name and detail-page link from the list page
            name = a.xpath('./text()').extract_first()
            href = a.xpath('./@href').extract_first()

            # Build the address of the detail (second-level) page
            url = 'https://www.ygdy8.net' + href

            # Request the detail page, passing the name along in meta
            yield scrapy.Request(url=url, callback=self.parse_second, meta={'name': name})

    def parse_second(self, response):
        # Note: if no data comes back, always check that the XPath is correct.
        # Detail-page XPath: //div[@id="Zoom"]//img/@src
        src = response.xpath('//div[@id="Zoom"]//img/@src').extract_first()

        # Read back the meta value attached to the request above
        name = response.meta['name']

        movie = ScrapyMovie099Item(src=src, name=name)
        yield movie
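
To run the spider, use Scrapy's CLI from the project root; note that the registered spider name is 'moive', keeping the original post's spelling:

scrapy crawl moive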

items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ScrapyMovie099Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    src = scrapy.Field()
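
A scrapy.Item behaves like a dict, which is why the spider can build it with keyword arguments and the pipeline can read its fields back. A minimal standalone sketch (the values here are placeholders, not real crawl output):

from scrapy_movie_099.items import ScrapyMovie099Item

movie = ScrapyMovie099Item(name='some movie', src='https://example.com/poster.jpg')
print(movie['name'])  # dict-style field access
print(dict(movie))    # convert to a plain dict, e.g. for serialization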

pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


import json

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class ScrapyMovie099Pipeline:
    def open_spider(self, spider):
        self.fp = open('movie.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Serialize each item as one JSON line; writing str(item), as the
        # original post did, would produce a Python repr, not valid JSON.
        self.fp.write(json.dumps(ItemAdapter(item).asdict(), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.fp.close()
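
As a side note, for a simple dump like this Scrapy's built-in feed exports can replace the custom pipeline entirely; this is standard Scrapy CLI behavior, not something the original post uses:

scrapy crawl moive -O movie.json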

settings.py

Open it and register the pipeline:

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'scrapy_movie_099.pipelines.ScrapyMovie099Pipeline': 300,
}
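
Two more settings are often needed for a site like this to respond at all; treat them as assumptions, since the original post only shows the pipeline entry:

# Assumed additions, not shown in the original post:
ROBOTSTXT_OBEY = False  # the site's robots.txt may otherwise block the crawl
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'  # a browser-like UA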


Original post: blog.csdn.net/guoguozgw/article/details/128856698