Scrapy框架自定义pipeline两层下载路径去下载图片,关于item传值的问题

版权声明:未经博主同意,禁止转载。谢谢! https://blog.csdn.net/cp_123321/article/details/84675034

自定义两层下载路径时,两层目录名(大分类名、小分类名)需要通过 Request 的 meta 逐层传递,最后写入 item 供 pipeline 使用。爬虫代码如下:

import scrapy
from urllib.request import urljoin
from ..items import OffmymindspiderItem
class OffmymindSpider(scrapy.Spider):
    """Spider that collects image URLs together with two category names.

    The crawl walks three levels: the navigation bar (top-level category),
    each category page (sub-category links) and each sub-category page
    (image URLs).  The two category names are carried from callback to
    callback in ``Request.meta`` and finally copied into the yielded item,
    where a pipeline can use them to build a two-level download path.
    """

    name = 'offmymind'
    allowed_domains = ['www.biaobaiju.com']
    start_urls = ['http://www.biaobaiju.com/']

    def parse(self, response):
        """
        Extract the URL and the name of every top-level category.

        :param response: response for one of ``start_urls``
        :return: generator of requests handled by ``parse_img_type_info``
        """
        for a in response.xpath("//ul[@class='nav clearfix']/li/a"):
            href = a.xpath("@href").extract_first("")
            if not href:
                # A Request with an empty URL raises ValueError and would
                # abort the whole callback, so skip anchors without a link.
                continue
            img_type_name = a.xpath("text()").extract_first("")
            yield scrapy.Request(
                # urljoin leaves absolute URLs alone and resolves relative ones.
                url=response.urljoin(href),
                dont_filter=False,
                callback=self.parse_img_type_info,
                meta={"img_type_name": img_type_name},
            )

    def parse_img_type_info(self, response):
        """
        Parse a category page and follow the link of every image set on it.

        :param response: category page; ``response.meta`` carries
            ``img_type_name`` set by ``parse``
        :return: generator of requests handled by
            ``parse_every_small_type_info``
        """
        # The first-level name must not be lost, so read it from meta and
        # pass it on together with the second-level name.  A fresh dict is
        # built per request instead of mutating response.meta, which belongs
        # to the current request and also holds Scrapy-internal keys.
        img_type_name = response.meta.get("img_type_name")
        for div in response.xpath("//ul[@id='container']/li/div[2]"):
            href = div.xpath("a/@href").extract_first("")
            if not href:
                continue
            img_small_type_name = div.xpath("a/text()").extract_first("")
            yield scrapy.Request(
                url=response.urljoin(href),
                dont_filter=True,
                callback=self.parse_every_small_type_info,
                meta={
                    "img_type_name": img_type_name,
                    "img_small_type_name": img_small_type_name,
                },
            )
        # Pagination is intentionally disabled because there are many pages;
        # only the first page of every category is downloaded.  To enable it,
        # re-enter this same callback for the next page:
        # href = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").extract_first("")
        # if href:
        #     yield scrapy.Request(url=response.urljoin(href), dont_filter=True,
        #                          callback=self.parse_img_type_info,
        #                          meta={"img_type_name": img_type_name})

    def parse_every_small_type_info(self, response):
        """
        Extract the image URLs of one sub-category page (first page only).

        :param response: sub-category page; ``response.meta`` carries
            ``img_type_name`` and ``img_small_type_name``
        :return: generator of ``OffmymindspiderItem`` objects, one per image
        """
        for p in response.xpath("//div[@class='content tag_blue']/p"):
            img_src = p.xpath("img/@src").extract_first("")
            if not img_src:
                # Paragraph without an image.
                continue
            item = OffmymindspiderItem()
            # img_url is a list because the images pipeline iterates over it.
            item["img_url"] = [response.urljoin(img_src)]
            # Both names were stored in meta by the previous callbacks.
            item["img_small_type_name"] = response.meta.get("img_small_type_name")
            item["img_type_name"] = response.meta.get("img_type_name")
            yield item

items.py中的代码

import scrapy
class OffmymindspiderItem(scrapy.Item):
    """Item describing one image and the two directory levels it belongs to.

    The spider imports this class as ``OffmymindspiderItem`` and assigns
    ``img_type_name`` / ``img_small_type_name``; a ``scrapy.Item`` raises
    ``KeyError`` for undeclared fields, so both must be declared here.
    """

    name = scrapy.Field()
    # List of image URLs to download.
    img_url = scrapy.Field()
    # Storage path of the downloaded image, filled in by the pipeline.
    img_path = scrapy.Field()
    # First-level directory: top-level category name.
    img_type_name = scrapy.Field()
    # Second-level directory: sub-category (image set) name.
    img_small_type_name = scrapy.Field()


# Backward-compatible alias for the name this class was originally given.
ZhanzhangsucaispiderItem = OffmymindspiderItem

settings.py中修改的内容

# Line 22 of the generated settings.py: must be changed to False.
ROBOTSTXT_OBEY = False

# Line 67 of the generated settings.py: register the custom images pipeline.
ITEM_PIPELINES = {
    'OffMyMindSpider.pipelines.CustomImagesPipeline': 300,
}

# Root directory for downloaded images (presumably relative to the working
# directory the crawl is started from).
IMAGES_STORE = "imgs"

pipelines.py中的代码

from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
import scrapy
class CustomImagesPipeline(ImagesPipeline):
    """Images pipeline that stores files under ``<type>/<small type>/<file>``.

    The two directory names are read from the item's ``img_type_name`` and
    ``img_small_type_name`` fields.
    """

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item["img_url"]``.

        The item is attached to ``request.meta`` so that ``file_path`` can
        read the directory names from it.
        """
        for img_download_url in item.get("img_url") or []:
            yield scrapy.Request(url=img_download_url, meta={"item": item})

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for the image fetched by ``request``.

        :param request: download request created by ``get_media_requests``
        :param item: item passed by newer Scrapy versions; when absent the
            item stored in ``request.meta`` is used instead
        :return: ``"<img_type_name>/<img_small_type_name>/<file name>"``
        """
        if item is None:
            item = request.meta["item"]
        img_type_name = item["img_type_name"]
        img_small_type_name = item["img_small_type_name"]
        # Name the file after the URL actually being downloaded; using the
        # item's first URL would map every image of the item to one path.
        file_name = request.url.split("/")[-1]
        return "%s/%s/%s" % (img_type_name, img_small_type_name, file_name)

    def item_completed(self, results, item, info):
        """Record the stored path on the item, or drop it if nothing was saved.

        :param results: list of ``(success, info_or_failure)`` tuples, one
            per request yielded by ``get_media_requests``
        :raises DropItem: when no image of the item was downloaded
        """
        # On failure the second element is a Failure object, not a dict, so
        # only successful entries may be indexed by "path".
        img_paths = [result["path"] for ok, result in results if ok]
        if not img_paths:
            raise DropItem("Image download failed, delete the corresponding item value, do not let it return out")
        item["img_path"] = img_paths[0]
        return item

具体内容的注释可以参考链接:https://blog.csdn.net/cp_123321/article/details/84668344

猜你喜欢

转载自blog.csdn.net/cp_123321/article/details/84675034