Scrapy: downloading images with custom file names

I recently needed to crawl some images with a spider. At first I used Scrapy's default image pipeline, which is enabled in settings.py:

  # Register Scrapy's built-in ImagesPipeline as an item pipeline.
  ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}

However, the default pipeline does not let you customize the image file name. Looking through the ImagesPipeline source, this is the code that generates the default name:

def file_path(self, request, response=None, info=None):
    """Return the relative storage path for a downloaded image.

    The path is ``full/<sha1>.jpg``, where ``<sha1>`` is the SHA-1 hex digest
    of the image URL.

    ``request`` is normally a ``Request``; any other value is treated as the
    URL itself and triggers a deprecation warning. If ``file_key`` or
    ``image_key`` lacks the ``_base`` marker attribute, a deprecation warning
    is issued and that method's result is returned instead of the hashed
    path. ``response`` and ``info`` are not used by this implementation.
    """
    ## start of deprecation warning block (can be removed in the future)
    def _warn():
        from scrapy.exceptions import ScrapyDeprecationWarning
        import warnings
        warnings.warn('ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                      'please use file_path(request, response=None, info=None) instead',
                      category=ScrapyDeprecationWarning, stacklevel=1)

    # check if called from image_key or file_key with url as first argument
    if not isinstance(request, Request):
        _warn()
        url = request
    else:
        url = request.url

    # detect if file_key() or image_key() methods have been overridden
    if not hasattr(self.file_key, '_base'):
        _warn()
        return self.file_key(url)
    elif not hasattr(self.image_key, '_base'):
        _warn()
        return self.image_key(url)
    ## end of deprecation warning block

    # Default name: hex SHA-1 digest of the URL bytes, always with a .jpg suffix.
    image_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
    return 'full/%s.jpg' % (image_guid)

That is, the default name is hashlib.sha1(to_bytes(url)).hexdigest(), the SHA-1 hash of the image URL. Because each image needs to be associated with other scraped information, there are three possible solutions.

1. Record the hash of the image URL directly alongside the scraped information.
2. Write a custom ImagesPipeline and pass the desired image name in from the Scrapy item.
3. Write a custom ImagesPipeline that generates the image name itself inside the pipeline.

Leaving aside option 1 (and option 3 can be implemented along the same lines as this one), I personally feel option 2 is more extensible. It can be handled much like IMAGES_URLS_FIELD, the setting that names the item field holding the image addresses: configure an IMAGES_NAME_FIELD in the settings, then have the pipeline read the corresponding item field as the image name, so that the image name is controlled from Scrapy.
The specific code is as follows:

1. In the spider, set the name attribute on the ImagenetItem.

# Build the item for the image URL at index i of imgs.
item = ImagenetItem()
url = imgs[i]
item['src'] = [url]  # stored as a one-element list of URLs
# The name is a slice of the URL tail: up to 32 characters, ending 3 before the end.
# NOTE(review): for a URL ending in ".jpg" this keeps the trailing dot,
# giving names like "abc..jpg" once the pipeline appends its suffix; confirm
# whether [-35:-4] was intended.
item['name'] = url[-35: -3]

2. Define the image item in items.py.

class ImagenetItem(scrapy.Item):
    """Scraped image record: the URL(s) to download and the file name to use."""

    # List of image URLs for this item.
    src = scrapy.Field()
    # File name (without extension) under which the image should be stored.
    name = scrapy.Field()

3. In pipelines.py, define a pipeline that inherits from scrapy.pipelines.images.ImagesPipeline.

class PicPipeline(ImagesPipeline):
    """ImagesPipeline variant that names each stored image after an item field.

    The item field to read is selected with the ``IMAGES_NAME_FIELD`` setting,
    resolved the same way the base class resolves its own settings. When an
    item supplies no name, the base class's default naming is used.

    Note: every URL of one item receives the same name, so an item carrying
    several URLs would store them all under the same path.
    """

    # Class-level default used when the IMAGES_NAME_FIELD setting is absent.
    IMAGES_NAME_FIELD = ''

    def __init__(self, store_uri, download_func=None, settings=None):
        """Initialise the base pipeline, then resolve the name-field setting.

        :param store_uri: storage location, forwarded to the base class.
        :param download_func: optional download callable, forwarded unchanged.
        :param settings: a ``Settings`` object, a plain dict, or ``None``.
        """
        # Let ImagesPipeline configure expiry, URL/result fields, minimum
        # dimensions and thumbnails itself. Calling super(ImagesPipeline, ...)
        # would skip that initializer and force this class to keep a copy of
        # its internals in sync by hand.
        super(PicPipeline, self).__init__(store_uri, settings=settings,
                                          download_func=download_func)

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        resolve = functools.partial(self._key_for_pipe,
                                    base_class_name="PicPipeline",
                                    settings=settings)
        self.images_name_field = settings.get(
            resolve('IMAGES_NAME_FIELD'),
            self.IMAGES_NAME_FIELD
        )

    def get_media_requests(self, item, info):
        """Return one download ``Request`` per image URL found on *item*.

        The value of the configured name field (``''`` when missing) is copied
        into ``meta['file_name']`` so ``file_path`` can read it back later.
        """
        file_name = item.get(self.images_name_field, '')
        return [Request(url, meta={'file_name': file_name})
                for url in item.get(self.images_urls_field, [])]

    def file_path(self, request, response=None, info=None):
        """Return the relative storage path for a downloaded image.

        Returns ``full/<name>.jpg`` when the request carries a non-empty
        ``meta['file_name']``. Otherwise (bare URL passed by a deprecated
        caller, request built elsewhere, or empty name) the base class's
        default path is returned, so unnamed images neither raise nor all
        collapse onto ``full/.jpg``.
        """
        # A bare URL string has no ``meta``; treat it as "no custom name".
        meta = request.meta if isinstance(request, Request) else {}
        image_name = meta.get('file_name')
        if not image_name:
            return super(PicPipeline, self).file_path(
                request, response=response, info=info)
        # One-element tuple so a tuple-valued name cannot break the formatting.
        return 'full/%s.jpg' % (image_name,)

The pipeline builds on the original ImagesPipeline code. The key points are: an images_name_field attribute is defined and given its value in __init__; get_media_requests passes the name along in the request's meta attribute; and file_path then reads that value back.

4. Configure the pipeline we have just defined in settings.py.

# Route scraped items through the custom image pipeline.
ITEM_PIPELINES = {'tutorial.pipelines.PicPipeline': 1}

# Storage location for the downloaded images.
IMAGES_STORE = 'imagesDownload'
# Item field that holds the image URLs.
IMAGES_URLS_FIELD = 'src'
# Item field that holds the image name; must match the item definition.
IMAGES_NAME_FIELD = 'name'

With this, the item field named by the new IMAGES_NAME_FIELD setting is used as the file name of the downloaded image.

scrapy custom name to download pictures

Guess you like