Python爬虫如何下载高清图片

代码编写

爬虫编写

提取精选图片页面中的套图链接

# Collect the href of every anchor listed under <ul class="content">; each one
# links to an album page.
album_links = response.xpath("//ul[@class='content']/li/a/@href")
detail_urls = album_links.getall()


精选图片页面中下一页的处理

next_page = response.xpath("//div[@class='pageindex']/a[last()-1]/@href").get()
        if next_page:
            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)


从套图页面中提取列表模式的链接

list_pattern = response.xpath("//*[@id='cMode']/div/div[@class='side']/script").get()  # 提取列表模式的URL
        list_pattern = re.findall("/photolist/.*.html", list_pattern)[0]  # 匹配列表模式的url


从列表模式中下载高清大图

category = response.xpath("//div[@class='mini_left']/a[last()-1]/text()").get()
image_urls = response.xpath("//ul[@id='imgList']/li/a/img/@src").getall()
        image_urls = list(map(lambda x: x.replace("t_", ""), image_urls))  # 去除url中的"t_"得到高清大图
        image_urls = list(map(lambda x: response.urljoin(x), image_urls))
        yield CarhomehdItem(category=category, image_urls=image_urls)


编写ItemPipeline保存图片

class ImagePipeline(ImagesPipeline):
    """Image pipeline that files each download under a per-category folder.

    Images end up at ``<IMAGES_STORE>/<category>/<name>`` instead of the
    default ``<IMAGES_STORE>/full/<name>``.
    """

    def get_media_requests(self, item, info):
        """Return the download requests for ``item``, each tagged with the item.

        The item is attached as ``request.item`` so that ``file_path`` can read
        the category even when the framework does not pass ``item`` to it.
        """
        # Materialise first: if the parent ever returns a lazy iterable, the
        # loop below would otherwise exhaust it before it is returned.
        request_objs = list(super(ImagePipeline, self).get_media_requests(item, info))
        for request_obj in request_objs:
            request_obj.item = item
        return request_objs

    def file_path(self, request, response=None, info=None, item=None):
        """Return the storage path for the image, relative to IMAGES_STORE.

        Args:
            request: The download request (tagged by ``get_media_requests``).
            response: The download response, if available.
            info: Pipeline bookkeeping object passed through to the parent.
            item: The item being processed, when the framework supplies it;
                otherwise the item attached to ``request`` is used.

        Returns:
            ``"<category>/<name>"`` when the item has a category, else the
            parent's default path unchanged.
        """
        path = super(ImagePipeline, self).file_path(request, response, info)
        source_item = item if item is not None else getattr(request, "item", None)
        category = source_item.get("category") if source_item is not None else None
        if not category:
            # No category to group by: keep the default location rather than
            # failing while building the path.
            return path
        image_name = str(path).replace("full/", "")
        # Return a path relative to the store. The storage backend joins it to
        # IMAGES_STORE and creates missing directories itself, so prefixing
        # IMAGES_STORE or calling os.mkdir here would be redundant and can
        # produce a doubled path.
        return "{}/{}".format(category, image_name)

猜你喜欢

转载自blog.csdn.net/qwertyuiopasdfgg/article/details/89295703