scrapy--- 爬虫框架爬取图片

settings 设置（settings.py）

# Item pipelines enabled for this project (lower number = runs earlier).
# The custom GirlsPipeline does the image download; Scrapy's built-in
# ImagesPipeline is mapped to None so that it stays disabled.
ITEM_PIPELINES = {
    'girls.pipelines.GirlsPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline': None,
}

# Storage directory setting read by Scrapy's ImagesPipeline.
# NOTE(review): with ImagesPipeline disabled above, nothing visible here
# reads this value -- confirm whether it is still needed.
IMAGES_STORE = r'F:\myScrapy\girls\girls\spiders\img'

# Headers attached to every request by default. The User-Agent is a desktop
# Chrome string rather than Scrapy's own identifier.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',  (left disabled: no Accept-Language is set here)
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36',
}

spider
选择器我用的是 CSS 选择器。

# -*- coding: utf-8 -*-
import scrapy
from ..items import *

class GetgirlSpider(scrapy.Spider):
    """Crawl a 2717.com tag listing and yield one ``GirlsItem`` per picture page.

    Flow: ``parse`` (listing page) -> ``get_href`` (gallery page) ->
    ``get_pic`` (single picture page, where the item is produced).
    """

    name = 'getgirl'
    allowed_domains = ['2717.com']
    start_urls = ['https://www.2717.com/tag/1756.html']

    def parse(self, response):
        """Handle a tag listing page.

        Follows every gallery link to ``get_href`` and feeds every link found
        inside ``div.TagPage`` back into ``parse`` (presumably the listing's
        pagination -- confirm against the site markup).
        """
        gallery_links = response.css(
            "ul.w110.oh.Tag_list li>a::attr(href)").getall()
        for href_pic in gallery_links:
            yield response.follow(href_pic, self.get_href)

        listing_links = response.css("div.TagPage li>a::attr(href)").getall()
        for href in listing_links:
            yield response.follow(href, self.parse)

    def get_href(self, response):
        """Handle a gallery page: follow its ``.html`` page links to ``get_pic``."""
        hrefs = response.css("ul.articleV4Page.l li>a::attr(href)").getall()
        for href in hrefs:
            # Only real page links are followed; anything without ".html"
            # in the href is skipped.
            if ".html" in href:
                yield response.follow(href, self.get_pic)

    def get_pic(self, response):
        """Handle a single picture page and yield its image URL and title.

        Pages on which no image is found are skipped (with a warning) instead
        of producing an item whose ``src`` is ``None``, which a downloading
        pipeline could not process.
        """
        src = response.css("p[align=center] img::attr(src)").get()
        if not src:
            self.logger.warning("No image found on %s", response.url)
            return

        item = GirlsItem()
        # Resolve relative / protocol-relative URLs against the page URL so
        # the stored value is always directly downloadable.
        item['src'] = response.urljoin(src)
        item['title'] = response.css("p[align=center] img::attr(alt)").get()
        yield item

items


# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class GirlsItem(scrapy.Item):
    """One scraped picture, as passed from the spider to the item pipelines."""

    # Value of the <img> tag's ``src`` attribute (the image URL).
    src = scrapy.Field()
    # Value of the <img> tag's ``alt`` attribute.
    title = scrapy.Field()
    # NOTE(review): presumably the local path of the saved image; nothing
    # visible here assigns it -- confirm whether it is used.
    img_path = scrapy.Field()

pipelines：这里没有使用 Scrapy 框架默认的图片管道（ImagesPipeline），而是自己写了一个管道来下载图片。

import os
import time
import urllib
import urllib.request

import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
class GirlsPipeline(object):
    """Item pipeline that downloads ``item['src']`` and saves it as a .jpg file.

    Files are written to ``image_dir`` and named after the current Unix
    timestamp (``str(time.time()) + ".jpg"``).

    NOTE(review): the download is a blocking ``urlopen`` call made inside the
    pipeline; this likely stalls the crawl while each image is fetched.
    Scrapy's ImagesPipeline is the usual non-blocking alternative -- confirm
    whether it can be used instead.
    """

    # Destination directory. Kept as a class attribute so that it can be
    # overridden (subclass or instance) without editing process_item.
    image_dir = "F:/myScrapy/girls/girls/spiders/PIC/"

    def process_item(self, item, spider):
        """Download the item's image to disk and pass the item on.

        Args:
            item: mapping-like item carrying the image URL under ``'src'``.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, unchanged.

        Raises:
            DropItem: if the item has no ``src`` or the download fails.
        """
        src = item.get('src')
        if not src:
            raise DropItem("Item has no image URL: %r" % (item,))

        # Fetch the bytes first and only then open the output file, so that a
        # failed download does not leave an empty .jpg behind. The ``with``
        # block closes the response.
        try:
            with urllib.request.urlopen(src) as resp:
                data = resp.read()
        except (OSError, ValueError) as err:
            # OSError covers URLError/HTTPError; ValueError covers bad URLs.
            raise DropItem("Failed to download %s: %s" % (src, err)) from err

        os.makedirs(self.image_dir, exist_ok=True)
        filename = os.path.join(self.image_dir, str(time.time()) + ".jpg")
        with open(filename, "wb") as f:
            f.write(data)
        return item

scrapy--- 爬虫框架爬取图片

猜你喜欢