scrapy download images

This information comes from the Internet; I am only reposting it. I no longer remember the original link — search Baidu if you need the source.

---- create project

# scrapy startproject MyPicSpider

# cd MyPicSpider

-----Create your own spider in the spiders folder of the project

 

 

 

MySpider.py is created

----- set settings.py

## set USER_AGENT

### Set a single USER_AGENT = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)"

 

###Set multiple

USER_AGENT = [ "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)","Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", ]

### After defining multiple user agents, rotate them at random by adding a downloader middleware

In middlewares.py, add the following class at the end (it needs `import random` at the top of the file):

class RandomUserAgent(object):
    """Downloader middleware that assigns a random User-Agent to each request.

    Enable it in settings.py under DOWNLOADER_MIDDLEWARES and define the
    user-agent list there (either ``USER_AGENTS`` or ``USER_AGENT``).
    """

    def __init__(self, agents):
        # List of user-agent strings to choose from on each request.
        self.agents = agents

    @classmethod
    def from_crawler(cls, crawler):
        # Accept either setting name: this tutorial's settings.py defines
        # USER_AGENT (a list), while the original snippet read USER_AGENTS —
        # the mismatch left the middleware with an empty list.
        agents = (crawler.settings.getlist('USER_AGENTS')
                  or crawler.settings.getlist('USER_AGENT'))
        return cls(agents)

    def process_request(self, request, spider):
        # Local import keeps the snippet self-contained; the original relied
        # on an `import random` the reader had to add at the top of the file.
        import random
        # Guard against an empty/missing agent list (random.choice would
        # raise IndexError); setdefault keeps any User-Agent that was set
        # explicitly on the request.
        if self.agents:
            request.headers.setdefault('User-Agent',
                                       random.choice(self.agents))

  

#### Restrict crawling scope: only fetch content that robots.txt allows

ROBOTSTXT_OBEY = True

#### Disable cookies: COOKIES_ENABLED = False

####Open ITEM_PIPELINES -----Use the default ImagesPipeline, you can use ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}

-----Rewrite this class yourself ITEM_PIPELINES = {'project name.pipelines.MyImagesPipeline': 1}

####Set download path IMAGES_STORE='G:\\www\\scrapy_rpo\\pic'

When downloading, Scrapy creates a "full" subfolder inside the storage folder and saves the images there; handling the extra directory levels is left for a later article.

#### Filter images by size: any image smaller than these width/height minimums is skipped

IMAGES_MIN_HEIGHT = 110 IMAGES_MIN_WIDTH = 110

####Generate Thumbnails IMAGES_THUMBS = { 'small': (50, 50), 'big': (270, 270), }

#### Modify pipelines.py

---------Use the default way to write nothing

--------Rewrite this class, mainly used to add new functions, see your own choice

import scrapy
# Correct module path is scrapy.pipelines.images (plural "pipelines"),
# matching the ITEM_PIPELINES entry 'scrapy.pipelines.images.ImagesPipeline'
# used earlier in this tutorial; the original 'scrapy.pipeline.images'
# does not exist and fails to import.
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem


class MyImagesPipeline(ImagesPipeline):
    """ImagesPipeline subclass that adds a Referer header to each image
    download and records where the downloaded files were stored."""

    def get_media_requests(self, item, info):
        # One download request per image URL. Some image hosts reject
        # requests without a Referer, so it is taken from the item.
        # NOTE(review): item['header_referer'] is not declared as a field
        # in PicspiderItem below — confirm the item class defines it.
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url,
                                 headers={'Referer': item['header_referer']})

    def item_completed(self, results, item, info):
        # `results` is a list of (success, info_or_failure) tuples;
        # keep the storage path of every successful download.
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item

####Modify items.py, set fields

import scrapy
class PicspiderItem(scrapy.Item):
    """Item carrying image URLs into the ImagesPipeline.

    ``image_urls`` is the list the pipeline downloads; ``images`` and
    ``image_paths`` are filled in with the download results.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    tag = scrapy.Field()    # article title extracted in parse_item
    image_urls = scrapy.Field() ## image URLs for the ImagesPipeline to fetch
    images = scrapy.Field()  # download results populated by the pipeline
    # storage paths recorded by MyImagesPipeline.item_completed
    image_paths = scrapy.Field()  
    # NOTE(review): the custom MyImagesPipeline above also reads
    # item['header_referer'], which is not declared here — confirm.

 

####MySpider.py Get the image url path and pass in

def parse_item(self, response):
    """Spider callback: collect image URLs from one article page.

    Returns a PicspiderItem whose ``image_urls`` field the
    ImagesPipeline reads to download the pictures.

    Fixes over the original snippet: Python 2 ``print`` statements
    replaced, unreachable code after ``return`` removed, and the
    article title — which was extracted but never stored — is now
    assigned to ``item['tag']`` (the dead code showed that intent).
    """
    item = PicspiderItem()
    # Article title; extract() returns a list, possibly empty.
    tag = response.xpath('//h1[@class="articleV4Tit"]/text()').extract()
    if tag:
        item['tag'] = tag[0]
    # src attribute of the main picture link(s) on the page.
    item['image_urls'] = response.xpath(
        '//*[@id="picBody"]/p/a[1]/img/@src').extract()
    return item

#### run it

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=324575109&siteId=291194637