Using the Scrapy framework to crawl a wallpaper site and download the images to local files

First we need to decide what content to crawl, so step one is to define the fields we want to scrape in items.py:

import scrapy


class MeizhuoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # gallery (atlas) title
    title = scrapy.Field()
    # list of image URLs that need to be crawled
    url = scrapy.Field()

Once the fields are determined, the next task is to analyze the requested web pages and find the content we need. The wallpaper site I'm crawling here is http://www.win4000.com.

When we analyze the site, we find that the real photos only appear after clicking into a gallery (atlas), so we proceed in two steps:

    1. Find the URLs of all the galleries

    2. For each gallery URL, find the URLs of all the photos inside it

The photo URLs we find this way are not quite what we want, though: the site appends a final layer of decoration to each one, so we have to process these URLs further, as shown in the sketch below.
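
As an illustration, here is a minimal sketch of that cleanup, assuming the lazy-loaded thumbnail URL carries a size suffix after an underscore (the example URL is made up):

# hypothetical thumbnail URL with a trailing size decoration
thumb_url = 'http://pic1.win4000.com/pic/sample_250_350.jpg'
# drop everything from the first underscore on and restore the .jpg extension
real_url = thumb_url.split('_')[0] + '.jpg'
print(real_url)  # http://pic1.win4000.com/pic/sample.jpg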

# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector
from fake_useragent import UserAgent
from meizhuo.items import MeizhuoItem

headers = {
    'user-agent': UserAgent(verify_ssl=False).chrome
}


class MzSpider(scrapy.Spider):
    name = 'mz'
    allowed_domains = ['www.win4000.com']
    start_urls = [
        'http://www.win4000.com/wallpaper_2285_0_0_1.html',
        'http://www.win4000.com/wallpaper_204_0_0_1.html'
    ]

    def parse(self, response):
        sel = Selector(response)
        gallery_list = sel.xpath('//*[@class="list_cont Left_list_cont"]/div/div/div/ul/li/a')

        for img in gallery_list:
            # the URL and title of each gallery
            url = img.xpath('@href').extract_first()
            title = img.xpath('@title').extract_first()
            # parse every gallery URL, carrying the title along in meta
            yield scrapy.Request(url, callback=self.get_all_img, meta={'title': title})

        # locate the next page; if it exists, follow it
        next_url = sel.xpath('//*[@class="next"]/@href').extract_first()
        if next_url is not None:
            yield scrapy.Request(next_url, callback=self.parse)

    def get_all_img(self, response):
        item = MeizhuoItem()
        container = []
        sel = Selector(response)
        # the list that holds all pictures across all pages of the gallery
        img_list = sel.xpath('//*[@class="scroll-img-cont"]/ul')
        for img in img_list:
            img_url = img.xpath('li/a/img/@data-original').extract()
            for url in img_url:
                # these URLs still need processing, so fix them one by one in the loop
                cmp_url = url.split('_')[0] + '.jpg'
                container.append(cmp_url)
            # copy the list so clearing the container below cannot empty the yielded item
            item['url'] = container.copy()
            item['title'] = response.meta['title']
            # print(container)

            yield item
            container.clear()

After collecting the URLs of all the photos in every gallery, what remains is downloading the images locally. For that we configure the download path used by the pipeline, then use the requests module with the crawled URLs to fetch the binary data of each real picture, and finally write that binary data into the folder configured for it.

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import os
import requests
from meizhuo.settings import IMAGES_STORE
from fake_useragent import UserAgent

headers = {
    'user-agent': UserAgent(verify_ssl=False).chrome
}


class MeizhuoPipeline(object):
    def process_item(self, item, spider):
        title = item['title']
        img_list = item['url']
        print(title, img_list)
        """
        1. Create the configured storage path if it does not exist yet
        2. Use requests to fetch the binary data behind each URL and save it
        """
        if not os.path.exists(IMAGES_STORE):
            os.mkdir(IMAGES_STORE)

        # one sub-folder per gallery, named after its title
        collection_url = os.path.join(IMAGES_STORE, title)
        if not os.path.exists(collection_url):
            os.mkdir(collection_url)

        for index, img_url in enumerate(img_list):
            file_path = os.path.join(collection_url, title) + str(index) + '.jpg'
            print(file_path)
            with open(file_path, 'wb') as fp:
                res = requests.get(img_url, headers=headers).content
                fp.write(res)
                print('insert successfully!!!')

        return item

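One thing the pipeline above does not do is check whether a request actually succeeded, so a failed download would still write a broken file. A minimal hardened variant of the download step (my own sketch, not the original author's code; the save_image helper name is made up for illustration):

import requests

def save_image(img_url, file_path, headers):
    # fetch the image and only write it to disk if the server answered 200 OK
    res = requests.get(img_url, headers=headers, timeout=10)
    if res.status_code == 200:
        with open(file_path, 'wb') as fp:
            fp.write(res.content)
        return True
    return False
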
The final step is the relevant configuration in settings.py:

BOT_NAME = 'meizhuo'

SPIDER_MODULES = ['meizhuo.spiders']
NEWSPIDER_MODULE = 'meizhuo.spiders'
ROBOTSTXT_OBEY = True
ITEM_PIPELINES = {
    'meizhuo.pipelines.MeizhuoPipeline': 300,
}
# root directory of the download path
IMAGES_STORE = '/Users/mac/Documents/Girl'
# download delay
DOWNLOAD_DELAY = 0.3
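
With the settings in place, the crawl is started from the project root with the standard Scrapy command (the spider name mz comes from the spider class above):

scrapy crawl mz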
