The first thing to decide is what content we want to crawl, so the first step is to define the fields for it in the Item:
# -*- coding: utf-8 -*-
import scrapy


class MeizhuoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # atlas title
    title = scrapy.Field()
    # urls of the images to crawl
    url = scrapy.Field()
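As a quick aside, a scrapy.Item behaves like a dict restricted to the declared fields, which is how the spider and pipeline below read and write it; a tiny sketch with illustrative values:

item = MeizhuoItem()
item['title'] = 'sample-atlas'                            # illustrative value
item['url'] = ['http://pic1.win4000.com/pic/sample.jpg']  # illustrative value
print(item['title'], len(item['url']))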
Once the fields are settled, the next step is to analyze the target pages and locate the content we need; here I am crawling the wallpaper images on http://www.win4000.com.
When we inspect the site, we find that the real photos only appear after clicking into an atlas, so the crawl takes two steps (a quick way to verify the selectors follows the list):
1. find the URLs of all the atlases;
2. for each atlas URL, find the URLs of all the photos inside it.
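Before writing the spider, it helps to confirm that these two XPath steps actually match something; a minimal sketch, assuming a plain user-agent string is enough for a quick manual check (the listing URL is one of the spider's start pages):

import requests
from scrapy.selector import Selector

headers = {'user-agent': 'Mozilla/5.0'}  # a plain UA for a one-off check
html = requests.get('http://www.win4000.com/wallpaper_204_0_0_1.html',
                    headers=headers).text
sel = Selector(text=html)
# step 1: the atlas links on the listing page
links = sel.xpath('//*[@class="list_cont Left_list_cont"]/div/div/div/ul/li/a')
for a in links[:3]:
    print(a.xpath('@href').extract_first(), a.xpath('@title').extract_first())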
There is a catch, though: the photo URL we find this way is not the one we really want. It points at a thumbnail, with an extra size suffix appended to the real file name, so we have to process the URL further before using it.
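To make the cleanup concrete, here is a minimal sketch of the trick used in the spider below; the exact thumbnail file name is an assumption for illustration:

thumb = 'http://pic1.win4000.com/pic/sample_250_350.jpg'  # hypothetical thumbnail url
full = thumb.split('_')[0] + '.jpg'                       # strip the size suffix
print(full)  # http://pic1.win4000.com/pic/sample.jpg

Note that splitting on the first underscore assumes the rest of the path contains none, which is what the spider's code relies on.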
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector
from fake_useragent import UserAgent
from meizhuo.items import MeizhuoItem

headers = {
    'user-agent': UserAgent(verify_ssl=False).chrome
}


class MzSpider(scrapy.Spider):
    name = 'mz'
    allowed_domains = ['www.win4000.com']
    start_urls = [
        'http://www.win4000.com/wallpaper_2285_0_0_1.html',
        'http://www.win4000.com/wallpaper_204_0_0_1.html'
    ]

    def parse(self, response):
        sel = Selector(response)
        atlas_list = sel.xpath('//*[@class="list_cont Left_list_cont"]/div/div/div/ul/li/a')
        for img in atlas_list:
            # the url and title of each atlas
            url = img.xpath('@href').extract_first()
            title = img.xpath('@title').extract_first()
            # parse every atlas page, passing the title along in meta
            yield scrapy.Request(url, callback=self.get_all_img, meta={'title': title})
        # locate the next listing page and follow it if present
        next_url = sel.xpath('//*[@class="next"]/@href').extract_first()
        if next_url is not None:
            yield scrapy.Request(next_url, callback=self.parse)

    def get_all_img(self, response):
        item = MeizhuoItem()
        container = []
        sel = Selector(response)
        # the carousel holding all pictures of this atlas
        img_list = sel.xpath('//*[@class="scroll-img-cont"]/ul')
        for img in img_list:
            img_url = img.xpath('li/a/img/@data-original').extract()
            for url in img_url:
                # these urls still point at thumbnails, so strip the
                # size suffix one by one to get the full-size image
                cmp_url = url.split('_')[0] + '.jpg'
                container.append(cmp_url)
        item['url'] = container
        item['title'] = response.meta['title']
        yield item
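At this point the spider can be sanity-checked on its own, before the pipeline exists, by exporting the items to a file, e.g. scrapy crawl mz -o items.json from the project root; each exported item should contain a title plus its list of cleaned image URLs.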
Once we have collected the photo URLs of all the atlases, what is left is downloading the images locally. We configure the download path in the settings, then, in the pipeline, use the requests module on each crawled URL to fetch the real binary data of the picture and write it into the configured folder.
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import os

import requests
from fake_useragent import UserAgent

from meizhuo.settings import IMAGES_STORE

headers = {
    'user-agent': UserAgent(verify_ssl=False).chrome
}


class MeizhuoPipeline(object):
    def process_item(self, item, spider):
        title = item['title']
        img_list = item['url']
        # 1. create the download directories if they do not exist yet
        if not os.path.exists(IMAGES_STORE):
            os.mkdir(IMAGES_STORE)
        # one sub-folder per atlas, named after its title
        collection_dir = os.path.join(IMAGES_STORE, title)
        if not os.path.exists(collection_dir):
            os.mkdir(collection_dir)
        # 2. fetch the binary data of every url and write it to disk
        for index in range(len(img_list)):
            file_path = os.path.join(collection_dir, title) + str(index) + '.jpg'
            res = requests.get(img_list[index], headers=headers).content
            with open(file_path, 'wb') as fp:
                fp.write(res)
        print('insert successfully!!!')
        return item
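One design note on this pipeline: requests.get runs synchronously inside process_item, so every download blocks the crawl, and a single failed request raises and drops the whole item. A small hardening sketch (the save_image helper is hypothetical, not part of the project files above) that skips files already on disk and tolerates network errors:

import os

import requests


def save_image(url, file_path, headers=None):
    # hypothetical helper: download one image, skipping files that already
    # exist and swallowing transient network errors instead of crashing
    if os.path.exists(file_path):
        return False
    try:
        res = requests.get(url, headers=headers, timeout=10)
        res.raise_for_status()
    except requests.RequestException:
        return False
    with open(file_path, 'wb') as fp:
        fp.write(res.content)
    return True

Each download in the loop above would then become save_image(img_list[index], file_path, headers=headers).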
The last step is the relevant configuration in settings.py:
BOT_NAME = 'meizhuo'

SPIDER_MODULES = ['meizhuo.spiders']
NEWSPIDER_MODULE = 'meizhuo.spiders'

ROBOTSTXT_OBEY = True

ITEM_PIPELINES = {
    'meizhuo.pipelines.MeizhuoPipeline': 300,
}

# root folder for the downloaded images
IMAGES_STORE = '/Users/mac/Documents/Girl'

# download delay
DOWNLOAD_DELAY = 0.3
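With the settings in place, the crawl is started from the project root with scrapy crawl mz. Two things worth knowing: ROBOTSTXT_OBEY = True makes Scrapy honor the site's robots.txt, so if it disallows these paths the requests will be filtered out, and DOWNLOAD_DELAY = 0.3 throttles consecutive requests so the crawl stays gentle on the server.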