Scrapy image crawling: multi-level, multi-page galleries saved into separate folders, each folder named after its gallery title

Below is the complete crawler code, which I have tested against the site shown.

items.py

import scrapy


class DemoItem(scrapy.Item):
    """Item describing one image download task.

    folder_name -- the gallery topic/title from the list page; used as the
                   destination folder name by the pipeline.
    img_url     -- the image URL; the ImagesPipeline expects this field to
                   hold a *list* of URLs.
    """
    folder_name = scrapy.Field()
    # img_name = scrapy.Field()  # image name field; not used
    img_url = scrapy.Field()

spider.py

 1 # -*- coding: utf-8 -*-
 2 import scrapy
 3 from demo.items import DemoItem
 4 
 5 
 6 class LogosSpider(scrapy.Spider):
 7     name = 'logos'
 8     allowed_domains = ['tttt8.net']
 9     #start_urls = ['http://www.tttt8.net/category/legbaby/']
10     #start_urls = ['http://www.tttt8.net/category/ugirls/']
11     #start_urls = ['http://www.tttt8.net/category/kelagirls/ ' ]
 12 is      start_urls = [ ' http://www.tttt8.net/category/xiurenwang/micatruisg/ ' ]
 13 is      # where the page is used to construct a page turning link
 14      page = 1 
15  
16      DEF the parse (Self, the Response):
 17          # to extract a list of all the columns
 18          li_list response.xpath = ( ' // * [@ the above mentioned id = "post_container"] / li ' )
 19          for Li in li_list:
 20 is              # of example
 21 is              Item = DemoItem ()
 22 is             # Extracting only the name column here, for later storage folder name
 23 is              Item [ ' FOLDER_NAME ' ] = li.xpath ( ' ./div [2] / H2 / A / text () ' ) .extract_first ()
 24              # extract links two pages, two pages ready to function to work
 25              next_plink = li.xpath ( ' ./div[1]/a/@href ' ) .extract_first ()
 26 is              #item to a page instantiation received content to the two page
 27              the yield scrapy.Request (URL = next_plink, the callback = self.parse2, Meta = { ' Item ' : Item})
 28          # turning a page list / self turn find other methods pages, online a lot, I think this simpler structure
 29         = response.xpath page_list ( ' // div [@ class = "the pagination"] / A / @ the href ' ) .extract ()
 30          # to find the last page
 31 is          LAST_PAGE page_list = [- . 1 ]
 32          # extracted maximum the page
 33 is          MAX_NUM = int (LAST_PAGE [- 2 ])
 34 is          # configured turning the page of the link
 35          IF self.page <= MAX_NUM:
 36              self.page + = . 1 
37 [              new_page_url self.start_urls = [ 0 ] + ' Page / ' + STR (self.page) + ' /' 
38 is              the yield scrapy.Request (URL = new_page_url, the callback = self.parse)
 39  
40      DEF parse2 (Self, Response):
 41 is          # receiving a content page, and the adapter request flip, or flip is lost, being given
 42          Item = response.meta [ ' Item ' ]
 43          P_LIST response.xpath = ( ' // * [@ the above mentioned id = "post_content"] / the p-/ img ' )
 44          # normal link to pepeline extract pictures to download
 45          for IMG in P_LIST:
 46 is              img_url = img.xpath ( ' ./@src ' ) .extract_first ()
 47             # This must be added in brackets [], a picture download function, a list type requires
 48              Item [ ' img_url ' ] = [img_url]
 49              yield Item
 50          page list # two page, then subsequent requests yield
 51 is          next_page_list = response.xpath ( ' // * [@ ID = "Content"] / div [. 1] / div [. 3] / div [2] / A / @ the href ' ) .extract ()
 52 is          for next_page in next_page_list:
 53 is              # here we must add meta, or else two flip on the error, and here I found out a long time.
54 is              the yield scrapy.Request (URL = next_page, the callback = self.parse2, Meta = { ' Item ' : Item})

settings.py

 1 # -*- coding: utf-8 -*-
 2 
 3 
 4 BOT_NAME = 'demo'
 5 SPIDER_MODULES = ['demo.spiders']
 6 NEWSPIDER_MODULE = 'demo.spiders'
 7 
 8 #存储路径和header
 9 IMAGES_STORE = 'D:\pics'
10 USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
11 
12 DOWNLOAD_DELAY = 0.2
13 Turn off the robot #
 14 ROBOTSTXT_OBEY = False
 15  
16 ITEM_PIPELINES = {
 . 17      ' demo.pipelines.DemoPipeline ' : 300 ,
 18 is  }
 . 19  # download the designated Field
 20 is IMAGES_URLS_FIELD = ' img_url '

pipelines.py

import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class DemoPipeline(ImagesPipeline):
    """Images pipeline that stores each download under a folder named
    after the item's folder_name, keeping the URL's last path segment
    as the file name."""

    def get_media_requests(self, item, info):
        """Issue one download request per URL in item['img_url'].

        The item is passed along via meta so file_path() can read
        folder_name when choosing where to store the file.
        """
        for img_url in item['img_url']:
            yield scrapy.Request(img_url, meta={'item': item})

    def file_path(self, request, response=None, info=None):
        """Return '<folder_name>/<file name>' relative to IMAGES_STORE."""
        item = request.meta['item']
        folder_name = item['folder_name']
        # img_name = item['img_name']  # images have no name; not used
        # Use the last segment of the URL as the file name.
        image_guid = request.url.split('/')[-1]
        img_name = image_guid
        # name = img_name + image_guid
        # name = name + '.jpg'
        # {0} is the folder, {1} is the file.
        filename = u'{0}/{1}'.format(folder_name, img_name)
        return filename

    def item_completed(self, results, item, info):
        """Drop the item when every image download failed; otherwise pass it on."""
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image Downloaded Failed')
        return item

result:

 

 

 

Guess you like

Origin www.cnblogs.com/passagain/p/11607711.html