Small web-crawler exercise: scraping images from DuiTang

#菠萝tang
#
# coding: utf-8
#
# Download the images returned by a DuiTang keyword search.
#
# The search endpoint answers with JSON; every "path" value inside that JSON
# is treated as an image URL and saved into the local IMAGE_DIR directory.
import contextlib
import json
import os
import time

try:  # Python 2.7, the version this script was written for
    import urllib2
    from urllib import quote, urlretrieve
except ImportError:  # Python 3 moved the same helpers into urllib.request / urllib.parse
    import urllib.request as urllib2
    from urllib.parse import quote
    from urllib.request import urlretrieve

try:
    # On Python 2, input() evaluates what the user types; raw_input() does not.
    read_line = raw_input
except NameError:
    read_line = input

# handle_request() multiplies the page index by this to get the "start" offset.
PAGE_SIZE = 24

# Fixed part of the query string, sent unchanged with every request.
QUERY_STRING = (
    '&type=feed&include_fields=top_comments%2Cis_root%2Csource_link%2Citem'
    '%2Cbuyable%2Croot_id%2Cstatus%2Clike_count%2Clike_id%2Csender%2Calbum'
    '%2Creply_count%2Cfavorite_blog_id&_type=&'
)

USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
)

# Directory the downloaded images are written to.
IMAGE_DIR = 'DuiTang'


def handle_request(url, sort, page):
    """Build the request for one page of search results.

    Args:
        url: Endpoint URL ending in ``kw=``; the keyword is appended to it.
        sort: Search keyword. It is percent-encoded before being appended so
            that spaces and non-ASCII text produce a valid URL.
        page: Zero-based page index; the ``start`` offset sent to the server
            is ``PAGE_SIZE * page``.

    Returns:
        A ``Request`` object for the assembled URL with a browser
        ``User-Agent`` header set.
    """
    url_use = (
        url + quote(sort, safe='') + QUERY_STRING
        + 'start=' + str(PAGE_SIZE * page)
    )
    headers = {'User-Agent': USER_AGENT}
    return urllib2.Request(url=url_use, headers=headers)


def _extract_image_urls(content):
    """Return every ``path`` value found anywhere in the JSON text *content*."""
    # Third-party package: imported here so the rest of the module can be
    # imported (e.g. to reuse handle_request) without it being installed.
    import jsonpath

    matches = jsonpath.jsonpath(json.loads(content), "$..path")
    # jsonpath() returns False, not an empty list, when nothing matches.
    return matches or []


def download_image(content):
    """Save every image referenced by the JSON response *content*.

    Args:
        content: Raw JSON text returned by the search endpoint.

    Files are written to ``IMAGE_DIR`` (created on first use) under the last
    path segment of each URL. Nothing is created when the response contains
    no ``path`` entries.
    """
    url_list = _extract_image_urls(content)
    if not url_list:
        return

    if not os.path.exists(IMAGE_DIR):
        os.mkdir(IMAGE_DIR)

    for li in url_list:
        filename = li.split('/')[-1]
        if not filename:
            # URL ends with "/": there is no file name to save it under.
            continue
        filepath = os.path.join(IMAGE_DIR, filename)
        urlretrieve(li, filepath)
        # Pause between downloads to avoid hammering the server.
        time.sleep(1)


def main():
    """Prompt for a page range and a keyword, then download each page."""
    url = 'https://www.duitang.com/napi/blog/list/by_search/?kw='
    start_page = int(read_line(
        " Please input initial gripping position (24 a portion of the graph): "
    ))
    end_page = int(read_line(" Please enter an end gripping position: "))
    sort = read_line(" Please enter the type of query: ")

    # Pages are 1-based for the user and 0-based for handle_request().
    for page in range(start_page - 1, end_page):
        print(' %s of section begins Download ...... ' % (page + 1))
        request = handle_request(url, sort, page)
        # closing() releases the connection on both Python 2 and Python 3.
        with contextlib.closing(urllib2.urlopen(request)) as response:
            content = response.read()
        print(content)
        # Parse the response, extract all image links, download the pictures.
        download_image(content)
        print('%s of partially downloaded ' % (page + 1))
        time.sleep(2)


if __name__ == '__main__':
    main()

# Use python2.7

# DuiTang serves its image listings as JSON; the pagination shown on the site is only a front for that JSON API. The parameters that matter are `kw` (the search keyword) and `start` (the result offset).

# Extracting the data from the JSON response is the part worth studying!

#unicodestr = json.loads(content)

#url_list = jsonpath.jsonpath(unicodestr, "$..path")

Guess you like

Origin www.cnblogs.com/lst-315/p/11493170.html