spider_: crawl all the expression-pack (meme) images from Doutula (save the pictures)

"" " 
Crawling inside the bucket map it all expression package

knowledge summary:
First, use the library requests crawling, random request header (website anti-climb measures less hang a request header is enough.)

Second, the specific ideas:
1. first crawling all pictures url, on the list
2. loop through url request these images
3. save the picture classification
Third, the classification is used to save a knowledge point, endwith (ending ***)
using this function, the different types of jpg, gif, png and other image separately
Fourth, presentation case, only the front crawl 20.
Friday, all definition digital variable meaning:
the n-1 = web pages
num = 1 save the image number

Note: the case mainly exercises
    1. save the picture, it should be noted that the use of a stream of bytes when saving.
    2. reptile writing code logical thinking.
"" "
from fake_useragent import FakeUserAgent
import requests
import re
import time


def get_urls():
    # Put every url we collect into this list
    url_list = []
    n = 1
    while True:
        url = f"http://www.doutula.com/article/list/?page={n}"
        headers = {
            "User-Agent": FakeUserAgent().random
        }
        html_str = requests.get(url, headers=headers).text
        # print(html_str)

        # Define and compile the regex that extracts the picture urls
        pattern = re.compile(r'data-original="(.*?)"')
        # Extract all matches on the page
        urls = re.findall(pattern, html_str)
        url_list.append(urls)
        print("Collected the pictures on page %s" % n)
        n += 1
        # Throttle the crawler a little
        time.sleep(0.5)
        # Stop the crawler when n == 21. To keep the demo short only the
        # first twenty pages are saved; there are 615 pages in total.
        if n == 21:
            break
    print(len(url_list))
    return url_list


# Save the pictures
# The pictures are written to disk as bytes
def downloads(url_list):
    num = 1
    for urls in url_list:
        for url in urls:
            header = {
                "User-Agent": FakeUserAgent().random
            }
            # The crawled pictures come in two formats, so branch on the
            # extension and save each kind under its own suffix.
            if url.endswith('.jpg'):
                # Save to the local img directory
                with open('./img/%s.jpg' % num, 'wb') as file:
                    # Fetch the raw bytes of the picture
                    img = requests.get(url, headers=header)
                    file.write(img.content)
                    print("Saving picture %s" % num)
                    num += 1
                    time.sleep(0.3)
            elif url.endswith('.gif'):
                # Save to the local img directory
                with open('./img/%s.gif' % num, 'wb') as file:
                    # Fetch the raw bytes of the picture
                    img = requests.get(url, headers=header)
                    file.write(img.content)
                    print("Saving picture %s" % num)
                    num += 1
                    time.sleep(0.3)


if __name__ == '__main__':
    url_list = get_urls()
    downloads(url_list)
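The random request header from point one of the summary comes from fake_useragent in the script. A minimal dependency-free sketch of the same idea, rotating a small hand-written pool with random.choice (the User-Agent strings and the fetch_page helper are illustrative, not part of the original script):

import random

import requests

# A small hand-written pool of User-Agent strings (illustrative values);
# picking one at random per request mimics FakeUserAgent().random.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    "Mozilla/5.0 (X11; Linux x86_64)",
]

def fetch_page(url):
    headers = {"User-Agent": random.choice(USER_AGENTS)}
    return requests.get(url, headers=headers).text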
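Point three of the summary mentions png, but downloads() above only branches on .jpg and .gif, duplicating the save logic per extension. A hedged sketch of collapsing those branches with os.path.splitext so every extension shares one code path (save_image is a hypothetical helper; the ./img directory is assumed to exist, as in the script, and the urls are assumed to end in a plain extension with no query string):

import os

import requests

def save_image(url, num, header):
    # Take the extension (".jpg", ".gif", ".png", ...) from the url itself
    ext = os.path.splitext(url)[1]
    with open('./img/%s%s' % (num, ext), 'wb') as file:
        # Fetch the raw bytes and write them out unchanged
        img = requests.get(url, headers=header)
        file.write(img.content)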
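The closing note stresses saving with a byte stream; the script buffers each whole image in memory via img.content before writing. requests can also stream the body in chunks with stream=True and iter_content, which keeps large gifs out of memory. A minimal sketch (save_streaming is a hypothetical helper, not from the original):

import requests

def save_streaming(url, path, header):
    # stream=True defers downloading the body; iter_content then yields
    # it in byte chunks, so the file never sits fully in memory.
    with requests.get(url, headers=header, stream=True) as resp:
        with open(path, 'wb') as file:
            for chunk in resp.iter_content(chunk_size=8192):
                file.write(chunk)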

Origin www.cnblogs.com/YangQingHong/p/11006257.html