Web crawler exercise - scraping the Meizitu picture gallery

import os
import re
import time
import urllib.parse
import urllib.request
'''
Build the page URL, send the request, read the response content,
parse the content, and store the extracted data.
'''
def get_request(new_url):
    """Build a request object for *new_url* with a browser User-Agent.

    Args:
        new_url: Absolute URL of the page to fetch.

    Returns:
        A ``urllib.request.Request`` carrying a desktop Chrome
        ``User-Agent`` header.
    """
    # Send a browser-like User-Agent instead of urllib's default one.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/76.0.3809.100 Safari/537.36'
        ),
    }
    return urllib.request.Request(url=new_url, headers=headers)

def get_content(request):
    """Send *request* and return the response body as text.

    Args:
        request: A ``urllib.request.Request`` (or URL string) to open.

    Returns:
        The response body decoded as GBK.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid GBK.
    """
    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('gbk')

def parse_content(content):
    """Extract picture URLs and titles from a list page and download them.

    Every ``<div class="pic">`` block is expected to look like this::

        <div class="pic">
            <a target="_blank" href="https://www.meizitu.com/a/5521.html">
                <img src="http://pic.topmeizi.com/wp-content/uploads/2017a/04/08/limg.jpg" alt="<b>title text</b>">
            </a>
        </div>

    Args:
        content: Decoded HTML text of one list page.

    Returns:
        None. The ``(src, alt)`` tuples found are printed and then passed
        to ``down_load``.
    """
    # re.S lets '.' span newlines, since each block covers several lines.
    pattern = re.compile(
        r'<div class="pic">.*?<img src="(.*?)" alt="(.*?)".*?</div>',
        re.S,
    )
    ret = pattern.findall(content)
    print(ret)
    down_load(ret)
#
def down_load(ret):
    """Download every picture described in *ret* into the ``MZ`` folder.

    Args:
        ret: Iterable of tuples whose first item is the image URL and whose
            last item is the image's alt text (normally ``<b>title</b>``).

    Returns:
        None. Files are written to ``MZ/<title>.<extension>``; the function
        pauses two seconds after each download.
    """
    if not ret:
        # Nothing to download: do not create an empty folder.
        return

    dirname = 'MZ'
    # Create the target folder once instead of checking on every iteration.
    os.makedirs(dirname, exist_ok=True)

    for tp in ret:
        image_url = tp[0]

        # The alt text is normally wrapped in <b>...</b>; strip the wrapper
        # only when it is really there so plain titles are not truncated.
        image_name = tp[-1]
        if image_name.startswith('<b>') and image_name.endswith('</b>'):
            image_name = image_name[3:-4]

        # Reuse the extension of the remote file.
        filename = image_name + '.' + image_url.split('.')[-1]
        filepath = os.path.join(dirname, filename)

        print('Picture %s downloading ....' % filename)
        urllib.request.urlretrieve(image_url, filepath)
        print('Picture %s download finished ....' % filename)
        # Pause between downloads to avoid hammering the server.
        time.sleep(2)

def main():
    """Prompt for a page range and scrape every list page in it.

    Reads the first and last page numbers from standard input (both
    inclusive), then for each page builds the list URL, fetches it, and
    hands the HTML to ``parse_content``. Pauses two seconds after each page.

    Raises:
        ValueError: If either input cannot be converted to ``int``.
    """
    start_page = int(input("Please enter the starting page number"))
    end_page = int(input("Please enter an end page number"))
    url = 'https://www.meizitu.com/a/'
    for page in range(start_page, end_page + 1):
        print("Downloading page %s ....." % page)
        # List pages are named list_1_<page>.html under the base URL.
        new_url = url + 'list_1_' + str(page) + '.html'
        request = get_request(new_url)
        content = get_content(request)
        parse_content(content)
        print("结束下载第%s页....." % page)
        time.sleep(2)
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

Guess you like

Origin www.cnblogs.com/zhangshuntao123/p/11626704.html