import urllib.request
import urllib.parse
import re
import os
import time
def handle_request(url, page):
    """Build the HTTP request object for one listing page.

    Args:
        url: URL prefix to which the page number is appended.
        page: Page number; converted with ``str()`` before concatenation.

    Returns:
        A ``urllib.request.Request`` targeting ``<url><page>.html/`` and
        carrying a mobile-browser ``User-Agent`` header.
    """
    url = url + str(page) + '.html' + '/'
    headers = {"User-Agent":"Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Mobile Safari/537.36"}
    # The original bound the object to a misspelled name ("requrst") and then
    # returned the undefined name "request", raising NameError on every call.
    request = urllib.request.Request(url=url, headers=headers)
    return request
def download_image(content):
    """Download every image referenced in a page's HTML.

    Image URLs are taken from ``<img src="...">`` tags that follow a
    ``<div class="center">`` opening tag. Each image is saved into the
    '糗图' directory (created on demand) under the last path segment of
    its URL, with a one-second pause after each download.

    Args:
        content: Decoded HTML text of the page.

    Returns:
        None. If no image tag matches, nothing is created or downloaded.
    """
    # The attribute is "src"; the original pattern spelled it "scr" and so
    # never matched a real <img> tag.
    pattern = re.compile(r'<div class="center">.*?<img src="(.*?)".*?></div', re.S)
    image_sources = pattern.findall(content)
    if not image_sources:
        return

    # Create the target directory once, only when there is something to save.
    dirname = '糗图'
    os.makedirs(dirname, exist_ok=True)

    for image_src in image_sources:
        # Sources without a scheme get "https:" prepended (as the original
        # did for every source); already-absolute URLs are left intact.
        if not image_src.startswith(('http://', 'https://')):
            image_src = 'https:' + image_src
        # The file name is the last path segment of the image URL.
        filename = image_src.split('/')[-1]
        filepath = os.path.join(dirname, filename)
        print('%s图片正在下载....'% filename)
        urllib.request.urlretrieve(image_src, filepath)
        print('%s图片结束下载....'% filename)
        # Pause between downloads.
        time.sleep(1)
def main():
    """Prompt for a page range and download the images of each page.

    Reads the first and last page numbers from standard input, then for
    every page in that inclusive range builds the request, fetches and
    decodes the HTML as GBK, and hands it to ``download_image``. Sleeps
    two seconds after each page.

    Raises:
        ValueError: If either prompt receives a non-integer value.
    """
    # NOTE: placeholder. This must be set to the listing URL prefix before
    # the script can work; handle_request() appends "<page>.html/" to it,
    # and an empty prefix yields a URL without a scheme.
    url = ''
    start_page = int(input('请输入起始页:'))
    end_page = int(input('请输入结束页码:'))
    for page in range(start_page, end_page + 1):
        # Build the request object for this page.
        print('第%s页开始下载...'% page)
        request = handle_request(url, page)
        # Fetch the whole HTML document; the context manager closes the
        # HTTP response, which the original left open.
        with urllib.request.urlopen(request) as response:
            content = response.read().decode('gbk')
        # Parse the content, extract the image links and download them.
        download_image(content)
        print('第%s页结束下载'% page)
        print()
        print()
        # Pause between pages.
        time.sleep(2)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
练手代码(提供的是框架需修改才可用)
猜你喜欢
转载自blog.csdn.net/weixin_46244909/article/details/104214573
今日推荐
周排行