一: 导入的包和网址请求
import urllib.request
import re


def handle_url(url, page):
    """Build the HTTP request for one listing page.

    Args:
        url: Base URL of the listing; the page number is appended to it
            verbatim, so it is expected to end with a separator such as "/".
        page: Page number (any value accepted by ``str()``).

    Returns:
        A ``urllib.request.Request`` for ``url + str(page)`` that carries a
        desktop-browser ``User-Agent`` header.
    """
    url = url + str(page)
    # Send a browser User-Agent instead of urllib's default one.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
    req = urllib.request.Request(url=url, headers=headers)
    # The original returned the undefined name ``reqer`` (NameError on every call).
    return req
二: 响应后数据的处理
# Process a single listing page: fetch it and download its thumbnail images.
def handle_pages(req, start_num=1, save_dir="./images"):
    """Fetch one page and save every thumbnail image it references.

    Args:
        req: A URL string or ``urllib.request.Request`` for the page to fetch.
        start_num: Number used for the first saved file. Pass the value
            returned by a previous call to keep numbering across pages instead
            of overwriting ``1.jpg``, ``2.jpg``, ... again.
        save_dir: Directory that receives the ``<num>.jpg`` files; it is
            created when it does not exist yet.

    Returns:
        The next unused file number (``start_num`` + number of images saved).

    Raises:
        urllib.error.URLError: If the page or one of the images cannot be
            fetched.
    """
    # Function-scope import so this snippet only relies on the modules the
    # surrounding code already imports (urllib.request, re).
    import os

    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(req) as res:
        html = res.read().decode('utf8')

    # Match the div that wraps each image and capture the image's src.
    pat = re.compile(r'<div class="thumb">.*?<img src="(.*?)" alt=.*?>.*?</div>', re.S)
    src = pat.findall(html)

    # urlretrieve does not create missing directories.
    os.makedirs(save_dir, exist_ok=True)

    num = start_num
    for url in src:
        # Only protocol-relative links ("//host/path") need "http:" prepended;
        # prefixing a URL that already has a scheme would corrupt it.
        if url.startswith("//"):
            url = "http:" + url
        urllib.request.urlretrieve(url, os.path.join(save_dir, str(num) + ".jpg"))
        num += 1
    return num
三: 主函数
def main():
    """Prompt for a page range and download the images of every page in it.

    Both bounds are read from standard input and are inclusive. They are
    converted with ``int()`` only after the start message has been printed,
    so non-numeric input raises ``ValueError`` at that point.
    """
    base = "https://www.qiushibaike.com/pic/page/"
    first = input("请输入起始页:")
    last = input("请输入结束页:")
    print("开始下载")

    current = int(first)
    final = int(last)
    while current <= final:
        # Build the request for this page, then fetch and save its images.
        request = handle_url(base, current)
        handle_pages(request)
        current += 1

    print("下载完毕!")


if __name__ == '__main__':
    main()
四: 整体代码
import os
import re
import urllib.request


def handle_url(url, page):
    """Build the HTTP request for one listing page.

    Args:
        url: Base URL of the listing; the page number is appended to it
            verbatim, so it is expected to end with a separator such as "/".
        page: Page number (any value accepted by ``str()``).

    Returns:
        A ``urllib.request.Request`` for ``url + str(page)`` that carries a
        desktop-browser ``User-Agent`` header.
    """
    url = url + str(page)
    # Send a browser User-Agent instead of urllib's default one.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
    req = urllib.request.Request(url=url, headers=headers)
    return req


# Process a single listing page: fetch it and download its thumbnail images.
def handle_pages(req, start_num=1, save_dir="./images"):
    """Fetch one page and save every thumbnail image it references.

    Args:
        req: A URL string or ``urllib.request.Request`` for the page to fetch.
        start_num: Number used for the first saved file, so numbering can
            continue across pages instead of restarting at 1.
        save_dir: Directory that receives the ``<num>.jpg`` files; it is
            created when it does not exist yet.

    Returns:
        The next unused file number (``start_num`` + number of images saved).

    Raises:
        urllib.error.URLError: If the page or one of the images cannot be
            fetched.
    """
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(req) as res:
        html = res.read().decode('utf8')

    # Match the div that wraps each image and capture the image's src.
    pat = re.compile(r'<div class="thumb">.*?<img src="(.*?)" alt=.*?>.*?</div>', re.S)
    src = pat.findall(html)

    # urlretrieve does not create missing directories.
    os.makedirs(save_dir, exist_ok=True)

    num = start_num
    for url in src:
        # Only protocol-relative links ("//host/path") need "http:" prepended;
        # prefixing a URL that already has a scheme would corrupt it.
        if url.startswith("//"):
            url = "http:" + url
        urllib.request.urlretrieve(url, os.path.join(save_dir, str(num) + ".jpg"))
        num += 1
    return num


def main():
    """Prompt for an inclusive page range and download every page's images.

    Raises:
        ValueError: If either bound entered by the user is not an integer.
    """
    url = "https://www.qiushibaike.com/pic/page/"
    start_page = input("请输入起始页:")
    end_page = input("请输入结束页:")
    print("开始下载")
    # Carry the file counter across pages; restarting at 1 for each page made
    # every page overwrite the images saved by the previous one.
    num = 1
    for page in range(int(start_page), int(end_page) + 1):
        num = handle_pages(handle_url(url, page), num)
    print("下载完毕!")


if __name__ == '__main__':
    main()