Python 基础正则复习【糗事百科图片爬取】

 
 

一: 导入所需的包并构造网址请求

import urllib.request
import re
def handle_url(url,page):
    """Build the request object for one listing page.

    Args:
        url: Base URL of the listing; the page number is appended to it.
        page: Page number; converted with ``str()`` before being appended.

    Returns:
        A ``urllib.request.Request`` for ``url + str(page)`` carrying a
        browser-like ``User-Agent`` header.
    """
    url = url + str(page)
    # Send a desktop-browser User-Agent instead of urllib's default one.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
    req = urllib.request.Request(url=url,headers=headers)
    # Fixed: the original returned the undefined name ``reqer`` (NameError).
    return req

二:    响应后数据的处理

# 定一个函数用于处理每一个页面
def handle_pages(req):
    """Fetch one listing page and download every thumbnail image on it.

    The page is decoded as UTF-8, each ``<div class="thumb">`` block is
    searched for its ``<img src=...>`` value, and every image found is saved
    under ``./images``.

    Args:
        req: A URL string or ``urllib.request.Request`` accepted by
            ``urllib.request.urlopen``.

    Returns:
        None. The downloaded files are the only result.
    """
    import os
    from urllib.parse import urlsplit

    # Close the response deterministically instead of leaking the connection.
    with urllib.request.urlopen(req) as res:
        html = res.read().decode('utf8')
    # Match the div that wraps each image and capture the img src value.
    pat = re.compile(r'<div class="thumb">.*?<img src="(.*?)" alt=.*?>.*?</div>', re.S)
    src = pat.findall(html)

    # urlretrieve does not create missing directories, so make sure it exists.
    image_dir = "./images"
    os.makedirs(image_dir, exist_ok=True)

    for num, url in enumerate(src, start=1):
        # Prepend "http:" to the captured src value.
        url = "http:" + url
        # Name the file after the image's own basename. The original used a
        # counter that restarted at 1 on every call, so each page overwrote
        # the files of the previous page. The counter remains only as a
        # fallback for URLs whose path has no file name.
        filename = os.path.basename(urlsplit(url).path) or str(num) + ".jpg"
        urllib.request.urlretrieve(url, os.path.join(image_dir, filename))

三:    主函数

def main():
    """Prompt for a page range and download the images of every page in it.

    Both bounds are read from standard input and are inclusive. They are
    converted with ``int()`` only after the start message has been printed.
    """
    base_url = "https://www.qiushibaike.com/pic/page/"
    first_page = input("请输入起始页:")
    last_page = input("请输入结束页:")
    print("开始下载")
    # The +1 makes the end page inclusive.
    page_numbers = range(int(first_page), int(last_page) + 1)
    for page_number in page_numbers:
        page_request = handle_url(base_url, page_number)
        handle_pages(page_request)

    print("下载完毕!")
# Run the downloader only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

四:    整体代码

import urllib.request
import re
def handle_url(url,page):
    """Return the request for one listing page.

    The page number is converted with ``str()`` and appended to ``url``; the
    resulting ``urllib.request.Request`` carries a browser-like
    ``User-Agent`` header.
    """
    page_url = url + str(page)
    user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    return urllib.request.Request(
        url=page_url,
        headers={"User-Agent": user_agent},
    )

# 定一个函数用于处理每一个页面
def handle_pages(req):
    """Fetch one listing page and download every thumbnail image on it.

    The page is decoded as UTF-8, each ``<div class="thumb">`` block is
    searched for its ``<img src=...>`` value, and every image found is saved
    under ``./images``.

    Args:
        req: A URL string or ``urllib.request.Request`` accepted by
            ``urllib.request.urlopen``.

    Returns:
        None. The downloaded files are the only result.
    """
    import os
    from urllib.parse import urlsplit

    # Close the response deterministically instead of leaking the connection.
    with urllib.request.urlopen(req) as res:
        html = res.read().decode('utf8')
    # Match the div that wraps each image and capture the img src value.
    pat = re.compile(r'<div class="thumb">.*?<img src="(.*?)" alt=.*?>.*?</div>', re.S)
    src = pat.findall(html)

    # urlretrieve does not create missing directories, so make sure it exists.
    image_dir = "./images"
    os.makedirs(image_dir, exist_ok=True)

    for num, url in enumerate(src, start=1):
        # Prepend "http:" to the captured src value.
        url = "http:" + url
        # Name the file after the image's own basename. The original used a
        # counter that restarted at 1 on every call, so each page overwrote
        # the files of the previous page. The counter remains only as a
        # fallback for URLs whose path has no file name.
        filename = os.path.basename(urlsplit(url).path) or str(num) + ".jpg"
        urllib.request.urlretrieve(url, os.path.join(image_dir, filename))
def main():
    """Ask for a start and end page, then process each page in that range.

    The two page numbers are read as text from standard input; the range is
    inclusive on both ends. ``int()`` conversion happens after the start
    message is printed.
    """
    listing_url = "https://www.qiushibaike.com/pic/page/"
    start_text = input("请输入起始页:")
    end_text = input("请输入结束页:")
    print("开始下载")
    first = int(start_text)
    # Add one so the end page itself is downloaded too.
    stop = int(end_text) + 1
    for current_page in range(first, stop):
        handle_pages(handle_url(listing_url, current_page))

    print("下载完毕!")
# Run the downloader only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

猜你喜欢

转载自blog.csdn.net/mjp_erhuo/article/details/80278332