python 单线程图片下载

import urllib.request
import urllib.parse
import urllib.error
import re
import os
import ssl

# Root directory under which all downloaded images are stored.
path = "./images"

# Headers attached to every outgoing request: a desktop-browser User-Agent
# and a referer inside the site's listing section.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/75.0.3770.100 Safari/537.36"
    ),
    "referer": "https://www.mzitu.com/xinggan/",
}

# NOTE(review): this swaps the default HTTPS context factory for the
# unverified one, so certificate verification is turned off for every urllib
# HTTPS request made by this process.
ssl._create_default_https_context = ssl._create_unverified_context


def handler_request(url, pageIndex):
    """Build the request object for one listing page.

    The page number is converted to text and appended to ``url``; the
    module-level ``headers`` are attached to the request.

    Args:
        url: Prefix of the listing-page URL.
        pageIndex: Page number appended to ``url``.

    Returns:
        An unsent ``urllib.request.Request`` for the resulting URL.
    """
    page_url = "".join((url, str(pageIndex)))
    return urllib.request.Request(url=page_url, headers=headers)


def get_images_url(content, basePath):
    """Extract the galleries listed on a page and download each of them.

    Gallery titles (``alt`` attributes) and gallery links (``href``
    attributes) are collected with two separate regular expressions and
    paired by position. Every resulting ``(title, link)`` pair is handed to
    ``image_category_response``.

    Args:
        content: HTML text of one listing page.
        basePath: Folder under which one sub-folder per gallery is created.
    """
    patternNames = re.compile(r'<li>.*?<img .* alt=(.*?) .*? />.*?')
    patternHrefs = re.compile(r'<li><a href=(.*?) .*?>.*?')

    # A compiled pattern's findall() takes (string, pos, endpos). Passing
    # re.S there does not enable DOTALL; it makes the scan start at offset 16
    # and silently skips the beginning of the page. Flags can only be given
    # to re.compile(), and these patterns were effectively always used
    # without DOTALL, so the argument is simply dropped.
    alts = patternNames.findall(content)
    hrefs = patternHrefs.findall(content)

    if len(alts) != len(hrefs):
        # Pairing is positional, so a count mismatch may mean titles and
        # links are misaligned; report it instead of failing with IndexError.
        print("警告: 标题数({0})与链接数({1})不一致".format(len(alts), len(hrefs)))

    # Both captures are stored with their first and last character still
    # attached (the quote marks in the expected markup). The title is stripped
    # here; the link is stripped later by image_category_response.
    # A dict is used so that a repeated title keeps only its latest link.
    image_map = {alt[1:-1]: href for alt, href in zip(alts, hrefs)}

    for item in image_map.items():
        image_category_response(item, basePath)


def image_category_response(item, basePath):
    """Download the images of one gallery into a folder named after it.

    Pages ``<gallery url>/0``, ``<gallery url>/1``, ... are requested in
    order. The first image found in each page's ``main-image`` block is
    passed to ``download_images``. The walk stops at the first page that
    cannot be fetched or parsed; the reason is printed, never raised.

    Args:
        item: ``(title, link)`` pair. ``link`` still carries one leading and
            one trailing character (the quote marks in the expected markup).
        basePath: Folder in which the gallery folder is created.
    """
    alt = item[0]
    save_folder = os.path.join(basePath, alt)
    # makedirs(exist_ok=True) avoids the exists()/mkdir() race and also
    # creates missing intermediate folders.
    os.makedirs(save_folder, exist_ok=True)

    baseurl = item[1][1:-1]
    # Compiled once; the pattern does not change between pages.
    imgPattern = re.compile(r'<div class="main-image"><p>.*?<img src=(.*?) .*? />.*?')
    # Upper bound on the number of pages tried per gallery.
    pageCount = 1000

    for pageIndex in range(pageCount):
        page_url = baseurl + "/" + str(pageIndex)
        try:
            request = urllib.request.Request(url=page_url, headers=headers)
            # The context manager closes the connection once the body is read.
            with urllib.request.urlopen(request) as response:
                content = response.read().decode()
            # findall()'s second positional argument is a start offset, not a
            # flags value, so re.S must not be passed here.
            imgUrl = imgPattern.findall(content)
            if not imgUrl:
                print("未找到图片地址: {0}".format(page_url))
                break
            download_images(imgUrl[0], save_folder)
        except urllib.error.URLError:
            # A failed request is treated as having run past the last page.
            print("最大页面数{0}".format(pageIndex - 1))
            break
        except Exception as e:
            # Any other failure ends this gallery without aborting the run.
            print(e)
            break


def download_images(url, save_path):
    """Download one image and store it under its URL file name.

    Args:
        url: Image URL with one extra leading and one extra trailing
            character (the quote marks in the expected markup), which are
            removed before use.
        save_path: Existing folder the image file is written to.
    """
    url = url[1:-1]
    print(url)

    request = urllib.request.Request(url=url, headers=headers)
    # Read the whole body before touching the disk, so a failed transfer
    # does not leave an empty file behind. The context manager closes the
    # connection.
    with urllib.request.urlopen(request) as response:
        data = response.read()

    # The last path segment of the URL is used as the file name.
    filename = url.split('/')[-1]
    with open(os.path.join(save_path, filename), 'wb') as fb:
        fb.write(data)


def parse_pages(content):
    """Print ``content`` to standard output."""
    print(content)


def main():
    """Ask for a page range and download every gallery on those pages.

    The first and last listing-page numbers are read from standard input
    (inclusive range). For each page a numbered folder is created, the page
    is fetched, and its HTML is passed to ``get_images_url``.

    Raises:
        ValueError: If either entered page number is not an integer.
    """
    url = 'https://www.mzitu.com/xinggan/page/'
    start_page = int(input("请输入起始页码:"))
    end_page = int(input("请输入结束页码:"))

    # Create the root folder; exist_ok avoids the exists()/mkdir() race.
    os.makedirs(path, exist_ok=True)

    for pageIndex in range(start_page, end_page + 1):
        print("...........开始下载第{0}页".format(pageIndex))

        # Folder that receives this listing page's galleries.
        save_path = create_folder(pageIndex)
        request = handler_request(url, pageIndex)
        # The context manager closes the connection once the body is read.
        with urllib.request.urlopen(request) as response:
            content = response.read().decode()
        # Parse the listing page and download the galleries it references.
        get_images_url(content, save_path)

        print("...........结束下载第{0}页".format(pageIndex))


def create_folder(pageIndex):
    """Create the folder for one listing page and return its path.

    Args:
        pageIndex: Listing-page number; its text form is the folder name
            under the module-level ``path``.

    Returns:
        The folder path with backslashes replaced by forward slashes and a
        trailing ``/`` appended.
    """
    save_path = os.path.join(path, str(pageIndex))
    # makedirs also creates the root folder when it does not exist yet, and
    # exist_ok removes the race between an existence check and the creation.
    os.makedirs(save_path, exist_ok=True)

    return save_path.replace("\\", "/") + "/"


# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

猜你喜欢

转载自www.cnblogs.com/KruceCoder/p/12076682.html