Single-threaded Python picture download
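
The script below crawls the listing pages of the mzitu.com /xinggan/ category with urllib, pulls each album's link and title out of the HTML with regular expressions, and then downloads every image of every album, one request at a time in a single thread.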

import urllib.request
import urllib.parse
import urllib.error
import re
import os
import ssl

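# Disable HTTPS certificate verification globally (urlopen will accept any certificate)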
ssl._create_default_https_context = ssl._create_unverified_context

path = "./images"

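# A browser User-Agent plus a Referer; the site appears to reject image requests that lack them (hotlink protection)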
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
    "referer": "https://www.mzitu.com/xinggan/"
}


def handler_request(url, pageIndex):
    url = url + str(pageIndex)
    # Build the request object
    request = urllib.request.Request(url=url, headers=headers)

    return request


def get_images_url(content, basePath):
    # re.S lets '.' match newlines so the patterns can span line breaks
    patternNames = re.compile(r'<li>.*?<img .* alt=(.*?) .*? />.*?', re.S)
    patternHrefs = re.compile(r'<li><a href=(.*?) .*?>.*?', re.S)
    alts = patternNames.findall(content)
    hrefs = patternHrefs.findall(content)
    image_map = {}
    for i in range(len(hrefs)):
        # Strip the surrounding quotes captured by the regex
        key = alts[i][1:-1]
        image_map[key] = hrefs[i]

    for item in image_map.items():
        image_category_response(item, basePath)


def image_category_response(item, basePath):
    alt = item[0]
    save_folder = os.path.join(basePath, alt)
    if not os.path.exists(save_folder):
        os.mkdir(save_folder)

    baseurl = item[1][1:-1]  # strip the surrounding quotes captured by the regex
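    # Generous upper bound on an album's pages; the loop is expected to stop early via the URLError below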
    pageCount = 1000
    try:
        for pageIndex in range(pageCount):
            page_url = baseurl + "/" + str(pageIndex)
            try:
                # Build the request object
                request = urllib.request.Request(url=page_url, headers=headers)
                # Send the request and read the page HTML
                response = urllib.request.urlopen(request)
                content = response.read().decode()
                # re.S lets '.' match newlines when locating the full-size image URL
                imgPattern = re.compile(r'<div class="main-image"><p>.*?<img src=(.*?) .*? />.*?', re.S)
                imgUrl = imgPattern.findall(content)
                download_images(imgUrl[0], save_folder)
            except urllib.error.URLError as e:
                # A failed request means the album has no more pages; raise to
                # break out of the page loop (caught and printed by the outer try)
                raise TypeError("Maximum page count: {0}".format(pageIndex - 1))
    except Exception as e:
        print(e)


def download_images(url, save_path):
    # Strip the surrounding quotes captured by the regex
    url = url[1:-1]
    print(url)
    # Build the request object
    request = urllib.request.Request(url=url, headers=headers)
    # Send the request
    response = urllib.request.urlopen(request)

    filename = url.split('/')[-1]
    with open(os.path.join(save_path, filename), 'wb') as fb:
        fb.write(response.read())


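# Unused helper: just prints the raw page content (handy when debugging the regexes)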
def parse_pages(content):
    print(content)


def main():
    url = 'https://www.mzitu.com/xinggan/page/'
    start_page = int(input("Please enter the starting page number: "))
    end_page = int(input("Please enter the ending page number: "))
    # Create the root folder
    if not os.path.exists(path):
        os.mkdir(path)

    for pageIndex in range(start_page, end_page + 1):

        print("........... start downloading page {0}".format(pageIndex))
        # Create a folder for this listing page
        save_path = create_folder(pageIndex)
        # Build the request object
        request = handler_request(url, pageIndex)
        # Send the request and read the response body
        response = urllib.request.urlopen(request)
        content = response.read().decode()
        # Parse the content, extract the albums and download their images
        get_images_url(content, save_path)

        print("........... finished downloading page {0}".format(pageIndex))


def create_folder(pageIndex):
    save_path = os.path.join(path, str(pageIndex))
    if not os.path.exists(save_path):
        os.mkdir(save_path)

    return save_path.replace("\\", "/") + "/"


if __name__ == "__main__":
    main()
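
For reference, the two list-page patterns capture the attribute values together with their surrounding quotes, which is why the code slices each match with [1:-1]. A minimal sketch against a made-up fragment (the <li> markup below is only an assumption for illustration; the real page layout may differ) shows what they return:

import re

sample = '<li><a href="https://www.mzitu.com/123" target="_blank">' \
         '<img class="lazy" src="thumb.jpg" alt="sample_album" width="236" /></a></li>'

patternNames = re.compile(r'<li>.*?<img .* alt=(.*?) .*? />.*?', re.S)
patternHrefs = re.compile(r'<li><a href=(.*?) .*?>.*?', re.S)

print(patternNames.findall(sample))  # ['"sample_album"']
print(patternHrefs.findall(sample))  # ['"https://www.mzitu.com/123"']

With the defaults above, downloads end up under ./images/<listing page number>/<album title>/<image file name>.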

 

Origin www.cnblogs.com/KruceCoder/p/12076682.html