[Crawler] Use a proxy ("magic") to grab pictures from a website

Normally a proxy is not needed, but in this case the target site requires one!

import random
import requests
import socket
import time
import os

# Download numbered images (014-016) through a SOCKS5 proxy into ./data.
#
# Fixes over the original:
#  * socket.setdefaulttimeout() was called AFTER each request (inside the
#    loop), so it never protected the request that had already run; it is now
#    set once up front, and requests additionally gets an explicit `timeout=`
#    (requests does not reliably honour the socket default).
#  * The body was written to disk BEFORE the status code was checked, so
#    404/error pages were saved as corrupt .jpg files; now only 200 is saved.
#  * `stream=True` is required for iter_content() to actually stream chunks.
#  * requests.RequestException also covers timeouts, not just connection errors.

if not os.path.exists('./data'):
    os.mkdir('data')

# NOTE(review): each dict routes only ONE scheme through the proxy, so a
# random choice sometimes sends HTTPS traffic direct — presumably intentional
# load-spreading, but confirm; kept to preserve the original behaviour.
proxies_pool = [
    {'http': 'socks5://127.0.0.1:1080'},
    {'https': 'socks5://127.0.0.1:1080'}
]
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3861.400 QQBrowser/10.7.4313.400'}

timeout = 20
socket.setdefaulttimeout(timeout)  # safety net for any later raw-socket use

for i in range(14, 17):
    url = 'https://baidu/72/{:0>3.0f}.jpg'.format(i)
    proxies = random.choice(proxies_pool)
    try:
        # Use the proxy; stream the body and bound the request time.
        response = requests.get(url, proxies=proxies, headers=headers,
                                stream=True, timeout=timeout)
        try:
            if response.status_code == 200:
                path = './data/{:0>3.0f}.jpg'.format(i)  # local file path
                with open(path, 'wb') as f:  # write image bytes in binary mode
                    for chunk in response.iter_content(chunk_size=128):
                        f.write(chunk)
                print(f'正在下载: {url}')
        finally:
            response.close()  # always release the connection
    except requests.RequestException as e:
        print(e.args)

    sleep_download_time = 10
    time.sleep(sleep_download_time)  # be polite between downloads

Optimized version:

#  开发时间:    2022/10/22 22:40
#  功能作用:    未知
import random
import requests
import socket
import time
import os

dataname = 51    ## gallery number to fetch — edit this
star = 1         ## page number to start downloading from

# Create the output directory for this gallery if it is missing.
target_dir = f'./{dataname}'
if not os.path.exists(target_dir):
    os.mkdir(f'{dataname}')

def delay(sleep_download_time=5, timeout=20):
    """Pause between downloads and (re)arm the process-wide socket timeout.

    Generalized: both durations were hard-coded locals; they are now
    parameters whose defaults reproduce the original values, so existing
    bare ``delay()`` calls behave exactly as before.

    Args:
        sleep_download_time: seconds to sleep before returning.
        timeout: default timeout in seconds applied to all new sockets via
            ``socket.setdefaulttimeout``; later socket users need not set it.
    """
    # NOTE(review): requests does not reliably honour this socket default —
    # pass timeout= to requests.get for per-request protection.
    socket.setdefaulttimeout(timeout)
    time.sleep(sleep_download_time)

# Download pages `star`..998 of gallery `dataname`, retrying each page up to
# 20 times on network failure.
#
# Fixes over the original:
#  * The body was written to disk BEFORE the status check, so 404/error pages
#    were saved as corrupt .jpg files; now only status 200 is saved.
#  * `stream=True` so iter_content() actually streams; explicit `timeout=` so
#    a hung connection cannot stall the loop forever.
#  * requests.RequestException also catches timeouts, not just connection
#    errors, so the retry counter covers them too.
for i in range(star, 999):
    url = 'https://baidu/data/1602/{}/{:0>3.0f}.jpg'.format(dataname, i)    ## change to your own URL
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3861.400 QQBrowser/10.7.4313.400'}
    down = True
    num = 0  # retry counter for this page
    while down:
        try:
            # NOTE(review): each dict proxies only ONE scheme, so the random
            # pick sometimes sends HTTPS direct — kept as original behaviour.
            proxies = random.choice([
                {'http': 'socks5://127.0.0.1:1080'},
                {'https': 'socks5://127.0.0.1:1080'}
            ])
            response = requests.get(url, proxies=proxies, headers=headers,
                                    stream=True, timeout=20)  # use the proxy
            try:
                if response.status_code == 200:
                    path = './{}/{:0>3.0f}.jpg'.format(dataname, i)  # local file path
                    with open(path, 'wb') as f:  # write image bytes in binary mode
                        for chunk in response.iter_content(chunk_size=128):
                            f.write(chunk)
                    print(f'正在下载: {url}')
            finally:
                response.close()  # always release the connection
            down = False  # success (or non-200) — stop retrying this page
        except requests.RequestException as e:
            print(e.args)
            num += 1
            print(f'Error========>>:下载失败,接着重试第{num}/20次: <<=================== {url}')
            delay()
            down = num < 20  # give up after 20 failed attempts
    delay()  # pause between pages

Guess you like

Origin blog.csdn.net/qq_42792802/article/details/127472288