Python treat: crawling girl pictures

I've been writing Python crawlers for quite a while now, so here's a small practical example.

Target URL: https://www.mzitu.com/

If you only came for the URL, you can stop reading here, because it won't be mentioned again below.

This site is quick to block IPs, so it's best to use a proxy IP pool here. For details, see my earlier post: "Crawler IP keeps getting blocked? Teaching you to build a proxy IP pool."
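If you haven't built a pool yet, here is a minimal sketch of how a pool is typically consumed: pick one address at random and hand it to requests as a proxies dict. The addresses below are placeholders, not guaranteed working proxies.

import random
import requests

# Placeholder pool -- replace with addresses from your own proxy pool
PROXY_POOL = [
    '61.164.39.67:53281',
    '58.220.95.30:10174',
]

def random_proxies():
    """Pick one address at random and build the dict that requests expects."""
    addr = random.choice(PROXY_POOL)
    return {'http': 'http://' + addr, 'https': 'http://' + addr}

resp = requests.get('https://www.mzitu.com/', proxies=random_proxies(), timeout=10)
print(resp.status_code)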

Libraries needed: requests, lxml, os, fake_useragent (everything except os can be installed with pip).
Here I just grabbed one proxy IP at random; you can use any other one (or pull it straight from your own database).
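If your pool lives in a database, pulling a random proxy could look roughly like this. This is only a sketch under assumptions: the database file, table, and column names (proxies.db, proxy_pool, ip, port) are made up here, so adapt them to your own schema.

import random
import sqlite3

def proxy_from_db(db_path='proxies.db'):
    """Pick one random row from a local proxy table and build a requests-style proxies dict."""
    conn = sqlite3.connect(db_path)
    rows = conn.execute('SELECT ip, port FROM proxy_pool').fetchall()
    conn.close()
    ip, port = random.choice(rows)
    addr = '%s:%s' % (ip, port)
    return {'http': 'http://' + addr, 'https': 'http://' + addr}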

The complete code is as follows

# -*- coding: utf-8 -*-
# D:\Program\Pycharm\PyCharm Community Edition 2019.3.3\Project
import requests
from lxml import etree
import os
from fake_useragent import UserAgent

def get_url_list():
    """Fetch the home page and return the list of gallery links."""
    try:
        html = etree.HTML(requests.get('https://www.mzitu.com/', headers=headers, proxies=proxies).text)
        return html.xpath('//ul[@id="pins"]/li/a/@href')
    except Exception:
        return []  # return an empty list so the main loop still works if the request fails

def download(url):
    """Download every image of one gallery into a folder named after its title."""
    try:
        html = etree.HTML(requests.get(url, proxies=proxies, headers=headers).text)
        name = html.xpath('//h2/text()')[0]  # gallery title, also used as the folder name
        number = int(html.xpath('//div[@class="pagenavi"]/a[5]/span/text()')[0])  # total number of images
        img_id = html.xpath('//div[@class="main-image"]//img/@src')[0][22:33]  # date/id segment of the image URL
        if not os.path.exists(name):
            os.mkdir(name)
        os.chdir(name)
        # Image file names on the CDN are zero-padded page numbers: 01.jpg, 02.jpg, ...
        img_urls = ['https://i3.mmzztt.com/%s%02d.jpg' % (img_id, i + 1) for i in range(number)]
        for i, img_url in enumerate(img_urls):
            with open('%s%d.jpg' % (name, i + 1), 'wb') as file:
                file.write(requests.get(img_url, headers=headers).content)
            print('%s%d saved successfully' % (name, i + 1))
        os.chdir('..')
    except Exception:
        pass  # skip galleries that fail to parse or download

if __name__ == '__main__':
    ua = UserAgent()
    headers = {
        'User-Agent': ua.random,  # random User-Agent from fake_useragent
        'Referer': 'https://www.mzitu.com/',  # the image host checks the Referer header
    }
    proxies = {
        'https': '61.164.39.67:53281',  # requests expects lowercase scheme keys
    }
    if not os.path.exists('image'):
        os.mkdir('image')
    os.chdir('image')  # save all galleries under ./image
    for item in get_url_list():
        download(item)
