I have been writing Python crawlers for quite a while now.
The URL to crawl is https://www.mzitu.com/ — note it down now, because it will not be repeated later in the article.
This image-gallery website blocks IPs aggressively, so it is best to use a proxy IP pool here. For details, see my earlier blog post: "Crawler IP always getting blocked? Teaching you to build a proxy IP pool."
Library preparation: requests lxml os fake_useragent
Here I picked a proxy IP at random; you can use a different one (for example, pulled directly from your own database).
The complete code is as follows
# -*- coding: utf-8 -*-
# D:\Program\Pycharm\PyCharm Community Edition 2019.3.3\Project
import requests
from lxml import etree
import os
from fake_useragent import UserAgent
def get_url_list():
    """Fetch the site's index page and return the list of album URLs.

    Returns:
        list[str]: album page URLs extracted from ``//ul[@id="pins"]``.
        Returns an empty list on any request/parse failure — the original
        bare ``except: pass`` returned ``None``, which made the caller's
        ``for`` loop crash with ``TypeError``.

    Relies on module-level ``headers`` and ``proxies`` being defined.
    """
    try:
        resp = requests.get('https://www.mzitu.com/',
                            headers=headers, proxies=proxies)
        html = etree.HTML(resp.text)
        return html.xpath('//ul[@id="pins"]/li/a/@href')
    except Exception as exc:  # network or parse failure: log, stay best-effort
        print('index fetch failed: %s' % exc)
        return []
def download(url):
    """Download every image of one album into a directory named after it.

    Args:
        url: album page URL (one item from ``get_url_list()``).

    Side effects:
        Creates a directory named after the album title under the current
        working directory, ``chdir``s into it while saving, and always
        ``chdir``s back out (the original skipped the ``chdir('..')`` when
        an exception fired mid-loop, leaving later albums nested in the
        wrong directory).

    Relies on module-level ``headers`` and ``proxies`` being defined.
    """
    try:
        resp = requests.get(url, proxies=proxies, headers=headers)
        html = etree.HTML(resp.text)
        name = html.xpath('//h2/text()')[0]
        # Last page-navigation anchor holds the image count of the album.
        page_count = int(
            html.xpath('//div[@class="pagenavi"]/a[5]/span/text()')[0])
        # Slice [22:33] of the cover image src is the album's path/id
        # prefix on the CDN — assumes the src layout is stable; TODO verify.
        img_id = html.xpath('//div[@class="main-image"]//img/@src')[0][22:33]
    except Exception as exc:  # bad page / network error: skip this album
        print('album page failed: %s (%s)' % (url, exc))
        return

    if not os.path.exists(name):
        os.mkdir(name)
    os.chdir(name)
    try:
        # %02d zero-pads 1..9 as 01..09 and leaves 10+ untouched.
        # The original branched on ``i < 10`` and hard-coded a '0' prefix,
        # which turned image 10 into the broken URL ...010.jpg.
        image_urls = ['https://i3.mmzztt.com/%s%02d.jpg' % (img_id, n)
                      for n in range(1, page_count + 1)]
        for n, img_url in enumerate(image_urls, start=1):
            try:
                data = requests.get(img_url, headers=headers).content
                with open('%s%d.jpg' % (name, n), 'wb') as fh:
                    fh.write(data)
                print("%s%d 保存成功" % (name, n))
            except Exception as exc:  # one bad image shouldn't abort the album
                print('image failed: %s (%s)' % (img_url, exc))
    finally:
        os.chdir('..')  # always restore cwd, even after a failure
if __name__ == '__main__':
    # Randomized User-Agent plus a Referer the site expects for hotlinking.
    ua = UserAgent()
    headers = {
        'user-agent': ua.random,
        'Referer': 'https://www.mzitu.com/',
    }
    # Single hard-coded proxy; swap in entries from your proxy pool here.
    proxies = {
        'HTTPS': '61.164.39.67:53281',
    }
    # All albums are saved under ./image/.
    if not os.path.exists('image'):
        os.mkdir('image')
    os.chdir('image')
    # ``or []`` guards against get_url_list() returning None on failure,
    # which would otherwise crash this loop with TypeError.
    for item in get_url_list() or []:
        download(item)