# -*- coding: utf-8 -*-
import time
import requests
import re
import os
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError, ReadTimeout
url = 'https://www.meitulu.com/rihan/'
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Mobile Safari/537.36',
           'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
           'Accept-Encoding': 'gzip, deflate, br',
           'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
           'Host': 'm.meitulu.com',
           "Referer": "https://mtl.gzhuibei.com/"
           }  # request headers for the listing pages
headers2 = {'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Mobile Safari/537.36'
            }  # request headers for the image downloads
def getURL(url):  # send an HTTP request and return the page HTML
    a = requests.get(url, headers=headers)
    a.encoding = 'utf-8'  # set the encoding explicitly to avoid garbled text
    html = a.text
    return html
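# ConnectionError and ReadTimeout are imported at the top but never used; the
# intent was presumably to survive flaky connections. A minimal retry sketch
# (the retry count and timeout values are assumptions, not from the original):
def getURLWithRetry(url, retries=3):
    for _ in range(retries):
        try:
            a = requests.get(url, headers=headers, timeout=10)
            a.encoding = 'utf-8'
            return a.text
        except (ConnectionError, ReadTimeout):
            time.sleep(1)  # brief back-off before the next attempt
    return ''  # give up after exhausting all retries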
def doDown():
    # The category spans 94 listing pages; crawl them one by one.
    for i in range(1, 95):
        # e.g. https://www.meitulu.com/rihan/2.html
        mu_lu_url = "https://www.meitulu.com/rihan/" + str(i) + ".html"
        # BeautifulSoup is an HTML/XML parser; soup2 holds the whole page
        soup2 = BeautifulSoup(getURL(mu_lu_url), 'html.parser')
        # All image sets listed on the current page (each <li> holds one set)
        img_mu_lu = soup2.find("ul", {"class": "img"}).find_all("li")
        # Walk the tags that carry each image set's URL
        for mu_lu in img_mu_lu:
            result = {
                'title': mu_lu.find("img")['alt'],  # set title from the img tag
                'link': mu_lu.find("img")['src']    # cover image URL
            }  # keep only the fields we need
            print('Image set:', result['title'], ', cover URL:', result['link'])
            # The title ends with a bracketed picture count such as [30];
            # a regex is more robust than slicing the last five characters.
            match = re.search(r'\[(\d+)', result['title'])
            img_count = match.group(1) if match else '0'  # number of pictures in the set
            # Create a folder named after the set
            mkdir('C:/Users/25308/Desktop/Python/美图录/' + result['title'])
            # Loop over every picture in the set
            try:
                start2 = result['link'].find('img/')
                end2 = result['link'].rfind('/0.jpg')
                img_url = result['link'][start2 + 4:end2]  # numeric set id
                for j in range(1, int(img_count) + 1):  # +1 because range's end is exclusive (assuming pictures are numbered 1..count)
                    # Build the real picture URL, e.g. https://mtl.gzhuibei.com/images/img/17868/2.jpg
                    real_img_url = 'https://mtl.gzhuibei.com/images/img/' + img_url + '/' + str(j) + '.jpg'
                    downImage(j, real_img_url)  # download the picture
                    time.sleep(yanshi)  # throttle between requests
            except Exception as ex:
                print("Exception encountered: %s" % ex)
def downImage(name, image):
    print('Picture', name, ', actual URL:', image)
    # A different header set is needed here; otherwise the server returns 404
    img = requests.get(image, headers=headers2)
    print('Response:', img)
    if img.status_code == 200:  # check the status code, not the Response repr
        print('Downloading picture...', end='')
        with open(str(name) + '.jpg', 'wb') as f:  # open the file only on success
            f.write(img.content)
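# For large pictures, streaming the response body avoids holding the whole
# file in memory. A sketch of the same download, streamed (the 8192-byte
# chunk size is an assumption):
def downImageStreaming(name, image):
    img = requests.get(image, headers=headers2, stream=True)
    if img.status_code == 200:
        with open(str(name) + '.jpg', 'wb') as f:
            for chunk in img.iter_content(chunk_size=8192):
                f.write(chunk)  # write each chunk as it arrives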
def mkdir(name):
    if os.path.exists(name):
        print('Directory already exists, nothing to create!')
        os.chdir(name)  # switch the working directory to that path
    else:
        print('Creating directory:', name)
        os.mkdir(name)
        os.chdir(name)  # switch the working directory to the new path
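# os.makedirs with exist_ok=True collapses the two branches above into a
# single call and also creates missing parent directories; a sketch of an
# equivalent helper (the name mkdirSimple is hypothetical):
def mkdirSimple(name):
    os.makedirs(name, exist_ok=True)  # create the tree if it does not exist
    os.chdir(name)                    # then switch into it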
if __name__ == '__main__':
    # Base folder that stores all crawled images
    mkdir('C:/Users/25308/Desktop/Python/美图录/')
    yanshi = 0.5  # delay between picture downloads (0.5 s)
    doDown()  # start the crawl