# -*- coding: utf-8 -*-
# @Time : 2018/11/16 6:00 PM
# @Author : cxa
# @File : a.py
# @Software: PyCharm
# !/usr/bin/env python
# import logging
import os
import uuid
from lxml import html
import aiofiles
import logging
from ruia import Spider, Request
from ruia_ua import middleware
# URL template for the site's paginated listing pages; the "{}" placeholder is
# filled with the page number by the launcher code at the bottom of the file.
demo = "https://www.mzitu.com/page/{}/"
class MZTImgSpider(Spider):
    """Crawl listing pages, follow every album and download its images.

    Callback chain: ``parse`` (listing page) -> ``next_page`` (first page of
    an album) -> ``save_img`` (one response per image file).
    """

    # Listing-page URLs to crawl; the launcher overwrites this before start().
    start_urls = []
    # Root output directory. It is concatenated directly with the album name,
    # so it must end with a path separator.
    img_path = 'data/'

    # Browser-like headers shared by every request this spider issues.
    _COMMON_HEADERS = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
    }

    @staticmethod
    def _page_img_url(first_img_url, page):
        """Return the image URL for ``page`` derived from the first image URL.

        Only the last ``"01."`` marker (the one in the file name) is replaced,
        so an identical substring in the host or directory part is left alone.
        Returns ``None`` when the URL carries no such marker.
        """
        head, marker, tail = first_img_url.rpartition("01.")
        if not marker:
            return None
        return f"{head}{str(page).zfill(2)}.{tail}"

    @staticmethod
    def _img_extension(url, default="jpg"):
        """Return the file extension of ``url`` (without the dot).

        Any query string is ignored and only the last path segment is
        inspected. ``default`` is returned when no extension can be found
        (the fallback value is an assumption, not something the site
        guarantees).
        """
        filename = (url or "").split("?", 1)[0].rsplit("/", 1)[-1]
        _, dot, ext = filename.rpartition(".")
        return ext if dot and ext else default

    @staticmethod
    def _safe_dir_name(name):
        """Turn a scraped album title into a single, safe directory name."""
        # The title comes from the remote page: strip path separators so it
        # cannot create nested directories or escape the output root.
        cleaned = str(name).replace("/", "_").replace("\\", "_")
        return "_" if cleaned in ("", ".", "..") else cleaned

    async def parse(self, res):
        """Parse a listing page and schedule one request per album.

        Yields a ``Request`` for every album link, handing the album title to
        ``next_page`` through ``metadata["name"]``.
        """
        root = html.fromstring(res.html)
        headers = {**self._COMMON_HEADERS, 'referer': 'https://www.mzitu.com/mm/'}
        # Read the link and its title from the same <a> element so the two can
        # never get out of step (two independent XPath lists could misalign).
        for anchor in root.xpath("//ul[@id='pins']/li/a[@href and img/@alt]"):
            url = anchor.get("href")
            name = anchor.xpath("img/@alt")[0]
            yield Request(url, headers=headers, callback=self.next_page,
                          metadata={"name": name}, res_type='text')

    async def next_page(self, res):
        """Parse the first page of an album and schedule all image downloads.

        The first image URL is read from the page; the remaining ones are
        derived from it by swapping the ``01`` page marker in the file name.
        Yields one ``Request`` per image, handled by ``save_img``.
        """
        root = html.fromstring(res.html)
        name = res.metadata.get("name")
        refere_url = res.url
        print(name, refere_url)

        # XPath for the last page number in the pagination bar.
        max_page_nodes = root.xpath("//div[@class='pagenavi']/a[last()-1]/span/text()")
        img_url_node = root.xpath("//div[@class='main-image']/p/a/img/@src")
        if not img_url_node:
            # Nothing to download; previously this issued Request(None, ...).
            logging.warning('No image found on %s', refere_url)
            return
        img_url = img_url_node[0]

        # Fall back to a single page when the pagination bar is missing or
        # does not contain a number (previously int(None) raised TypeError).
        try:
            max_page_num = int(max_page_nodes[0]) if max_page_nodes else 1
        except ValueError:
            max_page_num = 1

        base_headers = {
            **self._COMMON_HEADERS,
            # NOTE(review): conditional-request headers copied from a browser
            # session; kept as-is, but they look unnecessary for fresh
            # downloads - confirm they can be dropped.
            'if-modified-since': 'Thu, 15 Nov 2018 04:24:11 GMT',
            'if-none-match': '"5becf4eb-1b7d4"',
        }

        # Every request gets its own headers dict: requests are executed after
        # they are yielded, so mutating one shared dict would change the
        # referer of requests that are already queued.
        yield Request(img_url, callback=self.save_img,
                      headers={**base_headers, 'referer': refere_url},
                      metadata={"url": img_url, "name": name, "id": "1"},
                      res_type='bytes')
        print("最大页数", max_page_num)
        for page in range(2, max_page_num + 1):
            next_img_url = self._page_img_url(img_url, page)
            if next_img_url is None:
                # Without the page marker every "next" URL would equal the
                # first one and the same image would be fetched repeatedly.
                logging.warning('Cannot derive further image URLs from %s', img_url)
                break
            print("next", next_img_url)
            # NOTE(review): the referer is the album URL with the zero-padded
            # page number appended directly - confirm this matches the site's
            # real page URLs.
            page_headers = {
                **base_headers,
                'referer': f"{refere_url}{str(page).zfill(2)}",
            }
            yield Request(next_img_url, callback=self.save_img, headers=page_headers,
                          metadata={"url": next_img_url, "name": name, "id": page},
                          res_type='bytes')

    async def save_img(self, res):
        """Write one downloaded image to ``<img_path>/<album name>/<id>.<ext>``.

        Reads ``url``, ``name`` and ``id`` from ``res.metadata`` and the raw
        image bytes from ``res.html``.
        """
        body = res.html
        if not body:
            # Avoid leaving zero-byte files behind for failed/empty responses.
            logging.warning('Empty response for %s, nothing saved', res.metadata.get("url"))
            return

        img_type = self._img_extension(res.metadata.get("url"))
        img_id = res.metadata.get("id")
        img_all_path = f"{self.img_path}{self._safe_dir_name(res.metadata.get('name'))}/"
        # exist_ok avoids the check-then-create race between concurrent callbacks.
        os.makedirs(img_all_path, exist_ok=True)

        target = img_all_path + f"{img_id}.{img_type}"
        async with aiofiles.open(target, 'wb') as fp:
            await fp.write(body)
        logging.info('Img downloaded successfully in %s', target)
if __name__ == '__main__':
    word = '妹子图'  # output directory name
    pages = 10  # number of listing pages to crawl
    MZTImgSpider.img_path = word + "/"
    # Crawl pages 1..pages. The previous range(pages) produced /page/0/ and
    # never reached the last requested page.
    # NOTE(review): assumes listing pages are numbered from 1 - confirm.
    MZTImgSpider.start_urls = [demo.format(page) for page in range(1, pages + 1)]
    MZTImgSpider.start(middleware=middleware)
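# Article title: "Welfare crawler: Meizitu images".
# (blog page residue) You may also like
# Reposted from www.cnblogs.com/c-x-a/p/10014425.html
# (blog page residue) Today's recommendations
# (blog page residue) Weekly ranking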