# 用bs4爬取网页图片--天堂图片网
# shutil:删除多层文件夹
import requests, os, shutil, re
from bs4 import BeautifulSoup
from multiprocessing import Pool
class ImageSpider(object):
    """Scrape nature-scenery photo albums from ivsky.com.

    Workflow: fetch a listing page, parse it for album links (creating one
    local directory per album), fetch each album's detail page, and save
    every image on it into that album's directory.

    Note: the original implementation navigated with os.chdir(); a failed
    detail request then left the process stranded inside an album directory
    and every later album was created nested inside it.  This version never
    changes the working directory — it records the target directory in
    ``self.save_dir`` and builds file paths explicitly.
    """

    def __init__(self):
        # Root of the listing pages for the "natural scenery" category.
        self.base_url = 'http://www.ivsky.com/tupian/ziranfengguang/'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0",
        }
        # Directory images of the album currently being processed go into;
        # set by parse_page_list, read by parse_page_detail.
        self.save_dir = None

    def get_page_list(self, url):
        """Fetch one listing page.

        :param url: listing-page path relative to ``base_url``
        :return: HTML text on HTTP 200, otherwise None
        """
        abs_url = self.base_url + url
        try:
            # timeout so a stalled server cannot hang a worker forever
            response = requests.get(abs_url, headers=self.headers, timeout=10)
            if response.status_code == 200:
                return response.text
            print('状态码:', response.status_code)
            return None
        except Exception as e:
            print('请求列表页失败:', e, abs_url)
            return None

    def parse_page_list(self, page_list_html):
        """Yield absolute detail-page URLs found on a listing page.

        Creates one directory per album (named after its title) and records
        it in ``self.save_dir`` for parse_page_detail.  Albums whose
        directory already exists are skipped entirely.

        :param page_list_html: HTML text of a listing page (or None)
        """
        if page_list_html:
            bs = BeautifulSoup(page_list_html, "lxml")
            # '.ali div a' is a descendant selector: album anchors inside
            # the listing container.
            a_list = bs.select('.ali div a')
            for a in a_list:
                href = a.get('href')
                # Replace characters that are illegal in file names so
                # os.mkdir cannot fail on titles such as 'a/b' or 'a:b'.
                title = re.sub(r'[\\/:*?"<>|]', '_', a.get('title').strip())
                print('开始下载:', title)
                # Skip albums that were already (partially) downloaded.
                if os.path.exists(title):
                    continue
                os.mkdir(title)
                self.save_dir = title
                # Build the absolute detail-page URL from the relative href.
                yield 'http://www.ivsky.com' + href

    def get_page_detail(self, detail_url):
        """Fetch one album detail page.

        :param detail_url: absolute URL of the detail page
        :return: HTML text on HTTP 200, otherwise None
        """
        try:
            response = requests.get(detail_url, headers=self.headers, timeout=10)
            if response.status_code == 200:
                return response.text
            print('状态码:', response.status_code)
            return None
        except Exception as e:
            print('请求详情页失败:', e, detail_url)
            return None

    def parse_page_detail(self, detail_html):
        """Download every image referenced on an album detail page.

        Images are written into ``self.save_dir``, which the matching
        parse_page_list iteration set just before yielding this page's URL.

        :param detail_html: HTML text of the detail page (or None)
        """
        if detail_html:
            bs = BeautifulSoup(detail_html, "lxml")
            for image in bs.select('.pli img'):
                src = image.get('src')
                img_name = src.split('/')[-1]
                try:
                    response = requests.get(src, headers=self.headers, timeout=10)
                except Exception as e:
                    # A single broken image must not abort the whole album.
                    print('请求图片失败:', e, src)
                    continue
                if response.status_code == 200:
                    path = os.path.join(self.save_dir or '.', img_name)
                    with open(path, 'wb') as f:
                        f.write(response.content)
# Module-level spider instance: each worker process spawned by the Pool
# re-imports this module and therefore gets its own independent copy.
img = ImageSpider()


def main(page_num):
    """Download every album linked from listing page number *page_num*."""
    print('开始下载第{}页...'.format(page_num))
    list_html = img.get_page_list('index_{}.html'.format(page_num))
    if not list_html:
        return
    # parse_page_list is a generator: each yielded URL corresponds to one
    # album whose directory was just created.
    for detail_url in img.parse_page_list(list_html):
        detail_html = img.get_page_detail(detail_url)
        if detail_html:
            img.parse_page_detail(detail_html)
if __name__ == '__main__':
    # Use the pool as a context manager so worker processes are always
    # released (the original never called close()/join()).  map() blocks
    # until every listing page 1-7 has been processed; a listcomp around
    # range() was redundant since map() accepts any iterable.
    with Pool() as pool:
        pool.map(main, range(1, 8))