Overview: scraping an image site with anti-scraping measures
Preview of the results
Problem encountered:
When the crawler first ran, every image it fetched turned out to be the same promotional image served via a redirect.
Solution: set the Referer field in the requests headers to the top-level domain of the site being scraped (adjust to the actual site). With a plausible Referer, the image server's hotlink check passes and the real image is returned.
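A minimal standalone sketch of the fix, using the same placeholder domains as the full script below (the image URL is the sample from the script's comments):

import requests

# The image server compares the Referer header against its own domain; a request
# without one (or with a foreign one) gets the promotional placeholder instead.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Referer": "http://www.xxx.com/xiaohua/",  # top-level page of the target site
}
resp = requests.get('http://img1.xxx.me/pic/3683/1.jpg', headers=headers)
print(resp.status_code, len(resp.content))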
Crawler code
import os
import re
import json
import random
import time
from contextlib import closing

import requests
from bs4 import BeautifulSoup
# Download directory (Windows-style path)
DOWNLOAD_PATH = 'C:\\pictures\\美女校花\\'
# Maximum number of list pages to crawl
MAX_PAGES = 30
# Base URL of the site
BASE_URL = 'http://www.xxx.com/'
# Campus-beauty section
BEAUTY_BASE_URL = BASE_URL + 'xiaohua/'
# List-page URL pattern
LIST_URL_ITEM = 'list_1_1.html'
# Image CDN prefix, e.g. http://img1.xxx.me/pic/3683/1.jpg
BASIC_IMG_URL = 'http://img1.xxx.me/pic/'
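# For reference: a detail page numbered 3683, image page 1, maps to the CDN
# path '3683/1.jpg', so the final URL is BASIC_IMG_URL + '3683/1.jpg'
# (the sample URL in the comment above). requestDetail() below derives these
# paths from pagination hrefs of the form '3683_1.html'.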
class mmPicture(object):
    """Image downloader."""

    def __init__(self):
        super(mmPicture, self).__init__()
        self.offset = 1                   # current list-page number
        self.list_url = BEAUTY_BASE_URL   # list page currently being crawled
        self.all_group_links = []         # links to detail pages
        self.all_img_links = []           # {'name', 'url'} records for each image
    # Crawl the list pages and collect links to the detail pages
    def requestDataList(self):
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}
        if self.offset != 1:
            self.list_url = BEAUTY_BASE_URL + 'list_1_' + str(self.offset) + '.html'
        print(self.list_url)
        with closing(requests.get(self.list_url, headers=headers)) as response:
            response.encoding = "GBK"
            soup = BeautifulSoup(response.text, "html.parser")
            for sou in soup.find_all("dd"):
                a_tag = sou.find('a')
                if a_tag is None:
                    continue
                # Keep only links that point back into the target domain
                if re.match(r'.*/{2}www\.xxx\.com/.*', a_tag['href']):
                    self.all_group_links.append(a_tag['href'])
        self.offset += 1
        print(self.offset)
        if self.offset < MAX_PAGES:
            # Throttle before fetching the next list page
            time.sleep(1)
            self.requestDataList()
        else:
            # Checkpoint the collected links so the crawl can be resumed
            with open('./group.json', 'w') as f:
                f.write(json.dumps(self.all_group_links))
            # Move on to the detail pages
            self.requestDetail()
    # Visit each detail page and collect the image URLs
    def requestDetail(self):
        # Reload the checkpointed detail-page links
        with open('./group.json', 'r') as f:
            self.all_group_links = json.load(f)
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}
        for list_item_url in self.all_group_links:
            with closing(requests.get(list_item_url, headers=headers)) as response:
                response.encoding = "GBK"
                soup = BeautifulSoup(response.text, "html.parser")
                # Pagination links look like href="1264_2.html";
                # the matching image lives at <group>/<page>.jpg, e.g. 1264/2.jpg
                for page in soup.select('.page-en'):
                    match = re.match(r'(\d+)_(\d+)', page['href'])
                    if match is None:
                        continue
                    img_url = match[1] + '/' + match[2] + '.jpg'
                    self.all_img_links.append({
                        'name': soup.find('h5').string,
                        'url': img_url
                    })
            print(list_item_url)
        # Checkpoint the image links to a local file
        with open('./links.json', 'w') as f:
            f.write(json.dumps(self.all_img_links))
        # Download everything
        self.downloadAllImgs()
    # Download every collected image
    def downloadAllImgs(self):
        with open('./links.json', 'r') as f:
            self.all_img_links = json.load(f)
        for imgItem in self.all_img_links:
            print(BASIC_IMG_URL + imgItem['url'])
            self.downloads(imgItem)

    # Create the target directory if it does not exist yet
    def mkdir(self, path):
        # Strip surrounding whitespace and any trailing backslash
        path = path.strip().rstrip("\\")
        if not os.path.exists(path):
            os.makedirs(path)
            return True
        # The directory already exists; nothing to create
        return False
    # Download a single image, streaming it to disk with a progress readout
    def downloads(self, item):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
            # The Referer header is what defeats the hotlink protection
            "Referer": "http://www.xxx.com/xiaohua/"
        }
        with closing(requests.get(BASIC_IMG_URL + item['url'], headers=headers, stream=True)) as response:
            chunk_size = 1024  # bytes per chunk
            # Total body size in megabytes
            content_size = int(response.headers['content-length']) / chunk_size / chunk_size
            data_count = 0
            print('\nStarting download:\n')
            # Make sure the per-group directory exists
            self.mkdir(DOWNLOAD_PATH + item['name'])
            # Random suffix avoids name clashes between images of the same group
            file_path = DOWNLOAD_PATH + item['name'] + '\\' + item['name'] + str(random.randrange(0, 1000)) + '.jpg'
            with open(file_path, 'wb') as file:
                for data in response.iter_content(chunk_size=chunk_size):
                    file.write(data)
                    data_count += len(data) / chunk_size / chunk_size
                    now_progress = (data_count / content_size) * 100
                    print("\rDownload progress: %d%% (%d M/%d M) - %s" % (now_progress, data_count, content_size, item['name']), end=" ")
        print('\n\nDownload finished!\n')
# Resume from the cached links.json and download the images
mm = mmPicture()
mm.downloadAllImgs()
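To crawl everything from scratch instead of resuming from the cached JSON files, start at the list crawler; requestDataList() chains into requestDetail() and then downloadAllImgs() on its own, writing group.json and links.json as checkpoints along the way:

mm = mmPicture()
mm.requestDataList()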