Batch-scraping an anti-scraping image site with requests + bs4

Overview: scraping an image site that uses anti-scraping measures

Preview

(screenshot of the downloaded results omitted)

Problem encountered:

When the crawler first ran, every image it downloaded turned out to be the same redirected promotional image.
Solution: add a Referer header to the requests headers, pointing at the top-level domain of the site being scraped (adjust to the actual site). The image server uses the Referer header as hotlink protection, so requests without an appropriate Referer are redirected to a promotional placeholder.
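A minimal sketch of the fix, using the anonymized placeholder domains from this article (substitute the real site):

import requests

# Without a Referer that matches the scraped site, the image CDN redirects
# to a promotional placeholder instead of the real image.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Referer": "http://www.xxx.com/xiaohua/"   # section of the source site (placeholder domain)
}
resp = requests.get("http://img1.xxx.me/pic/3683/1.jpg", headers=headers, stream=True)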

Crawler code

import os
import re
import requests
from contextlib import closing
from bs4 import BeautifulSoup
import json
import random
import time

# Download directory
DOWNLOAD_PATH = 'C:\\pictures\\美女校花\\'

# Maximum number of list pages to crawl
MAX_PAGES = 30

# Base URL of the target site
BASE_URL = 'http://www.xxx.com/' 

# Gallery section to crawl
BEAUTY_BASE_URL = BASE_URL + 'xiaohua/'

# List page file name pattern (page 1)
LIST_URL_ITEM = 'list_1_1.html'

# Image CDN base URL, e.g. http://img1.xxx.me/pic/3683/1.jpg
BASIC_IMG_URL = 'http://img1.xxx.me/pic/'
 
class mmPicture(object):

    """ Image downloader """
    def __init__(self):
        super(mmPicture, self).__init__()
        self.offset = 1                      # current list-page number
        self.list_url = BEAUTY_BASE_URL      # URL of the list page being crawled
        self.all_group_links = []            # links to detail (gallery) pages
        self.all_img_links = []              # image entries: {'name': ..., 'url': ...}

        
    # Crawl the list pages and collect links to the detail (gallery) pages
    def requestDataList(self):
        headers = { "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36" }
        if self.offset != 1:
            self.list_url = BEAUTY_BASE_URL + 'list_1_' + str(self.offset) + '.html'

        print(self.list_url)

        with closing(requests.get(self.list_url, headers=headers)) as response:
            response.encoding = "GBK"
            soup = BeautifulSoup(response.text, "html.parser")
            for sou in soup.find_all("dd"):
                a_tag = sou.find('a')
                if a_tag is None or not a_tag.has_attr('href'):
                    continue
                # Keep only absolute links that point back to the target site
                if re.match(r'.*/{2}www\.mm131\.com/.*', a_tag['href']):
                    self.all_group_links.append(a_tag['href'])
            self.offset += 1
            print(self.offset)

        if self.offset < MAX_PAGES:
            # Be polite: wait a second before requesting the next list page
            time.sleep(1)
            self.requestDataList()
        else:
            # All list pages crawled: persist the gallery links, then crawl the detail pages.
            # Calling requestDetail() only here (not on every recursion level) avoids
            # re-crawling and re-downloading everything once per list page.
            with open('./group.json', 'w') as f:
                f.write(json.dumps(self.all_group_links))
            self.requestDetail()

    # Crawl each detail (gallery) page and collect the image URLs
    def requestDetail(self):

        # Read the gallery links collected by requestDataList()
        with open('./group.json', 'r') as f:
            self.all_group_links = json.load(f)

        # Visit every gallery page and extract its image links
        for list_item_url in self.all_group_links:
            headers = { "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36" }
            with closing(requests.get(list_item_url, headers=headers)) as response:
                response.encoding = "GBK"
                soup = BeautifulSoup(response.text, "html.parser")

                # Pagination links look like href="1264_2.html";
                # the matching image lives at <BASIC_IMG_URL>1264/2.jpg
                page_links = soup.select('.page-en')
                for page in page_links:
                    match = re.match(r'\d+_\d+', page['href'])
                    if match is None:
                        # Skip "previous/next" style links that do not follow the pattern
                        continue
                    group_id, page_no = match.group(0).split('_')
                    self.all_img_links.append({
                        'name': soup.find('h5').string,
                        'url': group_id + '/' + page_no + '.jpg'
                    })
            print(list_item_url)

        # Persist the image list so downloads can be resumed without re-crawling
        with open('./links.json', 'w') as f:
            f.write(json.dumps(self.all_img_links))

        # Download every image
        self.downloadAllImgs()


    # Download every image listed in links.json
    def downloadAllImgs(self):
        with open('./links.json', 'r') as f:
            self.all_img_links = json.load(f)

        for imgItem in self.all_img_links:
            print(BASIC_IMG_URL + imgItem['url'])
            # Download the file
            self.downloads(imgItem)

    # Create the download directory if it does not exist
    def mkdir(self, path):

        # Strip leading/trailing whitespace
        path = path.strip()
        # Strip a trailing backslash
        path = path.rstrip("\\")

        if not os.path.exists(path):
            os.makedirs(path)  # create the directory (and any missing parents)
            return True
        else:
            # Directory already exists; nothing to do
            return False
    # Download a single image with a progress indicator
    def downloads(self, item):
        headers = { 
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36", 
            # The Referer header is what defeats the hotlink protection:
            # without it the CDN returns a promotional placeholder image.
            "Referer": "http://www.xxx.com/xiaohua/"
        }
        with closing(requests.get(BASIC_IMG_URL + item['url'], headers=headers, stream=True)) as response:
            chunk_size = 1024  # bytes per chunk
            content_size = int(response.headers['content-length']) / chunk_size / chunk_size  # total size in MB
            data_count = 0
            print('\nStarting download:\n')
            # Make sure the target directory exists
            self.mkdir(DOWNLOAD_PATH + item['name'])
            # Stream the response to disk, printing progress as we go
            with open(DOWNLOAD_PATH + item['name'] + '\\' + item['name'] + str(random.randrange(0, 1000)) + '.jpg', 'wb') as file:
                for data in response.iter_content(chunk_size=chunk_size):
                    file.write(data)
                    data_count += len(data) / chunk_size / chunk_size
                    now_progress = (data_count / content_size) * 100
                    print("\r Download progress: %d%% (%d MB/%d MB) - %s " % (now_progress, data_count, content_size, item['name']), end=" ")
            print('\n\nDownload complete!\n')

# Entry point: download the images already listed in links.json
mm = mmPicture()
mm.downloadAllImgs()
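
The script is organized as a three-stage pipeline, with each stage caching its result to a local JSON file: requestDataList() collects gallery links into group.json, requestDetail() expands them into image entries in links.json, and downloadAllImgs() streams the images to disk. For a full crawl from scratch (a sketch, assuming the placeholder domains above are replaced with the real site), call the first stage instead of the last:

mm = mmPicture()
mm.requestDataList()   # crawls the list pages, then chains into requestDetail() and downloadAllImgs()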



Reposted from blog.csdn.net/WU5229485/article/details/87968617