版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/pengjunlee/article/details/90174453
一、为什么要搭建爬虫代理池
在众多的网站防爬措施中,有一种是根据IP的访问频率进行限制:在某一时间段内,当某个IP的访问次数达到一定的阈值时,该IP就会被拉黑,并在一段时间内被禁止访问。
应对的方法有两种:
1. 降低爬虫的爬取频率,避免IP被限制访问,缺点显而易见:会大大降低爬取的效率。
2. 搭建一个IP代理池,使用不同的IP轮流进行爬取。
二、搭建思路
1、从代理网站(如:西刺代理、快代理、云代理、无忧代理)爬取代理IP;
2、验证代理IP的可用性(使用代理IP去请求指定URL,根据响应验证代理IP是否生效);
3、将可用的代理IP保存到数据库;
常用代理网站:西刺代理 、云代理 、IP海 、无忧代理 、飞蚁代理 、快代理
三、代码实现
工程结构如下:
ipproxy.py
IPProxy代理类定义了要爬取的IP代理的字段信息和一些基础方法。
# -*- coding: utf-8 -*-
import re
import time
from settings import PROXY_URL_FORMATTER
# Validation patterns for the proxy fields. Each one is anchored at both ends
# (``^`` ... ``\Z``) so that ``.match`` only succeeds on the whole string;
# ``\Z`` (unlike ``$``) does not tolerate a trailing newline.
schema_pattern = re.compile(r'^https?\Z', re.I)
# Four dot-separated groups of 1-3 digits. The dot is escaped so it no longer
# matches arbitrary characters; the 0-255 range is checked in _check_format.
ip_pattern = re.compile(r'^(?:[0-9]{1,3}\.){3}[0-9]{1,3}\Z')
# Two to five digits; the upper bound 65535 is checked in _check_format.
port_pattern = re.compile(r'^[0-9]{2,5}\Z')


class IPProxy:
    '''
    A crawled IP proxy together with its usage statistics.

    {
        "schema": "http",               # proxy type
        "ip": "127.0.0.1",              # proxy IP address
        "port": "8050",                 # proxy port
        "used_total": 11,               # number of times the proxy was used
        "success_times": 5,             # number of successful requests
        "continuous_failed": 3,         # consecutive failed requests
        "created_time": "2018-05-02"    # date the proxy was crawled
    }
    '''

    def __init__(self, schema, ip, port, used_total=0, success_times=0, continuous_failed=0,
                 created_time=None):
        """Initialize the proxy instance.

        An empty or missing ``schema`` defaults to ``"http"``; the schema is
        stored lower-cased. A missing ``created_time`` defaults to today's
        local date formatted as ``YYYY-MM-DD``.
        """
        if not schema:
            schema = "http"
        self.schema = schema.lower()
        self.ip = ip
        self.port = port
        self.used_total = used_total
        self.success_times = success_times
        self.continuous_failed = continuous_failed
        if created_time is None:
            created_time = time.strftime('%Y-%m-%d')
        self.created_time = created_time

    def _get_url(self):
        ''' Return the proxy url built from PROXY_URL_FORMATTER'''
        return PROXY_URL_FORMATTER % {'schema': self.schema, 'ip': self.ip, 'port': self.port}

    def _check_format(self):
        ''' Return True if the proxy fields are well-formed,otherwise return False'''
        if self.schema is None or self.ip is None or self.port is None:
            return False
        # The port may arrive as an int (e.g. parsed from JSON); compare its
        # text form so that an int does not raise TypeError in the regex.
        schema, ip, port = str(self.schema), str(self.ip), str(self.port)
        if not schema_pattern.match(schema):
            return False
        if not ip_pattern.match(ip):
            return False
        if not port_pattern.match(port):
            return False
        # The regexes only check the shape; enforce the numeric ranges here.
        if any(int(octet) > 255 for octet in ip.split('.')):
            return False
        return 0 < int(port) <= 65535

    def _is_https(self):
        ''' Return True if the proxy is https,otherwise return False'''
        return self.schema == 'https'
if __name__ == '__main__':
    # Quick manual demo. The sample IP is deliberately malformed (its first
    # group has four digits), so the format check is expected to reject it.
    proxy = IPProxy('HTTPS', '1922.168.2.25', "8080")
    print(proxy._get_url())
    print(proxy._check_format())
    print(proxy._is_https())
settings.py
settings.py中汇聚了工程所需要的配置信息。
# Redis host name and port
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
# Format string for the Redis key under which proxies are saved
# (what fills the `{}` placeholder is decided by the caller, not shown here)
PROXIES_REDIS_FORMATTER = 'proxies::{}'
# Redis key for the collection of already-known HTTP and HTTPS proxies
PROXIES_REDIS_EXISTED = 'proxies::existed'
# Maximum number of consecutive failures
MAX_CONTINUOUS_TIMES = 3
# Format string for a proxy URL; takes the named fields schema, ip and port
PROXY_URL_FORMATTER = '%(schema)s://%(ip)s:%(port)s'
# Browser User-Agent header values (all Chrome variants).
# NOTE(review): presumably one is picked per request to vary the crawler's
# fingerprint - the code that consumes this list is not shown here.
USER_AGENT_LIST = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
# Check that a crawled proxy is usable before saving it; defaults to True
PROXY_CHECK_BEFOREADD = True
# Request URLs used to check proxy availability, keyed by proxy schema
PROXY_CHECK_URLS = {'https':['https://icanhazip.com'],'http':['http://icanhazip.com']}