简介
在众多网站中,通过检测某一ip的请求频率来判断是否为爬虫请求,是最高效、最准确的反爬策略之一,所以写爬虫程序时限制请求的频率尤其重要。但当限速也无法避免被封时,就需要借助其他手段了。通过ip代理池在每次请求时随机更换ip,可以非常有效地应对这种反爬策略。这里主要说一下不要钱的方法。
不要钱的方法
免费的ip代理在百度一搜一大堆,这种方法就是将免费的代理ip爬下来并存到数据库,自己维护一个ip代理池。但免费的ip代理非常不稳定,不推荐在实际项目中使用,这里主要是给初学者学习参考。
import requests
from scrapy.selector import Selector
import pymysql
from urllib import request
import time
# MySQL connection settings; ProxyIp.__init__ reads each key by name and
# forwards it to pymysql.Connect.
dbparms = {
    'host': '127.0.0.1',
    'user': 'root',
    'password': '123456',
    'database': 'ProxyIpPool',
    'port': 3306,
    'charset': 'utf8',
}
class ProxyIp(object):
    """Maintain a pool of free HTTP proxies in the MySQL ``proxies`` table.

    Proxies are scraped from the xicidaili listing pages, probed by fetching
    Baidu through them, and stored as ``(ip, port)`` rows. ``GetProxyIp``
    hands out a random stored proxy and deletes rows that no longer respond.
    """

    def __init__(self):
        """Set the crawl defaults and try to open the MySQL connection.

        A failed connection is reported with ``print`` instead of being
        raised. In that case ``self.db`` and ``self.Cursor`` stay ``None``,
        and the database helpers report the resulting error when called.
        """
        self.url = 'https://www.xicidaili.com/nn/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0'
        }
        # Define both attributes up front so they always exist, even when
        # the connection attempt below fails.
        self.db = None
        self.Cursor = None
        try:
            self.db = pymysql.Connect(host=dbparms['host'],
                                      user=dbparms['user'],
                                      password=dbparms['password'],
                                      port=dbparms['port'],
                                      database=dbparms['database'],
                                      charset=dbparms['charset'])
            self.Cursor = self.db.cursor()
        except Exception as e:
            # Best-effort construction: report the failure, keep the object.
            print(e)

    def UpdateProxyIp(self, timeout=10):
        """Crawl the listing pages and store every proxy that responds.

        Starting at ``self.url``, each page's proxy table is parsed, every
        listed proxy is probed, and working ones are inserted into the
        database. Crawling follows the "next page" link until none is found.

        Args:
            timeout: Seconds to wait for each listing page before giving up.

        Returns:
            ``False`` if a listing page could not be fetched; ``None`` once
            the last page has been processed.
        """
        # Local import: urllib.parse is the documented home of urljoin
        # (urllib.request only happens to re-export it).
        from urllib.parse import urljoin

        url = self.url
        while True:
            try:
                # Without a timeout a stalled server would block forever.
                response = requests.get(url=url, headers=self.headers,
                                        timeout=timeout)
            except requests.RequestException:
                print('{0}:请求出错'.format(url))
                return False
            print('{0}:请求成功'.format(url))

            selector = Selector(text=response.text)
            ip_list = selector.xpath('.//div[@id="body"]/table[@id="ip_list"]/tr')
            # The first <tr> is skipped (presumably the table header row).
            for each in ip_list[1:]:
                ip = each.xpath('./td[2]/text()').extract_first()
                port = each.xpath('./td[3]/text()').extract_first()
                if ip is None or port is None:
                    # Row without an ip/port cell: nothing to probe.
                    continue
                if self.__TestProxyIp(ip, port):
                    self.__InsertProxyIp(ip, port)

            href = selector.xpath('.//div[@class="pagination"]/a[@class="next_page"]/@href').extract_first()
            if href is None:
                print('爬取结束')
                break
            url = urljoin('https://www.xicidaili.com/', href)

    def GetProxyIp(self):
        """Return a random working proxy from the database.

        A random row is drawn and probed. A proxy that fails the probe is
        deleted and another row is drawn.

        Returns:
            A ``'http://ip:port'`` string, or ``None`` when no row could be
            read (empty table or query error).
        """
        while True:
            result = self.__SelectProxyIp()
            if result is None:
                print("查询出错")
                return None
            ip, port = result[0], result[1]
            if self.__TestProxyIp(ip=ip, port=port):
                return 'http://{0}:{1}'.format(ip, port)
            self.__DeleteProxyIp(ip, port)

    def __TestProxyIp(self, ip, port):
        """Report whether Baidu can be fetched through the given proxy.

        Args:
            ip: Proxy host.
            port: Proxy port.

        Returns:
            ``True`` if the request completes within 2 seconds without
            raising, otherwise ``False``.
        """
        proxy = 'http://{0}:{1}'.format(ip, port)
        proxies = {
            'http': proxy,
            'https': proxy
        }
        url = 'http://www.baidu.com'
        try:
            requests.get(url=url, proxies=proxies, timeout=2)
        except Exception:
            # Any failure means "unusable". Catching Exception instead of a
            # bare except lets Ctrl-C / SystemExit still stop the crawl.
            print('{0}:{1} :代理不可用'.format(ip, port))
            return False
        print('{0}:{1} :代理可用'.format(ip, port))
        return True

    def __InsertProxyIp(self, ip, port):
        """Insert ``(ip, port)`` into ``proxies`` unless the ip is already stored.

        Database errors are printed, not raised.
        """
        sql = """
INSERT INTO proxies(ip, port)
SELECT %s, %s
WHERE NOT EXISTS(
SELECT ip
FROM proxies
WHERE ip = %s);
"""
        proxies = (ip, port, ip)
        try:
            self.Cursor.execute(sql, proxies)
            self.db.commit()
        except Exception as e:
            print(e)

    def __SelectProxyIp(self):
        """Fetch one random ``(ip, port)`` row from ``proxies``.

        Returns:
            The row, or ``None`` if the table is empty or the query fails
            (a query error is printed).
        """
        sql = """
SELECT ip, port
FROM proxies
ORDER BY rand()
LIMIT 1
"""
        try:
            self.Cursor.execute(sql)
            # fetchone() yields None on an empty result, so an empty table
            # is handled without relying on an IndexError.
            return self.Cursor.fetchone()
        except Exception as e:
            print(e)
            return None

    def __DeleteProxyIp(self, ip, port):
        """Delete the ``(ip, port)`` row from ``proxies``.

        Database errors are printed, not raised.
        """
        sql = """
DELETE FROM proxies
WHERE ip=%s AND port=%s
"""
        proxies = (ip, port)
        try:
            self.Cursor.execute(sql, proxies)
            self.db.commit()
        except Exception as e:
            print(e)
要钱的方法
付费的ip代理网上也是一大堆,在这里就推荐一下crawlera,下面是转载秋楓写的具体使用方法。