This article describes how to build an IP-based proxy pool with Python and Redis. The sample code is explained in detail and has practical reference value for study or work; readers who need it are welcome to refer to it.
The apscheduler library is used to scrape proxy IPs on a schedule and to periodically re-check and delete stale ones. Checking is done in two layers: freshly scraped IPs go into Redis db0 for a first test, and those that pass are promoted to Redis db1 and tested again, which ensures the availability of the acquired proxy IPs.
import requests, redis
import pandas
import random
from apscheduler.schedulers.blocking import BlockingScheduler
import datetime
import logging
# Redis connection setup.
# BUG FIX: a ConnectionPool is bound to a single logical database, and
# redis.Redis() ignores its `db=` argument when `connection_pool=` is given,
# so the original code pointed BOTH clients at db 0.  Use one pool per
# database so db0 (first-stage check) and db1 (verified ips) are actually
# separate.  `max_connections` is a pool option, not a client option, so it
# moves onto the pools as well.
db_conn = redis.ConnectionPool(host="*.*.*.*", port=6379, password="123456",
                               db=0, max_connections=10)
db_conn_1 = redis.ConnectionPool(host="*.*.*.*", port=6379, password="123456",
                                 db=1, max_connections=10)
redis_conn_0 = redis.Redis(connection_pool=db_conn)    # staging pool (db0)
redis_conn_1 = redis.Redis(connection_pool=db_conn_1)  # verified pool (db1)
# 删除redis数据库里的ip
def remove_ip(ip, redis_conn):
    """Delete *ip* from the "IP" sorted set and announce the removal."""
    redis_conn.zrem("IP", ip)
    print("已删除 %s..." % ip)
# 获取redis数据库里一共有多少ip
def get_ip_num(redis_conn):
    """Return how many proxies are currently stored in the "IP" sorted set."""
    return redis_conn.zcard("IP")
# 获取ip的端口
def get_port(ip, redis_conn):
    """Return the port of *ip* (stored as its score in the "IP" sorted set).

    Redis returns scores as floats (e.g. 8080.0), so the value is converted
    through int() rather than the previous fragile string-replace of ".0".
    Returns "" when the ip is not in the set — the old code stringified the
    None score and returned the literal text "None".
    """
    score = redis_conn.zscore("IP", ip)
    if score is None:
        # ip missing from the set; "" matches the empty sentinel used by
        # get_random_ip for the no-proxy case.
        return ""
    return str(int(score))
# 添加ip和端口到数据库里
def add_ip(ip, port, redis_conn):
    """Add *ip* to the "IP" sorted set with *port* stored as its score.

    nx=True: only ever add new members, never overwrite the score of an
    existing ip.  The original passed nx=55; redis-py documents nx as a
    boolean flag (the code's own comment said "only True, False"), so the
    truthy magic number is replaced with an explicit True.
    """
    redis_conn.zadd("IP", {ip: port}, nx=True)
    print("已添加 %s %s...ok" % (ip, port))
# 列出所有的ip
def get_all_ip(redis_conn):
all_ip = redis_conn.zrange("IP", 0, -1)
return all_ip
# 随机获取一个ip
def get_random_ip(redis_conn):
end_num = get_ip_num(redis_conn)
num = random.randint(0, end_num)
random_ip = redis_conn.zrange("IP", num, num)
if not random_ip:
return "",""
random_ip = str(random_ip[0]).replace("b", '').replace("'", "")
port = get_port(random_ip,redis_conn)
return random_ip, port
# 获取代理ip
def spider_ip(x,redis_conn):
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), x)
for p in range(1, 20):
res = pandas.read_html("http://www.89ip.cn/index_{}.html".format(p))
# print(res)
# print(type(res[0]))
for i in range(len(res[0])):
ip = res[0].iloc[i, 0]
port = res[0].iloc[i, 1]
print("ip", ip)
print("port", port)
add_ip(str(ip), str(port),redis_conn)
# File logging: append INFO-and-above records to log1.txt, each tagged
# with a timestamp and the emitting file/line.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    filename='log1.txt',
    filemode='a',
)
def aps_detection_ip(x, redis_conn):
    """Scheduled job: pick one random proxy from *redis_conn* and probe it.

    A proxy that answers is promoted into the verified pool (redis_conn_1,
    unless it is already being checked from there); a dead one is removed
    from *redis_conn*.  *x* is a job label printed with the current time.

    Fixes over the original:
    * the probe URL is http:// but the proxies dict only mapped the
      'https' scheme, so requests never routed through the proxy and
      every ip "passed" the check; map both schemes.
    * requests.get had no timeout, so a single unresponsive proxy could
      hang the job indefinitely; cap the probe at 5 seconds.
    * an empty pool used to probe the nonsense address ":"; bail out
      early instead.
    """
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), x)
    ip, port = get_random_ip(redis_conn)
    if not ip:
        # nothing stored yet — nothing to check this round
        return
    proxy = '{ip}:{port}'.format(ip=ip, port=port)
    try:
        requests.get("http://www.baidu.com",
                     proxies={'http': proxy, 'https': proxy},
                     timeout=5)
        print("可用", ip, port, (ip, port))
        if redis_conn != redis_conn_1:
            add_ip(str(ip), str(port), redis_conn_1)
    except Exception:
        # dead or invalid proxy: drop it from the pool being checked
        remove_ip(ip, redis_conn)
# Job scheduling: BlockingScheduler runs all jobs in the foreground thread.
scheduler = BlockingScheduler()
# Probe a random ip from the staging pool (db0) every 3 seconds;
# up to 10 overlapping runs are tolerated (max_instances).
scheduler.add_job(func=aps_detection_ip, args=('检测循环任务0',redis_conn_0), trigger='interval', seconds=3, id='aps_detection_ip_task0',max_instances=10)
# Re-scrape fresh proxies into the staging pool every 2 hours.
scheduler.add_job(func=spider_ip, args=('获取循环任务0',redis_conn_0), trigger='interval', seconds=60*60*2, id='spider_ip_task0',max_instances=10)
# Re-probe the already-verified ips (db1) every 3 seconds so stale
# proxies get evicted from the verified pool too.
scheduler.add_job(func=aps_detection_ip, args=('检测循环任务1',redis_conn_1), trigger='interval', seconds=3, id='aps_detection_ip_task1',max_instances=10)
# HACK: assigns the `logging` module itself (it duck-types as a logger —
# has info/warning/error functions) to the scheduler's private _logger
# attribute so job errors land in log1.txt.
scheduler._logger = logging
if __name__ == '__main__':
    # Blocks here forever, running the three interval jobs defined above.
    scheduler.start()
That concludes the article. Finally, the author recommends a well-regarded programmers' community where experienced developers share study techniques, learning experience, interview tips, and workplace advice, along with carefully prepared beginner-friendly materials, real-project walkthroughs, and daily explanations of Python techniques — readers who like it are invited to follow.