Python 爬取 IP 代理池并存入 Redis 数据库(比较基础的代码)

from bs4 import BeautifulSoup as Soup
import requests,redis
from selenium import webdriver
import random

# Shared Redis connection pool and client used by every helper below.
# The connection cap is configured on the pool itself: redis.Redis does not
# apply its own max_connections argument when an explicit connection_pool is
# supplied, so passing it to the client had no effect.
# NOTE(review): host and password are hard-coded here -- consider loading them
# from configuration or the environment.
db_conn = redis.ConnectionPool(
    host="10.1.1.11",
    port=6379,
    password="password",
    max_connections=10,
)
redis_conn = redis.Redis(connection_pool=db_conn)


# Obtain cookies by visiting the site in a real browser.
def get_cookie():
    """Open the proxy-list site in Chrome and return its cookies as a header value.

    Returns:
        str: ``name=value`` pairs for (at most) the first two cookies reported
        by the browser, joined with ``;``. An empty string is returned when
        the browser reports no cookies.
    """
    driver = webdriver.Chrome(r'C:\Users\jw\AppData\Local\Programs\Python\Python37-32\seleniumDriver\chromedriver.exe')
    try:
        url = "http://www.66ip.cn/areaindex_1/1.html"
        driver.get(url)
        cookie_list = driver.get_cookies()
    finally:
        # Always shut the browser down, otherwise every call leaves a Chrome
        # process (and its chromedriver) running.
        driver.quit()
    # Slicing instead of indexing [0] and [1] avoids an IndexError when the
    # site sets fewer than two cookies.
    Cookie = ";".join(
        "%s=%s" % (item["name"], item["value"]) for item in cookie_list[:2]
    )
    return Cookie

# Request the listing page with the browser cookie and store every scraped
# ip/port pair in Redis.
def get_ip(page):
    """Scrape one listing page of www.66ip.cn and add its proxies to the pool.

    Args:
        page: Page number (or its string form) substituted into the listing URL.

    Raises:
        IndexError: If the page does not contain the third ``<table>`` that is
            expected to hold the proxy list.
    """
    url = "http://www.66ip.cn/areaindex_1/%s.html" % page
    Cookie = get_cookie()
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        "Cookie": Cookie
    }
    # Without a timeout requests can block forever on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=10)
    # Decode the body using the encoding detected from its content.
    response.encoding = response.apparent_encoding
    soup = Soup(response.text, 'lxml')
    tables = soup.find_all('table')
    if len(tables) < 3:
        raise IndexError(
            "expected at least 3 <table> elements on %s, found %d"
            % (url, len(tables))
        )
    # The first row of the table is skipped (the original code treats it as
    # the header row).
    for row in tables[2].find_all('tr')[1:]:
        cells = row.find_all('td')
        if len(cells) < 3:
            # Malformed or spacer row: skip it instead of aborting the update.
            continue
        # Strip surrounding whitespace so the stored member and score are clean.
        ip, port, city = (cell.text.strip() for cell in cells[:3])
        print("%s,%s,%s" % (ip, port, city))
        add_ip(ip, port)

# Remove an ip from the pool stored in Redis.
def remove_ip(ip):
    """Delete ``ip`` from the "IP" sorted set and print a confirmation line.

    Args:
        ip: Member to remove from the sorted set.
    """
    pool_key = "IP"
    redis_conn.zrem(pool_key, ip)
    notice = "已删除 %s..." % ip
    print(notice)

# Count how many ips are currently stored in Redis.
def get_ip_num():
    """Return the number of members in the "IP" sorted set (ZCARD)."""
    return redis_conn.zcard("IP")
# Look up the port stored for an ip.
def get_port(ip):
    """Return the port recorded for ``ip`` in the "IP" sorted set.

    The port is stored as the member's score, which Redis returns as a float.

    Args:
        ip: Member whose score (port) should be looked up.

    Returns:
        str | None: The port as a string without a trailing ``.0``, or
        ``None`` when ``ip`` is not in the pool.
    """
    score = redis_conn.zscore("IP", ip)
    if score is None:
        # Unknown ip: return None rather than the misleading string "None".
        return None
    score = float(score)
    # Convert through int instead of str.replace(".0", ""), which would also
    # strip a ".0" from the middle of the text (80.05 -> "805").
    if score.is_integer():
        return str(int(score))
    return str(score)
# Add an ip and its port to the pool.
def add_ip(ip, port):
    """Store ``ip`` in the "IP" sorted set with ``port`` as its score.

    Args:
        ip: Member to add.
        port: Score stored alongside the member.
    """
    # nx is a boolean flag: only add new members, never update the score of an
    # existing one. The original passed the arbitrary truthy value 55.
    redis_conn.zadd("IP", {ip: port}, nx=True)
    print("已添加 %s %s...ok" % (ip, port))
# List every ip in the pool.
def get_all_ip():
    """Return all members of the "IP" sorted set (ZRANGE from 0 to -1)."""
    first, last = 0, -1
    return redis_conn.zrange("IP", first, last)
# Pick one ip at random from the pool.
def get_random_ip():
    """Return a randomly chosen ``(ip, port)`` pair from the "IP" sorted set.

    Returns:
        tuple: ``(ip, port)`` where ``ip`` is a ``str`` and ``port`` is
        whatever ``get_port`` returns for it.

    Raises:
        IndexError: If the pool is empty, or the chosen member disappeared
            between counting and fetching.
    """
    total = get_ip_num()
    if not total:
        raise IndexError("IP pool is empty")
    # Valid ranks are 0 .. total-1. randint(0, total) also produced `total`,
    # which is one past the end and made ZRANGE return nothing.
    index = random.randrange(total)
    members = redis_conn.zrange("IP", index, index)
    if not members:
        raise IndexError("no member found at rank %d in the IP pool" % index)
    random_ip = members[0]
    if isinstance(random_ip, bytes):
        # Decode properly instead of stripping "b" and quotes from repr().
        random_ip = random_ip.decode()
    port = get_port(random_ip)
    return random_ip, port



if __name__ == '__main__':
    # Interactive console menu: read one command per iteration and dispatch
    # it to the matching pool helper until the user types "q" or "exit".
    while True:
        print('''
        1.获取当前ip池的ip总数。
        2.列出ip。
        3.添加ip。
        4.删除ip。
        5.获取ip的端口。
        6.更新ip池。
        7.随机获得一个ip和端口。
        8.输入q或者exit退出。
                ''')
        choice = input("请输入: ")
        if choice in ('exit', 'q'):
            print("Bye...")
            break

        if choice == "1":
            # Total number of ips in the pool.
            print("IP的总数量是: %s" % get_ip_num())
        elif choice == "2":
            # Dump every stored ip, one per line.
            for member in get_all_ip():
                print(member)
        elif choice == "3":
            # Manually add an ip/port pair.
            new_ip = input("请输入要添加的ip: ")
            new_port = input("请输入要添加的port:")
            add_ip(new_ip, new_port)
        elif choice == "4":
            # Remove a single ip.
            remove_ip(input("请输入要删除的ip: "))
        elif choice == "5":
            # Show the port stored for an ip.
            wanted_ip = input("请输入要查询的ip: ")
            wanted_port = get_port(wanted_ip)
            print("%s 的端口是: %s" % (wanted_ip, wanted_port))
        elif choice == "6":
            # Re-scrape one page of the site into the pool.
            page_no = input("请输入更新(http://www.66ip.cn)第几页的ip?")
            get_ip(page_no)
            print("更新完成...")
        elif choice == "7":
            # Print a random ip together with its port.
            picked_ip, picked_port = get_random_ip()
            print(picked_ip, picked_port)

猜你喜欢

转载自blog.csdn.net/wojiuwangla/article/details/98875814
今日推荐