IP proxy scraping: a Python exercise

import requests
from bs4 import BeautifulSoup
# Target URL and request headers
# Note: all the IPs listed there are outside the Great Firewall, so to reach
# the target site in the first place you already need a way out.
url = 'https://www.sslproxies.org/'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
}

# Fetch the raw page
def get_raw_data(url, headers):
    # A timeout keeps the script from hanging if the site is unreachable.
    response = requests.get(url, headers=headers, timeout=10)
    raw_data = response.content.decode()
    return raw_data

# Parse the HTML with BeautifulSoup
def bs_data(raw_data):
    # 'lxml' requires the lxml package; 'html.parser' works without it.
    soup = BeautifulSoup(raw_data, features='lxml')
    return soup

# Extract each proxy's IP, port, and location; return a list of strings
def bs_extract_ip(soup):
    # The matched cells arrive in groups of four per proxy:
    # IP, port, and two location/feature fields.
    ip_data = soup.find_all('td', class_='', colspan='')
    ip_list = []

    # Step through the cells four at a time. The bound is derived from the
    # data rather than hard-coded to 400, which assumed exactly 100 proxies.
    for i in range(0, len(ip_data) - len(ip_data) % 4, 4):
        ip_raw = ip_data[i].contents[0]
        ip_port = ip_data[i+1].contents[0]
        ip_address = ip_data[i+2].contents[0]
        ip_feature = ip_data[i+3].contents[0]  # not used below
        ip_proxy = '{}:{}'.format(ip_raw, ip_port)
        # Render each entry as a ready-to-paste requests `proxies` dict,
        # followed by the proxy's location.
        proxies = "{{'http': 'http://{ip_input}', 'https': 'https://{ip_input}'}} {ip_address}".format(
            ip_input=ip_proxy, ip_address=ip_address)
        ip_list.append(proxies)
    return ip_list
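
# The cell-counting approach above is fragile: it depends on exact td
# attributes and on cells arriving in a fixed order. A more defensive
# sketch (assumption: the page still lists proxies as rows of an HTML
# <table>, IP in the first cell, port in the second) walks the rows:
def bs_extract_ip_by_row(soup):
    ip_list = []
    table = soup.find('table')
    if table is None:  # page layout changed; bail out gracefully
        return ip_list
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 2:  # skip header and footer rows
            continue
        ip_list.append('{}:{}'.format(cells[0].get_text(strip=True),
                                      cells[1].get_text(strip=True)))
    return ip_list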


if __name__ == "__main__":
    raw_data = get_raw_data(url, headers)
    soup = bs_data(raw_data)
    ip_list = bs_extract_ip(soup)
    # One proxy entry per line; the ip_proxies directory must already exist.
    with open(r'ip_proxies\ip代理池.txt', 'w', encoding='utf-8') as f:
        for i in ip_list:
            f.write(i)
            f.write('\n')
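
The scraped entries are not guaranteed to be alive. A minimal follow-up sketch for filtering them: route one request through each candidate and keep only the ones that answer. The helper name check_proxy, the https://httpbin.org/ip echo endpoint, and the 5-second timeout are illustrative choices, not part of the original post; the function expects a bare 'ip:port' string (the ip_proxy value built above), not the formatted output line.

import requests

def check_proxy(ip_port, timeout=5):
    # Return True if the proxy relays a simple request within the timeout.
    proxies = {'http': 'http://' + ip_port, 'https': 'https://' + ip_port}
    try:
        response = requests.get('https://httpbin.org/ip',
                                proxies=proxies, timeout=timeout)
        return response.status_code == 200
    except requests.RequestException:
        return False

# Example: live = [p for p in candidates if check_proxy(p)]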

Reposted from blog.csdn.net/weixin_43977865/article/details/89556521