import requests
from bs4 import BeautifulSoup
# Target page to scrape and the HTTP headers sent with the request.
# NOTE: the proxies listed there are all outside the firewall; the catch is
# that reaching the target page in the first place already requires getting
# outside it.
url = 'https://www.sslproxies.org/'
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/73.0.3683.86 Safari/537.36'
    ),
}
def get_raw_data(url, headers, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        headers: Mapping of HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The response body decoded with ``bytes.decode()``'s default codec
        (UTF-8).

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response status.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an HTTP error instead of handing an error page to the
    # HTML parser as if it were the proxy table.
    response.raise_for_status()
    return response.content.decode()
def bs_data(raw_data):
    """Parse ``raw_data`` with BeautifulSoup using the ``lxml`` parser.

    Args:
        raw_data: HTML markup to parse.

    Returns:
        The ``BeautifulSoup`` object built from ``raw_data``.
    """
    return BeautifulSoup(raw_data, features='lxml')
def bs_extract_ip(soup, max_rows=100):
    """Extract proxy entries from the table cells found in ``soup``.

    The matching ``<td>`` cells are consumed in consecutive groups of four.
    In each group the first cell is used as the IP, the second as the port
    and the third as the location label; the fourth cell is not used.
    NOTE(review): the empty ``class_``/``colspan`` filters presumably select
    only cells lacking those attributes, which is what makes the stride four
    -- confirm against the live page layout.

    Args:
        soup: Parsed document exposing ``find_all`` (e.g. the object
            returned by ``bs_data``).
        max_rows: Upper bound on the number of entries returned. The default
            of 100 matches the previously hard-coded limit of 400 cells.

    Returns:
        A list of strings, one per complete group of four cells, each of the
        form ``{'http': 'http://IP:PORT', 'https': 'https://IP:PORT'} LABEL``.
        The list is empty when no complete group is present.
    """
    cells_per_row = 4
    cells = soup.find_all('td', class_='', colspan='')

    # Walk only the complete groups that actually exist; a fixed upper bound
    # of 400 raised IndexError whenever the page held fewer cells.
    limit = min(len(cells), max_rows * cells_per_row)
    limit -= limit % cells_per_row

    ip_list = []
    for start in range(0, limit, cells_per_row):
        ip_raw = cells[start].contents[0]
        ip_port = cells[start + 1].contents[0]
        ip_address = cells[start + 2].contents[0]
        ip_proxy = '{}:{}'.format(ip_raw, ip_port)
        proxies = "{{'http': 'http://{ip_input}', 'https': 'https://{ip_input}'}} {ip_address}".format(
            ip_input=ip_proxy, ip_address=ip_address)
        ip_list.append(proxies)
    return ip_list
if __name__ == "__main__":
    import os

    # Scrape first: opening the output file before the download meant a
    # failed request or parse left behind a truncated file and an unclosed
    # handle.
    raw_data = get_raw_data(url, headers)
    soup = bs_data(raw_data)
    ip_list = bs_extract_ip(soup)

    # Join the path components instead of hard-coding a backslash so the
    # file lands inside the ip_proxies directory on every platform, and
    # create that directory if it does not exist yet.
    out_dir = 'ip_proxies'
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, 'ip代理池.txt')

    # One proxy entry per line; the context manager closes the file even if
    # a write fails.
    with open(out_path, 'w', encoding='utf-8') as f:
        f.writelines(entry + '\n' for entry in ip_list)
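# IP proxy scraping -- Python practice
# (web-page residue: "You may also like")
# Reposted from blog.csdn.net/weixin_43977865/article/details/89556521
# (web-page residue: "Today's recommendations")
# (web-page residue: "Weekly ranking")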