"""
Purpose: scrape proxy IPs from xicidaili.com, verify which ones actually
work, and export the valid ones to a CSV file.

Notes: the odd and even rows of the proxy table differ slightly in the page
markup, and the CSS selectors copied from Firefox/Chrome did not capture
every row, so find_all plus next_siblings is used instead.

Modules used: requests, BeautifulSoup, csv.

A little progress every day!
"""
import requests
from bs4 import BeautifulSoup
import csv
urls = []
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
valid_ip = []
def find_ip(number):
    """Scrape proxy addresses from the first *number* listing pages.

    Args:
        number: how many pages of https://www.xicidaili.com/nn/ to fetch
            (each page lists roughly 100 addresses).

    Returns:
        A list of single-entry dicts mapping the protocol column (e.g.
        'HTTP', 'HTTPS') to an 'ip:port' string.

    NOTE(review): requests expects lowercase 'http'/'https' keys in a
    proxies dict; the site supplies uppercase protocol names, so these
    entries may not be applied as proxies downstream — confirm.
    """
    ip_list = []
    # Build the page URLs locally instead of appending to the module-level
    # ``urls`` list, so repeated calls do not re-fetch earlier pages.
    page_urls = ['https://www.xicidaili.com/nn/' + str(i)
                 for i in range(1, number + 1)]
    for url in page_urls:
        html = requests.get(url, headers=headers).text
        soup = BeautifulSoup(html, 'html.parser')
        # The table alternates row markup; find_all only matches the odd
        # rows, so each matched row's following sibling row is parsed too.
        for tag in soup.find_all('tr', class_='odd'):
            # Odd row.
            entry = _parse_proxy_row(tag)
            if entry is not None:
                ip_list.append(entry)
            # Even row: skip the '\n' text node between sibling <tr> tags.
            sibling = tag.next_sibling
            if sibling is not None:
                sibling = sibling.next_sibling
            if sibling is not None:
                entry = _parse_proxy_row(sibling)
                if entry is not None:
                    ip_list.append(entry)
    return ip_list


def _parse_proxy_row(row):
    """Extract one {protocol: 'ip:port'} dict from a table row.

    Returns None when the row does not have the expected cells, instead of
    raising AttributeError/IndexError on markup drift.
    """
    country_cell = row.find('td', class_='country')
    if country_cell is None:
        return None
    cells = [x for x in country_cell.next_siblings if x != '\n']
    if len(cells) < 5:
        return None
    ip = cells[0].string
    port = cells[1].string
    protocol = cells[4].string
    if not (ip and port and protocol):
        return None
    return {protocol: ip + ':' + port}
def test_ip(ip_list):
# 测试用网站,随意使用(豆瓣电影对不住了)
url = 'https://movie.douban.com/'
i = 0
proxies = ip_list
for proxy in proxies:
# 使用try except来判断代理是否成功,requests.exceptions.ConnectionError 表示请求失败异常
try:
# 设置超时时间为5秒
response = requests.get(url, headers=headers, proxies=proxy, timeout=5)
# i += 1
# 字典方法.keys()返回字典结果类,使用tuple或list再转换为字符串
# ip = ','.join(tuple(proxy.values()))
# print('第' + str(i) + '次连接成功,IP地址是:' + ip)
valid_ip.append(proxy)
except requests.exceptions.ConnectionError:
i += 1
print('第' + str(i) + '次链接失败')
return valid_ip
def save_valid_ip(valid_ip,
                  file_path=r'C:\Users\Administrator\Desktop\ValidIp.csv'):
    """Export the validated proxies to a CSV file.

    Args:
        valid_ip: list of proxy entries to write, one per numbered row.
        file_path: destination CSV path. Defaults to the original
            hard-coded location for backward compatibility.
    """
    # newline='' stops the csv module from inserting a blank line after
    # each row on Windows.
    with open(file_path, 'w', newline='', encoding='utf-8') as f:
        field_names = ['序号', '有效代理IP地址']
        writer = csv.DictWriter(f, field_names)
        writer.writeheader()
        # enumerate is clearer than indexing with range(len(...)).
        for index, proxy in enumerate(valid_ip, start=1):
            writer.writerow({'序号': str(index),
                             '有效代理IP地址': proxy})
def main():
    """Scrape one page of proxies, validate them, and export the survivors."""
    # Number of listing pages to crawl; each page holds ~100 IP addresses.
    pages = 1
    candidates = find_ip(pages)
    working = test_ip(candidates)
    save_valid_ip(working)


if __name__ == '__main__':
    main()