The code is posted as-is below, without a step-by-step explanation; it can be copied and pasted for use. If it does not run as-is, reply "code" to get the source code.
import requests
from lxml import etree
import json
class XiciProxiesSpider(object):
    """Collect free proxies from the kuaidaili.com listing and keep live ones.

    Despite the class name, the pages fetched are the kuaidaili.com
    ``/free/inha/<page>`` listings. Each table row is turned into a
    ``{scheme: 'ip:port'}`` mapping, probed with a real request through the
    proxy, and appended to the output file when the probe succeeds.

    Attributes:
        num: 1-based number of the listing page that will be fetched next.
        start_url: URL of page ``num``; refreshed every time ``num`` changes.
        headers: HTTP headers sent when fetching listing pages.
        output_path: File that ``save_data`` appends results to.
        timeout: Seconds to wait for a listing page before giving up.
    """

    # Listing URL; ``{}`` is replaced by the page number.
    URL_TEMPLATE = 'https://www.kuaidaili.com/free/inha/{}'
    # Proxy schemes that are collected; rows of any other type are skipped.
    PROXY_TYPES = ('http',)

    def __init__(self):
        self.num = 1
        self.start_url = self.URL_TEMPLATE.format(self.num)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'
        }
        self.output_path = '快代理.txt'
        self.timeout = 10

    def get_page_from_url(self, url):
        """Fetch ``url`` and return the response body decoded as text.

        Raises:
            requests.RequestException: on connection errors or when no
                response arrives within ``self.timeout`` seconds.
        """
        # Without a timeout a stalled server would block the crawl forever.
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        return response.content.decode()

    def get_data_from_page(self, page):
        """Return the working proxies found in the HTML text ``page``.

        Returns:
            A dict mapping every scheme in ``PROXY_TYPES`` to a list of
            ``{scheme: 'ip:port'}`` items that passed ``validate_ip``. A dict
            is always returned, even when the page holds no usable row.
        """
        data, _ = self._collect(page)
        return data

    def _collect(self, page):
        """Parse ``page`` and return ``(data, row_count)``.

        ``row_count`` is the number of ``<tr>`` elements found, which lets
        the caller tell "no proxy passed validation" apart from "the page
        has no listing at all".
        """
        data = {proxy_type: [] for proxy_type in self.PROXY_TYPES}
        if not page:
            return data, 0

        # Convert the page text into an Element tree.
        html = etree.HTML(page)
        if html is None:
            return data, 0

        # Rows of the proxy table.
        trs = html.xpath('//tbody//tr')
        for tr in trs:
            try:
                ip = tr.xpath('./td[1]/text()')[0].strip()  # IP address
                port = tr.xpath('./td[2]/text()')[0].strip()  # port
                # Proxy type, lower-cased so it can be used as a scheme.
                ip_type = tr.xpath('./td[4]/text()')[0].strip().lower()
            except IndexError as ex:
                # A cell is missing or empty: report the row and move on.
                print(ex)
                print(etree.tostring(tr))
                continue

            # Skip (rather than abort on) rows whose type is not collected,
            # so one foreign row does not throw away the rest of the page.
            if ip_type not in data:
                continue

            # Build the mapping in the shape ``requests`` expects for proxies.
            item = {ip_type: '{}:{}'.format(ip, port)}
            # Keep the proxy only if a request through it succeeds.
            if self.validate_ip(item, ip_type):
                data[ip_type].append(item)

        print("222", data)
        return data, len(trs)

    def validate_ip(self, item, ip_type, timeout=2):
        """Return True if a request routed through proxy ``item`` gets a 200.

        Args:
            item: ``{scheme: 'ip:port'}`` mapping passed as ``proxies``.
            ip_type: Scheme used to build the probe URL.
            timeout: Seconds to wait for the probe before rejecting the proxy.
        """
        test_url = "{}://blog.csdn.net/weixin_43407092/article/details/89743502".format(ip_type)
        try:
            response = requests.get(test_url, proxies=item, timeout=timeout)
        except (requests.RequestException, ValueError):
            # Dead, slow or malformed proxies are simply treated as unusable.
            return False
        return response.status_code == 200

    def save_data(self, data):
        """Append ``data`` as JSON to the output file and move to the next page.

        Each call writes one JSON document followed by a newline so that
        successive documents do not run into each other.
        """
        with open(self.output_path, 'a', encoding='utf-8') as f:
            json.dump(data, f, indent=2)
            f.write('\n')
        self._advance_page()

    def _advance_page(self):
        """Increment the page number and rebuild ``start_url`` to match it."""
        self.num += 1
        self.start_url = self.URL_TEMPLATE.format(self.num)

    def run(self, max_pages=None):
        """Crawl listing pages one after another, saving the live proxies.

        The crawl stops when a page cannot be fetched, when a page contains
        no table rows, or after ``max_pages`` pages have been saved.

        Args:
            max_pages: Upper bound on the number of pages to process;
                ``None`` means keep going until one of the other stop
                conditions is met.
        """
        pages_done = 0
        while max_pages is None or pages_done < max_pages:
            # Fetch the current listing page.
            try:
                page = self.get_page_from_url(self.start_url)
            except requests.RequestException as ex:
                print(ex)
                break

            # Extract the proxies that pass validation.
            data, row_count = self._collect(page)
            if row_count == 0:
                # No table rows: nothing left to crawl (or the request was
                # rejected), so looping further would be pointless.
                break

            # Persist the results; this also advances to the next page.
            self.save_data(data)
            pages_done += 1
if __name__ == '__main__':
    # Script entry point: build the spider and start crawling.
    spider = XiciProxiesSpider()
    spider.run()
The execution results are shown below; only a few of the proxies turn out to be usable.