[Crawler] Scraping usable proxy IPs from xicidaili

# coding=utf-8
import requests
from lxml import etree

ips = []

def run(page):
    """Scrape one listing page of https://www.xicidaili.com/nn/ and collect proxy URLs."""
    url = "https://www.xicidaili.com/nn/{}"
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36,PostmanRuntime/7.16.3",
        'Accept': "*/*",
        'Cache-Control': "no-cache",
        'Postman-Token': "e17c0361-c140-4e67-b4d7-1d4297b6876d,2da41bb3-79f5-40fd-a5a7-63c0acbd4442",
        'Host': "www.xicidaili.com",
        'Accept-Encoding': "gzip, deflate",
        'Cookie': "_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWYyNTA3YjBmOWFjNDAxOWJhYWEzNDg4YWQ0OTU5ZjYyBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUkxQnBlMzlsNmR3bExnWHltNklaWjFIdDJyNkdiVzE0cXUwR094TlErczQ9BjsARg%3D%3D--108c1be9a4e23604bde585654cfee79143f53fb6",
        'cache-control': "no-cache"
    }
    r = requests.get(url.format(page), headers=headers)

    selector = etree.HTML(r.text)
    # All rows of the proxy table; the first row is the header, so skip it.
    info_list = selector.xpath('//table[@id="ip_list"]//tr')
    info_list = info_list[1:]
    for info in info_list:
        ip = ''.join(info.xpath('./td[2]/text()'))
        port = ''.join(info.xpath('./td[3]/text()'))
        protocol = ''.join(info.xpath('./td[6]/text()'))
        ips.append(protocol + "://" + ip + ":" + port)
    print(ips)


# Save the collected proxies to a txt file, one per line.
def write_to_txt(lists):
    with open('ips.txt', 'w', encoding='utf-8') as f:
        f.write("\n".join(lists))


if __name__ == "__main__":
    for i in range(1, 5):
        print("================== Fetching page {} =====================".format(i))
        run(i)
    write_to_txt(ips)
    print(len(ips))
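
The script above only collects the proxy list; the "usable" part still needs a check, since many free proxies on xicidaili are dead. A minimal sketch of such a check, assuming https://httpbin.org/ip as the test endpoint and a 5-second timeout (both my own choices, not part of the original post):

# Hypothetical usability check for the scraped proxies (not from the original post).
import requests

def check_proxies(proxies, test_url="https://httpbin.org/ip", timeout=5):
    """Return the subset of proxy URLs that answer a simple GET within the timeout."""
    usable = []
    for proxy in proxies:
        # xicidaili lists the protocol in uppercase, so normalize the scheme.
        proxy = proxy.lower()
        try:
            r = requests.get(test_url,
                             proxies={"http": proxy, "https": proxy},
                             timeout=timeout)
            if r.status_code == 200:
                usable.append(proxy)
        except requests.RequestException:
            # Dead or refusing proxies are simply skipped.
            pass
    return usable

if __name__ == "__main__":
    with open("ips.txt", encoding="utf-8") as f:
        candidates = [line.strip() for line in f if line.strip()]
    print(check_proxies(candidates))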

Reposted from www.cnblogs.com/winstonsias/p/11528021.html