写这个博客的原因是又用到了代理，但是公司没有购买代理，没办法只有爬取一些免费的代理。虽然不太稳定，凑合着用吧。为了防止以后代码再重写一遍，这里把代码直接放在博客上以备后用。
import requests
from retry import retry
from scrapy import Selector
@retry(tries=8)  # BUGFIX: @retry(8) passed 8 as the `exceptions` arg (retry(exceptions, tries, ...)), breaking the decorator
def get_agent(url):
    """
    Crawl one listing page of free HTTP proxies and validate every entry.

    Each table row's IP and port are combined into a proxy dict and handed
    to check_agent(), which persists the working ones. When the page has a
    "next page" link the function follows it recursively.

    :param url: listing-page URL to scrape
    :return: None (working proxies are written to ./ip_agent.text by check_agent)
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'
    }
    # BUGFIX: original call had no timeout and could hang forever on a dead host.
    agent_response = requests.get(url=url, headers=headers, timeout=30)
    selector = Selector(text=agent_response.text)
    for each_agent in selector.xpath('//tr[@class="odd"]'):
        each_ip = each_agent.xpath('td[2]/text()').extract_first()
        each_port = each_agent.xpath('td[3]/text()').extract_first()
        # Skip malformed rows so we never build "http://None:None".
        if not each_ip or not each_port:
            continue
        full_agent = {"http": "http://%s:%s" % (each_ip, each_port)}
        check_agent(full_agent)
    next_page = selector.xpath('//a[@class="next_page"]/@href').extract_first()
    if next_page:
        print('this is %s page' % next_page)
        # Ask before going past page 4 so the crawl doesn't run unattended forever.
        if next_page == '/wt/4':
            choice = input('do you wan get next page? y or n :>')
            if choice == 'n':
                exit()
        get_agent('http://www.xicidaili.com' + next_page)
@retry(tries=8)  # BUGFIX: @retry(8) passed 8 as the `exceptions` arg (retry(exceptions, tries, ...)), breaking the decorator
def check_agent(full_agent):
    """
    Test whether a proxy works by fetching the source site through it.

    A proxy answering with HTTP 200 within 30 seconds is considered usable
    and appended to ./ip_agent.text; anything else is reported and dropped.

    :param full_agent: proxy mapping, e.g. {"http": "http://1.2.3.4:80"}
    :return: None
    """
    url = 'http://www.xicidaili.com/nt/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
        "Referer": "http://www.xicidaili.com/nn/1"
    }
    try:
        xi_ci_response = requests.get(url=url, headers=headers, proxies=full_agent,
                                      timeout=30)
    # BUGFIX: was `except BaseException`, which also swallowed
    # KeyboardInterrupt/SystemExit; requests.RequestException covers every
    # network failure requests.get can raise (timeout, proxy error, DNS, ...).
    except requests.RequestException as e:
        print(e)
    else:
        if xi_ci_response.status_code == 200:
            print('Successfully this %s agent is available' % full_agent)
            with open('./ip_agent.text', 'a+') as f:
                f.write('%s' % full_agent + '\n')
        else:
            print('Failed this %s agent is unavailable' % full_agent)
if __name__ == '__main__':
    # Entry point: start the crawl at the first page of the domestic
    # HTTP-proxy listing.
    start_url = 'http://www.xicidaili.com/wt/'
    get_agent(start_url)
我只爬取的是国内的 HTTP 代理。
爬取速度确实慢，慢慢让它爬取吧。