写这个博客的原因是又用到了代理，但是公司没有购买代理，没办法只有爬取一些免费的代理。虽然不太稳定，凑合着用吧。为了防止以后代码再重写一遍，这里把代码直接放在博客上以备后用。
import requests
from retry import retry
from scrapy import Selector
@retry(tries=8)  # BUGFIX: @retry(8) passed 8 as the `exceptions` arg (retry(exceptions, tries, ...)), breaking the decorator
def get_agent(url):
    """
    Crawl one listing page of free HTTP proxies and validate every entry.

    Each table row's IP and port are combined into a proxy dict and handed
    to check_agent(), which persists the working ones. When the page has a
    "next page" link the function follows it recursively.

    :param url: listing-page URL to scrape
    :return: None (working proxies are written to ./ip_agent.text by check_agent)
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'
    }
    # BUGFIX: original call had no timeout and could hang forever on a dead host.
    agent_response = requests.get(url=url, headers=headers, timeout=30)
    selector = Selector(text=agent_response.text)
    for each_agent in selector.xpath('//tr[@class="odd"]'):
        each_ip = each_agent.xpath('td[2]/text()').extract_first()
        each_port = each_agent.xpath('td[3]/text()').extract_first()
        # Skip malformed rows so we never build "http://None:None".
        if not each_ip or not each_port:
            continue
        full_agent = {"http": "http://%s:%s" % (each_ip, each_port)}
        check_agent(full_agent)
    next_page = selector.xpath('//a[@class="next_page"]/@href').extract_first()
    if next_page:
        print('this is %s page' % next_page)
        # Ask before going past page 4 so the crawl doesn't run unattended forever.
        if next_page == '/wt/4':
            choice = input('do you wan get next page? y or n :>')
            if choice == 'n':
                exit()
        get_agent('http://www.xicidaili.com' + next_page)
@retry(tries=8)  # BUGFIX: @retry(8) passed 8 as the `exceptions` arg (retry(exceptions, tries, ...)), breaking the decorator
def check_agent(full_agent):
    """
    Test whether a proxy works by fetching the source site through it.

    A proxy answering with HTTP 200 within 30 seconds is considered usable
    and appended to ./ip_agent.text; anything else is reported and dropped.

    :param full_agent: proxy mapping, e.g. {"http": "http://1.2.3.4:80"}
    :return: None
    """
    url = 'http://www.xicidaili.com/nt/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
        "Referer": "http://www.xicidaili.com/nn/1"
    }
    try:
        xi_ci_response = requests.get(url=url, headers=headers, proxies=full_agent,
                                      timeout=30)
    # BUGFIX: was `except BaseException`, which also swallowed
    # KeyboardInterrupt/SystemExit; requests.RequestException covers every
    # network failure requests.get can raise (timeout, proxy error, DNS, ...).
    except requests.RequestException as e:
        print(e)
    else:
        if xi_ci_response.status_code == 200:
            print('Successfully this %s agent is available' % full_agent)
            with open('./ip_agent.text', 'a+') as f:
                f.write('%s' % full_agent + '\n')
        else:
            print('Failed this %s agent is unavailable' % full_agent)
if __name__ == '__main__':
    # Entry point: start the crawl at the first page of the domestic
    # HTTP-proxy listing.
    start_url = 'http://www.xicidaili.com/wt/'
    get_agent(start_url)
我只爬取的是国内的 HTTP 代理。
爬取速度确实慢，慢慢让它爬取吧。