python-西刺代理的获取

写这个博客的原因是又用到了代理,但是公司没有购买代理,没办法只有爬取一些免费的代理,虽然不太稳定,凑合着用吧。为了防止以后代码再重写一遍,这里把代码直接放在博客上以备后用。

import requests
from retry import retry
from scrapy import Selector


@retry(tries=8)  # BUG FIX: retry()'s first positional arg is `exceptions`, not `tries`
def get_agent(url):
    """
    Crawl HTTP proxy listings from xicidaili and validate each entry.

    Walks the paginated listing starting at ``url``; every ip:port row found
    is handed to ``check_agent`` which persists the working ones.

    :param url: listing page URL to start scraping from
    :return: None (working proxies are written to disk by ``check_agent``)
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'
    }
    # Iterate pages with a loop instead of recursion so a long crawl cannot
    # hit Python's recursion limit.
    while url:
        # timeout prevents the crawler from hanging forever on a dead server
        agent_response = requests.get(url=url, headers=headers, timeout=30)
        selector = Selector(text=agent_response.text)
        for each_agent in selector.xpath('//tr[@class="odd"]'):
            each_ip = each_agent.xpath('td[2]/text()').extract_first()
            each_port = each_agent.xpath('td[3]/text()').extract_first()
            # Skip malformed rows rather than emitting "http://None:None".
            if not each_ip or not each_port:
                continue
            full_agent = {"http": "http://%s:%s" % (each_ip, each_port)}
            check_agent(full_agent)

        next_page = selector.xpath('//a[@class="next_page"]/@href').extract_first()
        if not next_page:
            break
        print('this is %s page' % next_page)
        if next_page == '/wt/4':
            choice = input('do you wan get next page? y or n :>')
            if choice == 'n':
                exit()
        url = 'http://www.xicidaili.com' + next_page


@retry(tries=8)  # BUG FIX: retry()'s first positional arg is `exceptions`, not `tries`
def check_agent(full_agent):
    """
    Check whether a proxy works by fetching xicidaili through it.

    Working proxies are appended to ``./ip_agent.text``; dead ones are only
    reported on stdout (best-effort, errors do not propagate).

    :param full_agent: proxy mapping, e.g. ``{"http": "http://1.2.3.4:80"}``
    :return: None
    """
    url = 'http://www.xicidaili.com/nt/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
        "Referer": "http://www.xicidaili.com/nn/1"
    }
    try:
        xi_ci_response = requests.get(url=url, headers=headers, proxies=full_agent,
                                       timeout=30)
    except requests.RequestException as e:
        # Only network-level failures mean the proxy is dead; catching
        # BaseException would also swallow KeyboardInterrupt / SystemExit.
        print(e)
    else:
        if xi_ci_response.status_code == 200:
            print('Successfully this %s agent is available' % full_agent)
            with open('./ip_agent.text', 'a+') as f:
                f.write('%s' % full_agent + '\n')
        else:
            print('Failed this %s agent is unavailable' % full_agent)


if __name__ == '__main__':
    # Entry point: begin crawling at the first national-HTTP listing page.
    start_url = 'http://www.xicidaili.com/wt/'
    get_agent(start_url)

我爬取的只是国内的 http 代理,爬取速度确实慢,慢慢让它爬取吧。

猜你喜欢

转载自blog.csdn.net/weixin_42812527/article/details/83348954