Python crawler: login and scraping

The script below scrapes free proxy IPs from xicidaili.com, tests each candidate against icanhazip.com, and appends the working proxies to a CSV file.

import requests
from lxml import etree
import time
import random
import csv

def test_ip(ip_address):
    '''
    Test whether each proxy IP is usable.
    :param ip_address: list of proxy dicts, e.g. {'http': 'http://host:port'}
    '''
    # icanhazip.com simply echoes the requesting IP, so a 200 response
    # fetched through a proxy means that proxy is alive.
    url = 'http://icanhazip.com/'

    headers = {
        # Request headers
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
    }
    ip_pool = []
    for ip_test in ip_address:
        # print(ip_test)
        try:
            response = requests.get(url=url, headers=headers, proxies=ip_test, timeout=5)
            if response.status_code == 200:
                ip_pool.append(ip_test)
        except Exception:
            # Dead or unreachable proxy: skip it.
            pass
        # Throttle between checks to avoid hammering the test endpoint.
        time.sleep(random.randint(2, 8))
    print(ip_pool)
    files_save(ip_pool)
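
# Note: requests expects `proxies` to be a mapping from scheme to proxy URL,
# e.g. {'http': 'http://1.2.3.4:8080'} (address is a made-up sample);
# get_page_data() below builds each entry in exactly that shape.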

def files_save(ip_list):
    '''
    Save the usable proxy IPs to a CSV file.
    :param ip_list: list of working proxy dicts
    :return:
    '''
    # newline='' keeps csv from inserting blank lines on Windows.
    with open('./代理ip.csv', 'a+', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        # One proxy per row: scheme in the first column, proxy URL in the second.
        for ip in ip_list:
            for scheme, proxy_url in ip.items():
                writer.writerow([scheme, proxy_url])
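
# With the per-row writer above, each line of 代理ip.csv looks like
# http,http://1.2.3.4:8080 (made-up sample), so csv.reader can turn it
# straight back into the dicts requests expects (see the example at the end).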



def get_page_data(nums):
    '''
    Fetch the proxy listing pages from the xici (西刺) proxy site.
    :param nums: number of pages to scrape
    :return:
    '''
    ip_list = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
    }
    for i in range(1, nums + 1):
        url = "https://www.xicidaili.com/nn/{}".format(i)
        response = requests.get(url=url, headers=headers)
        page_data = etree.HTML(response.text)
        # The listing rows alternate between class='odd' and class='',
        # so select both to cover every entry on the page.
        page_infos = page_data.xpath(".//tr[@class='odd']|.//tr[@class='']")

        for info in page_infos:
            ip_dict = {}
            ip_address = info.xpath(".//td[2]/text()")[0]
            ip_port = info.xpath(".//td[3]/text()")[0]
            ip_type = info.xpath(".//td[6]/text()")[0].lower()  # 'http' or 'https'
            # Build the {scheme: 'scheme://host:port'} mapping requests expects.
            ip_dict[ip_type] = ip_type + '://' + ip_address + ':' + ip_port
            ip_list.append(ip_dict)
    # print(ip_list)
    test_ip(ip_list)
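
# The XPath above assumes each table row is shaped roughly like this
# (hypothetical sample trimmed to the cells the code reads):
#   <tr class="odd"><td></td><td>1.2.3.4</td><td>8080</td>
#       <td></td><td></td><td>HTTP</td>...</tr>
# i.e. td[2] holds the IP, td[3] the port, and td[6] the scheme.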

if __name__ == '__main__':
    '''
    Things to watch when scraping proxy IPs:
    - every harvested IP must be tested before use
    - keep the crawl rate low
    Analysis of the URL pattern:
    page      url
    1           https://www.xicidaili.com/nn/
    2           https://www.xicidaili.com/nn/2
    3           https://www.xicidaili.com/nn/3
    '''
    # nums = int(input("Number of pages to scrape >>"))
    nums = 2
    get_page_data(nums)
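
Once the script has run, the saved proxies can be loaded back and reused. The snippet below is a minimal sketch, assuming the two-column CSV layout written by files_save above; the target http://httpbin.org/ip is my example echo endpoint, not part of the original script.

import csv

import requests

# Read the saved rows back into the {scheme: proxy_url} dicts requests expects.
with open('./代理ip.csv', encoding='utf-8', newline='') as f:
    proxies_list = [{scheme: proxy_url} for scheme, proxy_url in csv.reader(f)]

if proxies_list:
    # httpbin.org/ip echoes the IP it sees, so a working proxy will show
    # the proxy's address rather than our own.
    resp = requests.get('http://httpbin.org/ip', proxies=proxies_list[0], timeout=5)
    print(resp.text)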

Reposted from www.cnblogs.com/lizhihoublog/p/12551324.html