Notes: building an IP proxy pool for a Python web crawler

When crawling a website with Python, you usually need to be careful: if a single IP sends requests too frequently, the site may block it.

In that case you need an IP proxy pool. Here we use a free proxy listing site: Kuaidaili's domestic high-anonymity proxies.
(screenshot of the proxy listing page omitted)

Code:

import requests
import time
import random
from lxml import etree

#  Scrape proxies
def get_ip_list(headers, page):
    """Scrape free proxy addresses from kuaidaili.com.

    Args:
        headers: dict of HTTP request headers (a browser User-Agent
            helps avoid being served an error page).
        page: number of listing pages to scrape (coerced with int()).

    Returns:
        A list of "ip:port" strings; pages that do not return HTTP 200
        are silently skipped.
    """
    ip_list = []
    for i in range(int(page)):
        # Free proxy listing, one page per request (pages are 1-based).
        url = 'https://www.kuaidaili.com/free/inha/{}/'.format(i + 1)
        # timeout keeps a stalled connection from hanging the whole run
        web_data = requests.get(url, headers=headers, timeout=10)
        if web_data.status_code == 200:
            tree0 = etree.HTML(web_data.text)
            ip_lists = tree0.xpath('//table/tbody/tr/td[@data-title="IP"]/text()')
            port_lists = tree0.xpath('//table/tbody/tr/td[@data-title="PORT"]/text()')
            # Pair each IP with its port as "ip:port".
            ip_list.extend('{}:{}'.format(ip, port)
                           for ip, port in zip(ip_lists, port_lists))
            time.sleep(3)  # throttle so the source site does not block us
    return ip_list

#  Build a random proxy entry
def get_random_ip(ip_list):
    """Pick one address at random from *ip_list* ("ip:port" strings)
    and wrap it in a requests-style proxies dict.
    """
    # Prefix every address with the scheme, then choose one at random.
    candidates = ['http://' + addr for addr in ip_list]
    chosen = random.choice(candidates)
    return {'http': chosen}

if __name__ == '__main__':
    # Desktop-browser User-Agent so the listing site serves the real page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }
    # Scrape three pages of free proxies and show what was collected.
    ip_list = get_ip_list(headers=headers, page=3)
    print(ip_list)

(screenshot of the script's output omitted)

Guess you like

Origin blog.csdn.net/qq_36171287/article/details/113095615