使用爬虫爬取可用的免费代理IP

需要用的工具

PyCharm、Python 3.7、selenium 库、requests 库、lxml 库、bs4(BeautifulSoup)库、os 文件库、re 模块的正则表达式运用,并自行下载 chromedriver 自测软件

代码如下:

具体的说明可以参考代码中的注释。

from selenium import webdriver # 从selenium导入webdriver
from selenium.webdriver.chrome.options import Options
import requests
import re
from bs4 import BeautifulSoup
# 获取免费的代理并验证代理的可用性
from lxml import etree

# Request headers carrying a desktop Chrome User-Agent string; passed to
# requests.get() when downloading the proxy listing page.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/75.0.3770.100 Safari/537.36'
    ),
}

def get_free_proxy(url='https://www.xicidaili.com/nn/', timeout=10):
    """Yield free proxy addresses scraped from a proxy listing page.

    The page is expected to contain a ``<table id="ip_list">`` whose first
    row is a header and whose remaining rows hold the IP address in the
    second cell and the port in the third cell.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the listing page before giving up, so
            an unresponsive site cannot hang the scraper indefinitely.

    Yields:
        str: One proxy per usable table row, formatted as ``"ip:port"``.

    Raises:
        requests.RequestException: If the listing page cannot be fetched
            (connection failure, timeout, ...). Raised when iteration starts.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    html_ele = etree.HTML(response.content)
    if html_ele is None:
        # Nothing parseable came back; there are no proxies to report.
        return
    tr_eles = html_ele.xpath('//table[@id="ip_list"]//tr')
    # Slice instead of pop(0): skips the header row without raising when the
    # table is missing (e.g. the site blocked the request or changed layout).
    for tr_ele in tr_eles[1:]:
        ip_cells = tr_ele.xpath('./td[2]/text()')
        port_cells = tr_ele.xpath('./td[3]/text()')
        if not ip_cells or not port_cells:
            # Row lacks the expected cells; skip it rather than crash.
            continue
        yield ip_cells[0] + ':' + port_cells[0]

def validate_proxy(proxy_str):
    """Check whether a proxy can be used to reach the test URL.

    A GET request to ``https://www.baidu.com`` is sent through the proxy
    with a 2-second timeout.

    Args:
        proxy_str: Proxy address formatted as ``"ip:port"``.

    Returns:
        bool: True only if the request completes with HTTP status 200;
        False on any other status or on a request failure.
    """
    url = 'https://www.baidu.com'
    proxy = {
        'http': 'http://' + proxy_str,
        'https': 'http://' + proxy_str,
    }
    try:
        response = requests.get(url, proxies=proxy, timeout=2)
    except (requests.RequestException, ValueError):
        # Only request/URL failures mean "proxy unusable". A bare except
        # would also swallow KeyboardInterrupt, making the scan unstoppable.
        return False
    # Always return a real bool; a non-200 status previously fell through
    # and returned None.
    return response.status_code == 200
if __name__ == '__main__':
    # Walk every scraped proxy and report only those that pass validation.
    for candidate in get_free_proxy():
        if not validate_proxy(candidate):
            continue
        print('可用的代理IP:')
        print(candidate)

发布了11 篇原创文章 · 获赞 307 · 访问量 6759

猜你喜欢

转载自blog.csdn.net/weixin_43853097/article/details/103946657