# 【python爬虫】 爬取ip代理池 (Python crawler: scraping an IP proxy pool)

"""
目标:python 爬取ip代理池
所需模块:requests bs4 lxml
模块安装:
pip install requests bs4 lxml
开发环境:Pycharm  python3.8
爬虫代码必备步骤:
1.确定需求
2.发送请求
3.获取数据
4.解析数据
5.打印数据
"""
import requests
from bs4 import BeautifulSoup
import time
proxies_list = []

def download(response):
    """Parse one kuaidaili.com free-proxy-list page and collect its proxies.

    Appends one ``{protocol: "ip:port"}`` dict per table row to the
    module-level ``proxies_list``.

    :param response: a ``requests.Response`` whose body is the HTML of a
        ``/ops/proxylist/<page>`` page.
    """
    soup = BeautifulSoup(response.text, "lxml")
    # Parse once and select the rows directly; the original re-parsed the
    # stringified tbody (and each row) through fresh BeautifulSoup objects.
    for row in soup.select("#freelist > table > tbody > tr"):
        # The 类型 (type) cell may read e.g. "HTTP,HTTPS"; keep the second
        # entry when a comma is present, otherwise the whole text.
        type_text = row.find(attrs={"data-title": "类型"}).text
        parts = type_text.split(",")
        http_type = parts[1].strip() if len(parts) > 1 else type_text
        ip = row.find(attrs={"data-title": "IP"}).text
        port = row.find(attrs={"data-title": "PORT"}).text
        proxies_list.append({http_type: ip + ":" + port})


if __name__ == "__main__":
    # Crawl the first three pages of the free proxy list.
    for page in range(1, 4):
        url = "https://www.kuaidaili.com/ops/proxylist/" + str(page)
        response = requests.get(url=url)
        download(response)
    # Initialize the counter BEFORE the print loop; the original had
    # "num=0" tab-indented inside the page loop, which raises TabError.
    num = 0
    for proxies in proxies_list:
        num += 1
        print(f'NO.{num}.{proxies}')
    print(f'爬取ip代理个数{num}')

# 猜你喜欢
#
# 转载自 blog.csdn.net/weixin_52049271/article/details/127175599