爬虫篇——代理IP爬取备用及存储

爬虫篇——代理IP爬取备用及存储

代码

本文通过抓取免费的高匿IP代理,将其写入列表并保存为json格式文件,并对代码进行了封装,方便以后抓取数据时动态地更新handler所使用的代理IP地址,从而在一定程度上避免抓取数据时反爬机制的干扰。

# *************************** 免费高匿代理IP爬取 ****************************
import urllib.request
import requests
from bs4 import BeautifulSoup
import json
class ProxySpider(object):
    """Scrape free proxies from a listing page, keep the working ones, save as JSON.

    The pipeline is: download the listing page, extract ``ip:port`` strings
    from its table rows, probe each proxy with a real request, and dump the
    proxies that answered to ``free_proxy.json``.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, timeout=5):
        """Set up the crawl target, request headers and result containers.

        Args:
            timeout: Seconds to wait for any single HTTP request (both the
                listing page and each proxy probe) before giving up.
        """
        # Listing page that is scraped for proxy candidates.
        self.url = "https://www.xicidaili.com/nn/"
        # Browser-like User-Agent sent with every request.
        self.headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
        # Without a timeout a single dead proxy can block the run forever.
        self.timeout = timeout
        # Every "ip:port" string scraped from the listing page.
        self.ip_list = []
        # The subset of ip_list that passed ip_validation().
        self.ip_valid = []

    # 1. Send the network request and fetch the raw page
    def data_request(self):
        """Download the listing page and return its body decoded as UTF-8."""
        response = requests.get(self.url, headers=self.headers, timeout=self.timeout)
        return response.content.decode("utf-8")

    # 2. Parse the page
    def data_parse(self, data):
        """Extract ``ip:port`` strings from the HTML in ``data`` into ``self.ip_list``.

        Args:
            data: HTML text of the listing page.
        """
        parse_data = BeautifulSoup(data, "lxml")
        # The first <tr> is dropped unconditionally (presumably the table
        # header); slicing instead of ``del rows[0]`` avoids an IndexError
        # when the page contains no rows at all.
        for proxy in parse_data.select("tr")[1:]:
            cells = proxy.select("td")
            # Rows without enough <td> cells cannot hold an address and port.
            if len(cells) < 3:
                continue
            ip = cells[1].get_text().strip()
            port = cells[2].get_text().strip()
            if ip and port:
                self.ip_list.append(ip + ":" + port)

    @staticmethod
    def _build_proxies(ip):
        """Return a requests ``proxies`` mapping routing HTTP and HTTPS via ``ip``.

        Args:
            ip: Proxy address as ``"host:port"``.
        """
        proxy_url = "http://" + ip
        # requests selects the proxy by the scheme of the *target* URL, so an
        # "https" entry is required for the proxy to be used on HTTPS probes.
        return {"http": proxy_url, "https": proxy_url}

    # 3. Check which proxy addresses actually work
    def ip_validation(self):
        """Probe every scraped proxy and collect the working ones in ``self.ip_valid``.

        A proxy counts as working when a request to the probe URL sent through
        it completes within ``self.timeout`` seconds with a non-error status.
        Failures are printed and skipped so one bad proxy does not stop the run.
        """
        url = "https://www.baidu.com/"
        for ip in self.ip_list:
            try:
                response = requests.get(
                    url,
                    headers=self.headers,
                    proxies=self._build_proxies(ip),
                    timeout=self.timeout,
                )
                response.raise_for_status()
            except requests.exceptions.RequestException as error:
                # requests raises its own exception hierarchy (connection
                # errors, proxy errors, timeouts, HTTP errors), never the
                # urllib one.
                print(error)
            else:
                self.ip_valid.append(ip)

    # 4. Persist the result
    def data_save(self):
        """Write ``self.ip_valid`` to ``free_proxy.json`` as a JSON array."""
        with open("free_proxy.json", "w", encoding="utf-8") as fp:
            json.dump(self.ip_valid, fp)

    # 5. Orchestrate the whole pipeline
    def run(self):
        """Run the full pipeline: request, parse, validate, save."""
        # 1. Request the listing page
        data = self.data_request()
        # 2. Parse the proxy candidates out of it
        self.data_parse(data)
        # 3. Keep only the proxies that respond
        self.ip_validation()
        # 4. Store the working proxies
        self.data_save()
if __name__ == "__main__":
    # Script entry point: build a spider and run the whole pipeline once.
    crawler = ProxySpider()
    crawler.run()

代码运行结果:

["122.51.49.88:8888", "118.181.226.166:44640", "122.4.40.194:27430", "115.49.74.102:8118", "101.200.81.61:80", "49.76.237.243:8123", "124.156.98.172:80", "117.88.176.221:3000", "122.51.183.224:808", "119.254.94.93:46323", "59.44.78.30:42335", "27.208.231.100:8060", "113.77.101.202:8118", "124.239.216.14:8060", "101.132.123.99:8080", "60.31.213.115:808", "115.219.168.69:8118", "117.94.213.119:8118", "58.254.220.116:52470", "112.14.47.6:52024", "117.186.49.50:55443", "60.2.44.182:30963", "61.54.225.130:8060", "117.88.176.162:3000", "117.88.177.143:3000", "117.88.176.194:3000", "60.216.101.46:59351", "139.196.193.85:8080", "27.188.65.244:8060", "101.132.190.101:80", "60.190.250.120:8080", "115.46.116.170:8123", "120.198.76.45:41443", "218.59.193.14:47138", "121.237.149.63:3000", "121.237.148.31:3000", "117.88.177.197:3000", "117.88.176.55:3000", "119.180.173.81:8060", "222.95.144.202:3000", "117.88.176.170:3000", "121.237.148.241:3000", "183.195.106.118:8118", "114.104.134.142:8888", "223.68.190.130:8181", "121.237.149.218:3000", "110.189.152.86:52277", "27.184.157.205:8118", "112.194.112.175:8118", "202.107.233.123:8090", "119.84.112.137:80", "211.159.219.225:8118", "115.29.108.117:8118", "183.250.255.86:63000", "117.62.172.230:8118", "111.222.141.127:8118", "218.76.253.201:61408", "218.203.132.117:808", "221.193.94.18:8118", "121.237.149.206:3000", "220.173.143.242:808", "1.197.203.247:9999", "171.35.172.5:9999", "118.114.96.78:8118", "117.87.72.226:8118", "117.88.5.40:3000", "125.123.19.197:8118", "61.150.96.27:46111", "182.32.234.18:9999", "171.35.167.220:9999", "171.35.167.224:9999", "123.168.136.2:9999", "113.194.49.94:9999", "222.85.28.130:40505", "123.206.54.52:8118", "27.184.141.239:8118", "124.93.201.59:59618", "117.114.149.66:53281", "121.237.149.107:3000", "180.117.98.96:8118", "123.132.232.254:37638", "139.224.233.103:8118", "221.218.102.146:33323", "118.24.155.27:8118", "113.12.202.50:40498", "222.190.125.3:8118", "175.148.69.90:1133", 
"218.75.69.50:39590", "118.78.196.186:8118", "222.95.144.59:3000", "121.237.149.136:3000", "117.88.5.250:3000", "171.35.168.177:9999", "121.237.148.179:3000", "223.241.118.200:8010", "58.215.219.2:8000", "180.117.234.56:8118", "117.88.176.93:3000", "123.171.5.132:8118", "119.129.203.140:8118"]

by CyrusMay 2020 04 24

青春是手牵手坐上了
永不回头的火车
总有一天我们都老了
不会遗憾就OK了
——————五月天——————

原创文章 5 获赞 39 访问量 2330

猜你喜欢

转载自blog.csdn.net/Cyrus_May/article/details/105737354
今日推荐