Python crawler: building a proxy pool with requests

Scrape proxies from a free proxy site, verify each one, and write the working proxies to a txt file. The pages are parsed with scrapy's Selector, so both requests and scrapy need to be installed.

The code is rough, so please go easy on it; leave a comment if you run into problems. Thanks.
The results look like this:
[screenshot: sample run output]

import requests
from scrapy import Selector

start_url = 'http://www.89ip.cn/index_1.html'
url = 'http://www.89ip.cn/index_{}.html'

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}

class MyProxy(object):
    def GetPage(self, url):  # fetch the page source
        response = requests.get(url=url, headers=headers)
        return response.text

    def GetInfo(self, text):  # extract ip/port pairs from the listing table
        selector = Selector(text=text)
        FindTable = selector.xpath('//div[@class="layui-form"]/table/tbody/tr')
        for proxy in FindTable:
            ip = (proxy.xpath('.//td[1]/text()').get() or '').strip()
            port = (proxy.xpath('.//td[2]/text()').get() or '').strip()
            print(ip, port)
            self.TestIP(ip, port)

    def TabPage(self, text):  # read the "next page" number and build its URL
        selector = Selector(text=text)
        page = selector.xpath('//*[@id="layui-laypage-1"]/a[8]/@data-page').get()
        self.new_url = url.format(page)

    def TestIP(self, ip, port):  # keep a proxy only if a test request through it succeeds
        proxy_addr = 'http://{}:{}'.format(ip, port)
        try:
            # The proxy URL needs its scheme, and both http and https traffic
            # should be routed through it; without a timeout, a dead proxy
            # would block the whole run.
            response = requests.get(url='https://www.baidu.com/', headers=headers,
                                    proxies={'http': proxy_addr, 'https': proxy_addr},
                                    timeout=5)
            print(response.status_code)
            if response.status_code != 200:
                print('request failed')
            else:
                with open('proxy.txt', 'a+') as f:
                    f.write('{}:{}\n'.format(ip, port))
        except Exception:
            print('request failed')
my_proxy = MyProxy()
text = my_proxy.GetPage(start_url)
while True:
    try:
        my_proxy.GetInfo(text)
        my_proxy.TabPage(text)  # was GetPage(text) by mistake; TabPage is what sets new_url
        text = my_proxy.GetPage(my_proxy.new_url)
    except Exception:
        print('**' * 10)
        break  # stop once paging or fetching fails
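
Once proxy.txt has been populated, the entries can be pulled back in and used on later requests. Below is a minimal sketch of one way to consume the file, assuming the ip:port-per-line format written by TestIP; load_proxies is just an illustrative helper, and httpbin.org/ip is used because it echoes the caller's IP, which makes it easy to confirm the proxy is really in effect.

import random
import requests

def load_proxies(path='proxy.txt'):
    # Read one "ip:port" entry per line, skipping blank lines.
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]

proxies = load_proxies()  # assumes the file is non-empty
proxy_addr = 'http://{}'.format(random.choice(proxies))
# Route both http and https traffic through the chosen proxy.
response = requests.get('http://httpbin.org/ip',
                        proxies={'http': proxy_addr, 'https': proxy_addr},
                        timeout=5)
print(response.text)  # the reported origin should be the proxy's IP

Free proxies die quickly, so re-validating an entry right before use, the same way TestIP does, tends to be more reliable than trusting the file as-is.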

Originally published at blog.csdn.net/qq_36389249/article/details/103345300