Python: Building a Proxy IP Pool

  When scraping with Python, you will often run into sites with anti-scraping mechanisms. Faking the request headers helps, but the site can still see your IP address and ban it to stop you from collecting data.

  The requests library lets you mask your IP through the proxies parameter. Several sites publish free proxy IPs; you can scrape those lists and, after testing which entries actually work, build a proxy IP pool.

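  For reference, here is a minimal sketch of how the proxies parameter is passed to requests.get; the proxy address below is a hypothetical placeholder, not a working proxy:

  import requests

  # Hypothetical proxy address, for illustration only
  proxies = {'http': 'http://1.2.3.4:8080'}
  # icanhazip.com echoes back the IP the request came from
  response = requests.get('http://icanhazip.com', proxies=proxies, timeout=3)
  print(response.text)  # prints the proxy's IP rather than your own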
  A commonly used way to fake the headers:

  from fake_useragent import UserAgent

  ua = UserAgent()
  headers = {'User-Agent': ua.random}

  Now for the main part.

  Scraping the IPs (IPPool.py)

  import requests
  from lxml import etree
  from fake_useragent import UserAgent

  # Random User-Agent disguise
  ua = UserAgent()
  headers = {'User-Agent': ua.random}

  def get_ip():
      ip_list = []
      # Target URL; proxy IPs expire quickly, so only the first page is scraped
      url = 'https://www.xicidaili.com/nt/'
      # Send the request
      response = requests.get(url=url, headers=headers)
      # Set the encoding
      response.encoding = response.apparent_encoding
      response = response.text
      response = etree.HTML(response)
      tr_list = response.xpath('//tr[@class="odd"]')
      for i in tr_list:
          # IP address
          ip = i.xpath('./td[2]/text()')[0]
          # Port
          port = i.xpath('./td[3]/text()')[0]
          # Protocol (HTTP or HTTPS)
          agreement = i.xpath('./td[6]/text()')[0]
          agreement = agreement.lower()
          # Assemble the full proxy URL, e.g. http://1.2.3.4:8080
          ip = agreement + '://' + ip + ':' + port
          ip_list.append(ip)
      return ip_list

  if __name__ == '__main__':
      ip_list = get_ip()
      print(ip_list)
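  Once these IPs have been validated (see the testing step below), a scraper can draw a random proxy for each request. A minimal sketch, assuming ip_list holds tested proxy URLs of the form scheme://ip:port:

  import random
  import requests

  def fetch(url, ip_list):
      # Pick a random validated proxy from the pool
      proxy = random.choice(ip_list)
      scheme = proxy.split(':')[0]  # 'http' or 'https'
      headers = {'User-Agent': 'Mozilla/5.0'}
      return requests.get(url, headers=headers, proxies={scheme: proxy}, timeout=3)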

  Testing the IPs

  Test method 1 (multiprocessing.dummy Pool)

  import requests
  from multiprocessing.dummy import Pool

  # Get the scraped IP list
  from IPPool import get_ip

  test_list = get_ip()

  # Global list holding the IPs that pass the test
  ip_list = []

  # IP test site: echoes back the IP the request came from
  url = 'http://icanhazip.com'
  headers = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0'
  }

  def ip_test(ip):
      try:
          # Note: requests only applies a proxy whose key matches the URL
          # scheme, so with an http:// test URL the 'https' entries are
          # never actually exercised
          if ip.split(":")[0] == 'http':
              proxies = {'http': ip}
          else:
              proxies = {'https': ip}
          requests.get(url=url, headers=headers, proxies=proxies, timeout=3)
          ip_list.append(ip)
          print(ip + " is usable")
      except requests.RequestException:
          print(ip + " is not usable")

  if __name__ == '__main__':
      pool = Pool(4)  # a pool of 4 worker threads
      pool.map(ip_test, test_list)
      print(ip_list)
      print("Scraped %s IPs in total; usable: %s, unusable: %s"
            % (len(test_list), len(ip_list), len(test_list) - len(ip_list)))
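  Appending to a module-level list is safe here because list.append is atomic under the GIL, but pool.map can also collect the results directly and avoid the shared global. A sketch of that variant, using the same url, headers, and test_list as above:

  def ip_test2(ip):
      scheme = ip.split(':')[0]
      try:
          requests.get(url=url, headers=headers, proxies={scheme: ip}, timeout=3)
          return ip
      except requests.RequestException:
          return None

  # pool.map returns one result per input IP; drop the failures
  results = Pool(4).map(ip_test2, test_list)
  good_ips = [ip for ip in results if ip is not None]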

  Test results: (screenshot in the original post)

  Test method 2 (threading with a queue)

  import threading
  import requests
  import queue
  from fake_useragent import UserAgent

  # Get the scraped IP list
  from IPPool import get_ip

  test_list = get_ip()

  # Global list holding the IPs that pass the test
  ip_pool = []

  # Random User-Agent disguise
  ua = UserAgent()
  headers = {'User-Agent': ua.random}

  url = 'https://www.csdn.net/'
  # url = 'http://icanhazip.com/'

  def test_ip(queue_list):
      while True:
          if queue_list.empty():
              break
          else:
              ip = queue_list.get()
              if ip.split(":")[0] == 'http':
                  proxies = {'http': ip}
              else:
                  proxies = {'https': ip}
              try:
                  response = requests.get(url=url, headers=headers, proxies=proxies, timeout=3)
                  if response.status_code == 200:
                      print("[%s] tested %s, result: usable" % (threading.current_thread().name, proxies))
                      ip_pool.append(ip)
              except requests.RequestException:
                  print("[%s] tested %s, result: unusable" % (threading.current_thread().name, proxies))

  if __name__ == '__main__':
      queue_list = queue.Queue()  # create the queue
      # Put the scraped IPs into the queue
      for i in test_list:
          queue_list.put(i)
      # Create 5 worker threads that drain the queue
      out_thread = [threading.Thread(target=test_ip, args=(queue_list,), name="Thread-%s" % item) for item in range(5)]
      for thread in out_thread:
          thread.start()
      for thread in out_thread:
          thread.join()
      print('Testing finished')
      print(ip_pool)
      print("Scraped %s IPs in total; usable: %s, unusable: %s"
            % (len(test_list), len(ip_pool), len(test_list) - len(ip_pool)))

  Results: (screenshot in the original post)

  The test URL does not need to be anything elaborate; something like www.baidu.com works fine. One blogger recommended http://icanhazip.com/ as a test site, since it simply echoes back the IP your request arrived from.

  One pitfall I hit while testing: I did not pay attention to whether each proxy's protocol was http or https and keyed them all as 'http'. Suddenly every single IP tested as usable, which of course cannot be right. After fixing the keys, roughly twenty-five IPs passed the test.
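  The likely cause: requests only routes a request through a proxy whose dict key matches the URL's scheme. Test an https:// URL with only an 'http' entry and the proxy is silently ignored, so the request goes out over your own connection and always "succeeds". A small sketch that makes the mismatch visible (the proxy address is hypothetical):

  import requests

  proxies = {'http': 'http://1.2.3.4:8080'}  # hypothetical proxy

  # Scheme matches the 'http' key: the request is routed through the proxy
  print(requests.get('http://icanhazip.com', proxies=proxies, timeout=3).text)

  # https URL, but no 'https' key: the proxy is silently skipped, icanhazip
  # reports your real IP, and the "test" passes no matter what
  print(requests.get('https://icanhazip.com', proxies=proxies, timeout=3).text)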

  I also wrote a scraper for https://www.kuaidaili.com/free/intr/ (the scraped IPs have not been post-processed yet), but that site lists fairly few IPs per page, so I did not test it.

  IPPool2.py

  import requests
  from lxml import etree
  from fake_useragent import UserAgent

  # Random User-Agent disguise
  ua = UserAgent()
  headers = {'User-Agent': ua.random}

  def get_ip():
      ip_list = []
      # Target URL
      url = 'https://www.kuaidaili.com/free/intr/'
      # Send the request
      response = requests.get(url=url, headers=headers)
      # Set the encoding
      response.encoding = response.apparent_encoding
      response = response.text
      response = etree.HTML(response)
      tr_list = response.xpath('//*[@id="list"]/table/tbody/tr')
      for i in tr_list:
          # Only the bare IP address is extracted here
          ip = i.xpath('./td[1]/text()')[0]
          ip_list.append(ip)
      return ip_list

  if __name__ == '__main__':
      ip_list = get_ip()
      # print(ip_list)
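  Since this get_ip() returns bare IP addresses, they still need to be assembled into scheme://ip:port form before they can feed the testers above. A hypothetical sketch of that post-processing, assuming the port and protocol columns have also been extracted (the exact kuaidaili column indices are not verified here):

  def build_proxy_urls(rows):
      # rows: iterable of (ip, port, protocol) tuples; extracting port and
      # protocol from the kuaidaili table is assumed, as done in IPPool.py
      return ['%s://%s:%s' % (protocol.lower(), ip, port)
              for ip, port, protocol in rows]

  # Hypothetical usage:
  # print(build_proxy_urls([('1.2.3.4', '8080', 'HTTP')]))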



Reposted from blog.51cto.com/14503791/2485400