Python crawlers: building a proxy IP pool

When writing a Python crawler, you often run into a site's anti-scraping mechanism. Disguising the request headers gets you part of the way, but the site can still see your IP and ban it to stop you from collecting data.

With requests, we can route traffic through a proxy via the proxies parameter. There are websites that publish free proxy IPs; we can crawl those IPs, test them, and build a proxy IP pool out of the ones that work.
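For context, passing a proxy to requests looks like this (the proxy address below is a made-up placeholder, just to show the parameter):

import requests

# Hypothetical proxy address, only to illustrate the proxies parameter
proxies = {'http': 'http://1.2.3.4:8080'}
response = requests.get('http://icanhazip.com', proxies=proxies, timeout=3)
print(response.text)  # shows the proxy's visible IP instead of our own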

First, a common way to disguise the User-Agent header:

from fake_useragent import UserAgent

ua = UserAgent()
headers = {'User-Agent': ua.random}
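One caveat worth mentioning (my own note, not from the original post): fake_useragent may need to download its User-Agent data the first time it is used and can raise an exception if that fails, so a simple fallback looks like this:

from fake_useragent import UserAgent

try:
    ua = UserAgent()
    headers = {'User-Agent': ua.random}
except Exception:
    # Fall back to a fixed User-Agent string if the UA data cannot be loaded
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) '
                             'Gecko/20100101 Firefox/70.0'}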

Now, on to the main part.

Crawling the IPs (IPPool.py)

import requests
from lxml import etree
from fake_useragent import UserAgent

# Disguise the User-Agent
ua = UserAgent()
headers = {'User-Agent': ua.random}

def get_ip():
    ip_list = []
    # Target URL (the free IPs expire quickly, so only the first page is crawled)
    url = 'https://www.xicidaili.com/nt/'
    # Send the request
    response = requests.get(url=url, headers=headers)
    # Set the encoding
    response.encoding = response.apparent_encoding
    response = response.text
    response = etree.HTML(response)
    tr_list = response.xpath('//tr[@class="odd"]')
    for i in tr_list:
        # IP address
        ip = i.xpath('./td[2]/text()')[0]
        # Port
        port = i.xpath('./td[3]/text()')[0]
        # Protocol (http / https)
        agreement = i.xpath('./td[6]/text()')[0]
        agreement = agreement.lower()
        # Assemble the full proxy address
        ip = agreement + '://' + ip + ':' + port
        ip_list.append(ip)
    return ip_list

if __name__ == '__main__':
    ip_list = get_ip()
    print(ip_list)

Testing the IPs

Test method 1 (thread pool via multiprocessing.dummy's Pool)

import requests
from multiprocessing.dummy import Pool
# Import the crawled list of IPs
from IPPool import get_ip

test_list = get_ip()

# Global list that collects the usable IPs
ip_list = []

# Test site that simply echoes the visiting IP
url = 'http://icanhazip.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0'
}

def ip_test(ip):
    try:
        if ip.split(":")[0] == 'http':
            proxies = {
                'http': ip
            }
        else:
            proxies = {
                'https': ip
            }
        response = requests.get(url=url, headers=headers, proxies=proxies, timeout=3)
        ip_list.append(ip)
        print(ip + " is usable")
    except:
        print(ip + " is unusable")

if __name__ == '__main__':
    pool = Pool(4)
    pool.map(ip_test, test_list)
    print(ip_list)
    print("Crawled %s IPs in total: %s usable, %s unusable"
          % (len(test_list), len(ip_list), len(test_list) - len(ip_list)))

  Test Results:
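A small aside, not in the original post: since Pool.map collects each call's return value, the shared global list can be avoided by having the test function return the IP (or None) and filtering the results. A rough sketch:

import requests
from multiprocessing.dummy import Pool
from IPPool import get_ip

def check(ip):
    # Return the proxy address if it works, otherwise None
    proxies = {'http': ip} if ip.startswith('http://') else {'https': ip}
    try:
        requests.get('http://icanhazip.com', proxies=proxies, timeout=3)
        return ip
    except Exception:
        return None

if __name__ == '__main__':
    results = Pool(4).map(check, get_ip())
    ip_list = [ip for ip in results if ip is not None]
    print(ip_list)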

Test method 2 (threading with a queue)

import threading
import requests
import queue
from fake_useragent import UserAgent
# Import the crawled list of IPs
from IPPool import get_ip

test_list = get_ip()

# Global list that collects the usable IPs
ip_pool = []

# Random User-Agent disguise
ua = UserAgent()
headers = {'User-Agent': ua.random}

url = 'https://www.csdn.net/'
# url = 'http://icanhazip.com/'

def test_ip(queue_list):
    while True:
        if queue_list.empty():
            break
        else:
            ip = queue_list.get()
            if ip.split(":")[0] == 'http':
                proxies = {
                    'http': ip
                }
            else:
                proxies = {
                    'https': ip
                }
            try:
                response = requests.get(url=url, headers=headers, proxies=proxies, timeout=3)
                if response.status_code == 200:
                    print("[%s] tested %s, result: usable" % (threading.current_thread().name, proxies))
                    ip_pool.append(ip)
            except:
                print("[%s] tested %s, result: unusable" % (threading.current_thread().name, proxies))

if __name__ == '__main__':
    queue_list = queue.Queue()  # Create the queue
    # Put the crawled IPs into the queue
    for i in test_list:
        queue_list.put(i)
    # Create the worker threads
    out_thread = [threading.Thread(target=test_ip, args=(queue_list,), name="Thread %s" % item) for item in range(5)]
    for thread in out_thread:
        thread.start()
    for thread in out_thread:
        thread.join()
    print('Testing complete')
    print(ip_pool)
    print("Crawled %s IPs in total: %s usable, %s unusable"
          % (len(test_list), len(ip_pool), len(test_list) - len(ip_pool)))

Result:

The test URL doesn't need to be anything fancy; something like www.baidu.com works fine. One blogger recommends a dedicated test site: http://icanhazip.com/
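Since icanhazip.com simply echoes the visiting IP as plain text, a stricter check (my own sketch, not from the original post) is to compare that echoed address with the proxy's address. Some proxies report a different outbound IP, so this check is on the strict side:

import requests

def proxy_really_used(proxy):
    # Return True only if icanhazip reports the proxy's address rather than our own
    scheme, rest = proxy.split('://')
    host = rest.split(':')[0]
    try:
        seen_ip = requests.get(scheme + '://icanhazip.com',
                               proxies={scheme: proxy}, timeout=3).text.strip()
        return seen_ip == host
    except Exception:
        return False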

One pitfall I hit while testing: at first I didn't pay attention to whether a proxy's protocol was http or https and put everything under the 'http' key, and then every single IP appeared usable, which of course can't be right. The reason is that requests only applies a proxy whose key matches the URL's scheme; with everything under 'http', an https test URL is fetched directly without the proxy, so the test succeeds no matter what. After fixing this, around twenty-five IPs passed the test.
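One way to sidestep the scheme-matching issue (my own suggestion, not from the original post) is to register the same proxy under both keys; whether a given free proxy can actually tunnel https is a separate question, but at least the request can no longer silently bypass the proxy:

import requests

ip = 'http://1.2.3.4:8080'  # hypothetical proxy taken from the pool
# Register the proxy under both keys so it is used whatever the URL's scheme is
proxies = {'http': ip, 'https': ip}
response = requests.get('https://www.csdn.net/', proxies=proxies, timeout=3)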

I also wrote a crawler for https://www.kuaidaili.com/free/intr/ (the IPs are collected as-is, without assembling protocol or port), but that site only lists a handful of IPs, so I didn't test them.

  IPPool2.py

import requests
from lxml import etree
from fake_useragent import UserAgent

# Disguise the User-Agent
ua = UserAgent()
headers = {'User-Agent': ua.random}

def get_ip():
    ip_list = []
    # Target URL
    url = 'https://www.kuaidaili.com/free/intr/'
    # Send the request
    response = requests.get(url=url, headers=headers)
    # Set the encoding
    response.encoding = response.apparent_encoding
    response = response.text
    response = etree.HTML(response)
    tr_list = response.xpath('//*[@id="list"]/table/tbody/tr')
    for i in tr_list:
        ip = i.xpath('./td[1]/text()')[0]
        ip_list.append(ip)
    return ip_list

if __name__ == '__main__':
    ip_list = get_ip()
    # print(ip_list)
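To round things off, here is a brief sketch (my own addition, not from the original post) of how the validated pool might be used when actually crawling: pick a random proxy for each request and retry with another one on failure.

import random
import requests

def fetch(url, ip_pool, headers, retries=3):
    # Try up to `retries` randomly chosen proxies from the validated pool
    for _ in range(retries):
        proxy = random.choice(ip_pool)
        proxies = {proxy.split('://')[0]: proxy}
        try:
            return requests.get(url, headers=headers, proxies=proxies, timeout=3)
        except requests.RequestException:
            continue
    return None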

