When writing Python web scrapers, you will often run into websites with anti-scraping mechanisms. We can disguise our requests with fake headers, but the site can still see your IP address and ban it to stop you from collecting information.
In the requests library we can disguise our IP through the proxies parameter. There are websites that publish free proxy IPs; you can crawl those IPs, test them, and build an IP proxy pool from the ones that work.
First, a commonly used way to disguise the request headers:
from fake_useragent import UserAgent

# Pick a random User-Agent so successive requests don't share one fingerprint.
headers = {'User-Agent': UserAgent().random}
Now to the main point.
Crawling proxy IPs (IPPool.py)
import requests
from lxml import etree
from fake_useragent import UserAgent

# Camouflage: a random User-Agent chosen once per run.
ua = UserAgent()
headers = {'User-Agent': ua.random}


def get_ip():
    """Crawl the first page of xicidaili's NT proxy list.

    Returns a list of full proxy URLs such as 'http://1.2.3.4:8080'.
    Only the first page is fetched because these free IPs age quickly.
    """
    ip_list = []
    url = 'https://www.xicidaili.com/nt/'
    response = requests.get(url=url, headers=headers)
    # Decode with the encoding requests detected from the payload.
    response.encoding = response.apparent_encoding
    html = etree.HTML(response.text)
    # NOTE(review): class="odd" matches only alternating table rows, so
    # roughly half the listed proxies are skipped — confirm this is intended.
    rows = html.xpath('//tr[@class="odd"]')
    for row in rows:
        address = row.xpath('./td[2]/text()')[0]       # ip
        port = row.xpath('./td[3]/text()')[0]          # port number
        scheme = row.xpath('./td[6]/text()')[0].lower()  # protocol (http/https)
        # Assemble the full proxy URL.
        ip_list.append(scheme + '://' + address + ':' + port)
    return ip_list


if __name__ == '__main__':
    ip_list = get_ip()
    print(ip_list)
Testing the IPs
Test method 1 (multiprocessing.dummy Pool)
import requests
from multiprocessing.dummy import Pool

# Crawl to get a list of candidate proxy ips.
from IPPool import get_ip

test_list = get_ip()
# Global list used to collect the proxies that actually work.
ip_list = []
# ip test site: echoes back the IP the request arrived from.
url = 'http://icanhazip.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0'
}


def ip_test(ip):
    """Try one request through proxy *ip*; append it to ip_list if it works.

    *ip* is a full proxy URL ('http://host:port' or 'https://host:port');
    the scheme prefix decides which proxies key it is registered under.
    """
    try:
        if ip.split(":")[0] == 'http':
            proxies = {'http': ip}
        else:
            proxies = {'https': ip}
        requests.get(url=url, headers=headers, proxies=proxies, timeout=3)
        ip_list.append(ip)
        print(ip + "Available")
    # Fix: the original bare `except:` also swallowed KeyboardInterrupt and
    # programming errors; only network failures mean "proxy unusable".
    except requests.RequestException:
        print(ip + "unavailable")


if __name__ == '__main__':
    # Fix: original read `pool = pool (4)` — a NameError; the class is Pool.
    pool = Pool(4)
    pool.map(ip_test, test_list)
    print(ip_list)
    print("% s of total crawling ip, ip available as:% s, ip is unavailable:% s" % (len(test_list), len(ip_list), len(test_list) - len(ip_list)))
Test results:
Test method 2 (threading, multiple threads sharing a queue)
import threading
import requests
import queue
from fake_useragent import UserAgent

# Crawl to get a list of candidate proxy ips.
from IPPool import get_ip

test_list = get_ip()
# Global list used to collect the proxies that actually work.
ip_pool = []
# Random header camouflage.
ua = UserAgent()
headers = {'User-Agent': ua.random}
url = 'https://www.csdn.net/'
# url = 'http://icanhazip.com/'


def test_ip(queue_list):
    """Worker: drain proxy URLs from *queue_list*, appending working ones to ip_pool.

    A proxy counts as working only when the test request returns HTTP 200;
    NOTE(review): a non-200 response prints nothing and the ip is silently
    dropped — original behavior, kept as-is.
    """
    while True:
        # Fix: the original checked .empty() and then called .get(), which can
        # block forever if another worker takes the last item in between.
        # get_nowait()/queue.Empty makes the take-or-quit step atomic.
        try:
            ip = queue_list.get_nowait()
        except queue.Empty:
            break
        if ip.split(":")[0] == 'http':
            proxies = {'http': ip}
        else:
            proxies = {'https': ip}
        try:
            response = requests.get(url=url, headers=headers, proxies=proxies, timeout=3)
            if response.status_code == 200:
                print("% s] [% s test, the test results are available []" % (threading.current_thread().name, proxies))
                ip_pool.append(ip)
        # Fix: narrowed from a bare `except:` — only network failures mean
        # "proxy unusable"; other exceptions should surface.
        except requests.RequestException:
            print("% s] [% s test, the test results are not available []" % (threading.current_thread().name, proxies))


if __name__ == '__main__':
    queue_list = queue.Queue()  # work queue shared by the worker threads
    # Put every crawled ip into the queue.
    for i in test_list:
        queue_list.put(i)
    # Create the worker threads.
    out_thread = [threading.Thread(target=test_ip, args=(queue_list,), name="进程%s" % item) for item in range(5)]
    for thread in out_thread:
        thread.start()
    for thread in out_thread:
        thread.join()
    print('Test Complete')
    print(ip_pool)
    print("% s of total crawling ip, ip available as:% s, ip is unavailable:% s" % (len(test_list), len(ip_pool), len(test_list) - len(ip_pool)))
result:
The test URL does not need to be anything complex — something like www.baidu.com is fine. A blogger recommended this test site: http://icanhazip.com/
One pitfall hit while testing: at first I paid no attention to whether each proxy's protocol was http or https and used http for all of them, after which every IP appeared usable. That of course cannot be right; after fixing the protocol handling, about twenty-five IPs passed the test.
I also wrote a crawler for https://www.kuaidaili.com/free/intr/ (the IPs are stored without further processing), but that site lists only a few IPs, so I did not test them.
IPPool2.py
import requests
from lxml import etree
from fake_useragent import UserAgent

# Camouflage: a random User-Agent chosen once per run.
ua = UserAgent()
headers = {'User-Agent': ua.random}


def get_ip():
    """Scrape kuaidaili's free-proxy listing.

    Returns the raw IP strings from the first table column — no port or
    protocol is attached here.
    """
    url = 'https://www.kuaidaili.com/free/intr/'
    response = requests.get(url=url, headers=headers)
    # Decode with the encoding requests detected from the payload.
    response.encoding = response.apparent_encoding
    tree = etree.HTML(response.text)
    rows = tree.xpath('//*[@id="list"]/table/tbody/tr')
    return [row.xpath('./td[1]/text()')[0] for row in rows]


if __name__ == '__main__':
    ip_list = get_ip()
    # print(ip_list)