Example 1: Python 3.x HTTP proxy usage · crawler (dynamic) proxy IPs
'''
Python 3.x
Description: this demo requests a web page through crawler (dynamic) proxy IPs, using multiple threads.
Logic: every 5 seconds, fetch a batch of IPs from the API endpoint and start one thread per IP to fetch the page source.
'''
import requests
import time
import threading
from requests.packages import urllib3

ips = []

# One thread per proxy IP: fetch the target page through that proxy
class CrawlThread(threading.Thread):
    def __init__(self, proxyip):
        super(CrawlThread, self).__init__()
        self.proxyip = proxyip

    def run(self):
        start = time.time()
        # Suppress the InsecureRequestWarning triggered by verify=False
        urllib3.disable_warnings()
        # A plain HTTP proxy carries both http and https targets, so the proxy URL
        # uses the http:// scheme for both keys
        html = requests.get(url=targetUrl,
                            proxies={"http": 'http://' + self.proxyip,
                                     "https": 'http://' + self.proxyip},
                            verify=False, timeout=15).content.decode()
        end = time.time()
        print(threading.current_thread().getName() + " used proxy IP " + self.proxyip +
              ", took " + str(end - start) + " seconds, got this HTML:\n" + html +
              "\n*************")

# Fetch a fresh batch of proxy IPs from the API every fetchSecond seconds
class GetIpThread(threading.Thread):
    def __init__(self, fetchSecond):
        super(GetIpThread, self).__init__()
        self.fetchSecond = fetchSecond

    def run(self):
        global ips
        while True:
            res = requests.get(apiUrl).content.decode()
            ips = res.split('\n')
            for proxyip in ips:
                if proxyip.strip():
                    CrawlThread(proxyip).start()
            time.sleep(self.fetchSecond)

if __name__ == '__main__':
    # Proxy API endpoint placeholder: returns one "ip:port" per line
    apiUrl = "http://xxxx"
    targetUrl = "http://ip.chinaz.com/getip.aspx"
    fetchSecond = 5
    GetIpThread(fetchSecond).start()
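With a rotating pool, some of the returned IPs will be dead or slow, and in the demo above an unreachable proxy ends its CrawlThread with a traceback. Below is a minimal sketch of the same request with error handling; the helper name fetch_via_proxy and its parameters are illustrative, not part of the demo.

# Sketch only: wrap the proxied GET so a bad proxy IP is logged instead of raising
import requests

def fetch_via_proxy(target_url, proxyip, timeout=15):
    proxies = {"http": "http://" + proxyip, "https": "http://" + proxyip}
    try:
        resp = requests.get(target_url, proxies=proxies, verify=False, timeout=timeout)
        return resp.content.decode()
    except requests.RequestException as exc:
        # Dead or blocked proxies are expected with rotating pools; skip this IP
        print("proxy %s failed: %s" % (proxyip, exc))
        return None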
Example 2: Python 2.x using an HTTP proxy · crawler (dynamic) proxy IPs
'''
Python 2.x
Description: this demo requests a web page through crawler (dynamic) proxy IPs, using multiple threads.
Logic: every 5 seconds, fetch a batch of IPs from the API endpoint and start one thread per IP to fetch the page source.
'''
import urllib
import urllib2
import time
import threading
import ssl

ips = []

# One thread per proxy IP: fetch the target page through that proxy
class CrawlThread(threading.Thread):
    def __init__(self, proxyip):
        super(CrawlThread, self).__init__()
        self.proxyip = proxyip

    def run(self):
        start = time.time()
        # Disable HTTPS certificate verification globally
        ssl._create_default_https_context = ssl._create_unverified_context
        User_Agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
        header = {}
        header['User-Agent'] = User_Agent
        proxy = {"http": self.proxyip}
        proxy_support = urllib2.ProxyHandler(proxy)
        opener = urllib2.build_opener(proxy_support)
        urllib2.install_opener(opener)
        timeout = 15
        req = urllib2.Request(targetUrl, headers=header)
        response = urllib2.urlopen(req, None, timeout)
        html = response.read()
        # Re-encode the UTF-8 page for a GBK console; drop this line on a UTF-8 terminal
        html = html.decode("utf-8").encode("gbk")
        end = time.time()
        print(threading.current_thread().getName() + " used proxy IP " + self.proxyip +
              ", took " + str(end - start) + " seconds, got this HTML:\n" + html +
              "\n*************")

# Fetch a fresh batch of proxy IPs from the API every fetchSecond seconds
class GetIpThread(threading.Thread):
    def __init__(self, fetchSecond):
        super(GetIpThread, self).__init__()
        self.fetchSecond = fetchSecond

    def run(self):
        global ips
        while True:
            res = urllib.urlopen(apiUrl).read().strip("\n")
            ips = res.split("\n")
            for proxyip in ips:
                if proxyip.strip():
                    CrawlThread(proxyip).start()
            time.sleep(self.fetchSecond)

if __name__ == '__main__':
    # Proxy API endpoint placeholder: returns one "ip:port" per line
    apiUrl = "http://xxxx"
    targetUrl = "http://ip.chinaz.com/getip.aspx"
    fetchSecond = 5
    GetIpThread(fetchSecond).start()
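Note that urllib2.install_opener() replaces the process-wide opener, so when several CrawlThreads run concurrently they can overwrite each other's proxy setting. A small sketch (not part of the original demo) that keeps the proxy local to each thread by calling the opener directly instead of installing it globally:

# Sketch only: use the opener directly so each thread keeps its own proxy
import urllib2

def fetch_via_proxy(target_url, proxyip, timeout=15):
    opener = urllib2.build_opener(urllib2.ProxyHandler({"http": proxyip}))
    req = urllib2.Request(target_url, headers={"User-Agent": "Mozilla/5.0"})
    return opener.open(req, timeout=timeout).read()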
Example 3: Python 3.x using a SOCKS5 proxy · SOCKS5 proxy IPs
'''
Python 3.x
Description: this demo requests a web page through SOCKS5 proxy IPs, using multiple threads.
Logic: every 5 seconds, fetch a batch of IPs from the API endpoint and start one thread per IP to fetch the page source.
Note: requires the SOCKS extra for requests: pip3 install 'requests[socks]'
'''
import requests
import time
import threading
from requests.packages import urllib3

ips = []

# One thread per proxy IP: fetch the target page through that SOCKS5 proxy
class CrawlThread(threading.Thread):
    def __init__(self, proxyip):
        super(CrawlThread, self).__init__()
        self.proxyip = proxyip

    def run(self):
        start = time.time()
        # Suppress the InsecureRequestWarning triggered by verify=False
        urllib3.disable_warnings()
        html = requests.get(url=targetUrl,
                            proxies={"http": 'socks5://user:password@' + self.proxyip,
                                     "https": 'socks5://user:password@' + self.proxyip},
                            verify=False, timeout=15).content.decode()
        end = time.time()
        print(threading.current_thread().getName() + " used proxy IP " + self.proxyip +
              ", took " + str(end - start) + " seconds, got this HTML:\n" + html +
              "\n*************")

# Fetch a fresh batch of proxy IPs from the API every fetchSecond seconds
class GetIpThread(threading.Thread):
    def __init__(self, fetchSecond):
        super(GetIpThread, self).__init__()
        self.fetchSecond = fetchSecond

    def run(self):
        global ips
        while True:
            res = requests.get(apiUrl).content.decode()
            ips = res.split('\n')
            for proxyip in ips:
                if proxyip.strip():
                    CrawlThread(proxyip).start()
            time.sleep(self.fetchSecond)

if __name__ == '__main__':
    # Proxy API endpoint placeholder: returns one "ip:port" per line
    apiUrl = "http://xxxx"
    targetUrl = "http://ip.chinaz.com/getip.aspx"
    fetchSecond = 5
    GetIpThread(fetchSecond).start()
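With requests[socks], the socks5:// scheme resolves hostnames locally before connecting to the proxy, while socks5h:// resolves them on the proxy side. If DNS lookups should also go through the proxy, the proxies dict can be written as follows (a variant of the demo above, with the same user:password placeholders):

# Variant: socks5h:// sends DNS resolution through the SOCKS5 proxy as well
proxies = {"http": "socks5h://user:password@" + proxyip,
           "https": "socks5h://user:password@" + proxyip}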
Example 4: Using an HTTP proxy in Scrapy
'''Add a proxy downloader middleware (settings.py)'''
DOWNLOADER_MIDDLEWARES = {
    # Disable Scrapy's built-in proxy middleware and plug in the custom one
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
    'myproxies.middlewares.ProxyMiddleWare': 125,
    'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': None
}
'''Define the middleware in middlewares.py'''
import requests

class ProxyMiddleWare(object):
    """Downloader middleware that attaches a rotating proxy to each request"""

    def process_request(self, request, spider):
        '''Attach a proxy to the outgoing request'''
        proxy = self.get_random_proxy()
        print("this is request ip:" + proxy)
        request.meta['proxy'] = proxy

    def process_response(self, request, response, spider):
        '''Handle the returned response: retry with a new proxy on non-200 status'''
        if response.status != 200:
            proxy = self.get_random_proxy()
            print("this is response ip:" + proxy)
            request.meta['proxy'] = proxy
            # Returning the request tells Scrapy to reschedule it
            return request
        return response

    def get_random_proxy(self):
        '''Fetch one proxy from the API; return a URL like "http://ip:port"'''
        # Proxy API endpoint placeholder: returns one "ip:port" per line
        apiUrl = "http://xxxx"
        proxy = requests.get(apiUrl).text.strip()
        return "http://" + proxy
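Connection failures to a dead proxy surface as exceptions rather than as non-200 responses, so the middleware above never sees them in process_response. A possible addition (a sketch, not part of the original middleware) is a process_exception hook that swaps in a fresh IP and lets Scrapy reschedule the request:

    def process_exception(self, request, exception, spider):
        '''Sketch: on a download error, retry the request through a new proxy'''
        proxy = self.get_random_proxy()
        print("this is exception ip:" + proxy)
        request.meta['proxy'] = proxy
        return request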