进程 线程 协程 用来爬虫

1.线程

 1 from concurrent.futures import ThreadPoolExecutor
 2 import requests
 3 import time
 4 
 5 def task(url):
 6     """
 7     下载页面
 8     :param url:
 9     :return:
10     """
11     response = requests.get(url)
12     return response
13 
14 # 通过回调函数处理
15 def done(future,*args,**kwargs):
16     response = future.result() #response [200]
17     print(response.status_code,response.content)
18 
19 pool = ThreadPoolExecutor(7)
20 url_list = [
21     'http://www.cnblogs.com/wupeiqi',
22     'http://huaban.com/favorite/beauty/',
23     'http://www.bing.com',
24     'http://www.zhihu.com',
25     'http://www.sina.com',
26     'http://www.baidu.com',
27     'http://www.autohome.com.cn',
28 ]
29 for url in url_list:
30     v = pool.submit(task,url)
31     v.add_done_callback(done)
32 
33 pool.shutdown(wait=True)
多线程+回调函数
 1 from concurrent.futures import ThreadPoolExecutor
 2 import requests
 3 import time
 4 
 5 def task(url):
 6     response = requests.get(url)
 7     print(url,response)
 8     # 写正则表达式 这里直接返回处理
 9 
10 
11 pool = ThreadPoolExecutor(7) # 线程池
12 url_list = [
13     'http://www.cnblogs.com/wupeiqi',
14     'http://huaban.com/favorite/beauty/',
15     'http://www.bing.com',
16     'http://www.zhihu.com',
17     'http://www.sina.com',
18     'http://www.baidu.com',
19     'http://www.autohome.com.cn',
20 ]
21 for url in url_list:
22     pool.submit(task,url)
23 
24 pool.shutdown(wait=True)
多线程不加回调

2. 进程

 1 from concurrent.futures import ProcessPoolExecutor
 2 import requests
 3 
 4 
 5 from multiprocessing import freeze_support
 6 
 7 
 8 def task(url):
 9     response = requests.get(url)
10     print(url,response)
11     # 写正则表达式
12 
13 
14 pool = ProcessPoolExecutor(7)
15 url_list = [
16     'http://www.cnblogs.com/wupeiqi',
17     'http://huaban.com/favorite/beauty/',
18     'http://www.bing.com',
19     'http://www.zhihu.com',
20     'http://www.sina.com',
21     'http://www.baidu.com',
22     'http://www.autohome.com.cn',
23 ]
24 if __name__ == '__main__':
25     freeze_support()
26     for url in url_list:
27         pool.submit(task,url)
28 
29     pool.shutdown(wait=True)
多进程
 1 from concurrent.futures import ProcessPoolExecutor
 2 import requests
 3 from multiprocessing import freeze_support
 4 
 5 def task(url):
 6     response = requests.get(url)
 7     return response
 8 
 9 def done(future,*args,**kwargs):
10     response = future.result()
11     print(response.url, response.status_code,response.content)
12 
13 pool = ProcessPoolExecutor(4)
14 url_list = [
15     'http://www.bing.com',
16     'http://www.sina.com',
17     'http://www.baidu.com',
18     'http://www.autohome.com.cn',
19 ]
20 
21 if __name__ == '__main__':
22     freeze_support()
23     for url in url_list:
24         v = pool.submit(task,url)
25         v.add_done_callback(done) #回调
26 
27     pool.shutdown(wait=True)
多进程+回调函数

3.协程

模块有 asyncio(内置)、gevent、Twisted、Tornado 

 1 import asyncio
 2 @asyncio.coroutine
 3 def task():
 4     print('before....task.....')
 5     yield from asyncio.sleep(5)
 6     print('end...task....')
 7 
 8 
 9 tasks = [task(),task()]
10 
11 loop = asyncio.get_event_loop()
12 loop.run_until_complete(asyncio.gather(*tasks))
13 loop.close()
asyncio
 1 import asyncio
 2 
 3 @asyncio.coroutine
 4 def task(host, url='/'):
 5     print('start',host,url)
 6     reader, writer = yield from asyncio.open_connection(host, 80) # 创建链接
 7 
 8     request_header_content = "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n" % (url, host,) # 封装
 9     request_header_content = bytes(request_header_content, encoding='utf-8')
10 
11     writer.write(request_header_content)
12     yield from writer.drain()
13     text = yield from reader.read()
14     print('end',host, url, text)
15     writer.close()
16 
17 tasks = [
18     task('www.cnblogs.com', '/wupeiqi/'),
19     task('dig.chouti.com', '/pic/show?nid=4073644713430508&lid=10273091')
20 ]
21 
22 loop = asyncio.get_event_loop()
23 results = loop.run_until_complete(asyncio.gather(*tasks))
24 loop.close()
 1 import aiohttp
 2 import asyncio
 3 import time
 4 
 5 @asyncio.coroutine
 6 def fetch_async(url):
 7     print('begin...')
 8     response = yield from aiohttp.request('GET', url) # 替换前面的 "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n" % (url, host,) 封装
 9     time.sleep(5)
10     print(url, response)
11     response.close()
12 
13 
14 tasks = [fetch_async('http://www.baidu.com/'),fetch_async('http://www.chouti.com/')] #
15 
16 event_loop = asyncio.get_event_loop()
17 results = event_loop.run_until_complete(asyncio.gather(*tasks))
18 event_loop.close()
asyncio+aiohttp
 1 import gevent
 2 import requests
 3 from gevent import monkey
 4 monkey.patch_all() #这一句千万不要忘记啦 socket是会阻塞的 monkey 会找到并解决阻塞
 5 
 6 
 7 def task(method, url, req_kwargs):
 8     print(method, url, req_kwargs)
 9     response = requests.request(method=method, url=url, **req_kwargs)
10     print(response.url, response.content)
11 
12 ##### 发送请求 这个请求没有请求限制 #####
13 gevent.joinall([
14     gevent.spawn(task, method='get', url='https://www.python.org/', req_kwargs={}),
15     gevent.spawn(task, method='get', url='https://www.yahoo.com/', req_kwargs={}),
16     gevent.spawn(task, method='get', url='https://github.com/', req_kwargs={}),
17 ])
gevent
 1 import gevent
 2 import requests
 3 from gevent import monkey
 4 monkey.patch_all()
 5 
 6 
 7 def task(method, url, req_kwargs):
 8     print(method, url, req_kwargs)
 9     response = requests.request(method=method, url=url, **req_kwargs)
10     print(response.url, response.content)
11 ##### 发送请求(协程池控制最大协程数量) #####
12 from gevent.pool import Pool
13 pool = Pool(5)
14 gevent.joinall([
15     pool.spawn(task, method='get', url='https://www.python.org/', req_kwargs={}),
16     pool.spawn(task, method='get', url='https://www.yahoo.com/', req_kwargs={}),
17     pool.spawn(task, method='get', url='https://www.github.com/', req_kwargs={}),
18 ])
 1 from twisted.internet import defer
 2 from twisted.web.client import getPage
 3 from twisted.internet import reactor
 4 
 5 def one_done(arg):
 6     print(arg)
 7 
 8 def all_done(arg):
 9     print('done')
10     reactor.stop() # 全部完成之后 stop
11 
12 @defer.inlineCallbacks
13 def task(url):
14     res = getPage(bytes(url, encoding='utf8')) # 发送Http请求 2
15     res.addCallback(one_done)
16     yield res
17 
18 url_list = [
19     'http://www.cnblogs.com',
20     'http://www.baidu.com',
21     'http://www.cnblogs.com',
22     'http://www.cnblogs.com',
23 ]
24 
25 defer_list = [] # [特殊,特殊,特殊(已经向url发送请求)]
26 for url in url_list:
27     # v 标注已经往哪发请求啦
28     v = task(url) # 1
29     defer_list.append(v) # 所有的请求已经发送完 添加到一个列表里面
30 
31 d = defer.DeferredList(defer_list)
32 d.addBoth(all_done) # all_done 只执行一次
33 
34 
35 reactor.run() # 死循环 去defer_list 里面执行
Twisted
 1 from tornado.httpclient import AsyncHTTPClient
 2 from tornado.httpclient import HTTPRequest
 3 from tornado import ioloop
 4 
 5 COUNT = 0
 6 def handle_response(response):
 7     global COUNT
 8     COUNT -= 1
 9     if response.error:
10         print("Error:", response.error)
11     else:
12         print(response.body)
13         # 方法同twisted
14         # ioloop.IOLoop.current().stop()
15     if COUNT == 0:
16         ioloop.IOLoop.current().stop()  #COUNT 计数器终止循环
17 
18 def func(): # 步骤2
19     url_list = [
20         'http://www.baidu.com',
21         'http://www.bing.com',
22     ]
23     global COUNT
24     COUNT = len(url_list)
25     for url in url_list:
26         print(url)
27         http_client = AsyncHTTPClient() # 创建一个对象
28         http_client.fetch(HTTPRequest(url), handle_response) # 如果HTTPRequest 下载完成 就会执行 handle_response
29 
30 
31 ioloop.IOLoop.current().add_callback(func) # 步骤1
32 ioloop.IOLoop.current().start() # 死循环
Tornado

 自定义异步IO框架

  1 import socket
  2 import select
  3 
  4 # everything below is client-side code
  5 
  6 # ########################## essence of an HTTP request: blocking ##########################
  7 """
  8 sk = socket.socket()
  9 # 1.连接
 10 sk.connect(('www.baidu.com',80,)) # IO阻塞
 11 print('连接成功了...')
 12 
 13 # 2. 连接成功发送消息
 14 sk.send(b'GET / HTTP/1.0\r\nHost:www.baidu.com\r\n\r\n')
 15 # sk.send(b'POST / HTTP/1.0\r\nHost:www.baidu.com\r\n\r\nk1=v1&k2=v2') #sk.recv(8096) 表示最多接受的消息大小
 16 
 17 # 3. 等待着服务端响应
 18 data = sk.recv(8096) # IO阻塞
 19 print(data)
 20 
 21 # 关闭连接
 22 sk.close()
 23 """
 24 # ########################## essence of an HTTP request: non-blocking ##########################
 25 """
 26 sk = socket.socket()
 27 sk.setblocking(False)
 28 # 1.连接
 29 try:
 30     sk.connect(('www.baidu.com',80,)) # IO阻塞
 31     print('连接成功了...')
 32 except BlockingIOError as e:
 33     print(e)
 34 # 2. 连接成功发送消息
 35 sk.send(b'GET / HTTP/1.0\r\nHost:www.baidu.com\r\n\r\n')
 36 # sk.send(b'POST / HTTP/1.0\r\nHost:www.baidu.com\r\n\r\nk1=v1&k2=v2')
 37 
 38 # 3. 等待着服务端响应
 39 data = sk.recv(8096) # IO阻塞
 40 print(data)
 41 
 42 # 关闭连接
 43 sk.close()
 44 """
 45 
 46 class MyHttpRequest:
 47     def __init__(self,sk,host,callback):
 48         self.socket = sk
 49         self.host = host
 50         self.callback = callback
 51     def fileno(self):
 52         return self.socket.fileno()
 53 
 54 # 对响应数据的处理
 55 class MyHttpResponse: 
 56     def __init__(self,recv_data):
 57         self.recv_data = recv_data
 58         self.header_dict = {} # 响应头
 59         self.body = None # 响应体
 60 
 61         self.initialize()
 62     def initialize(self):
 63         headers, body = self.recv_data.split(b'\r\n\r\n', 1)
 64         self.body = body
 65         header_list = headers.split(b'\r\n')
 66         for h in header_list: # h是字节类型
 67             h_str = str(h,encoding='utf-8')#转成str类型
 68             v = h_str.split(':',1)
 69             if len(v) == 2:
 70                 self.header_dict[v[0]] = v[1]
 71                 
 72 
 73 
 74 class AsyncRequest:
 75     def __init__(self):
 76         self.conn = []
 77         self.connection = [] # 用于检测是否已经连接成功
 78 
 79     def add_request(self,host,callback):
 80         try:
 81             sk = socket.socket()
 82             sk.setblocking(0) #这里设置为0  connect会报错BlockingIOError 设置为非阻塞
 83             sk.connect((host,80,))
 84         except BlockingIOError as e:
 85             pass
 86         request = MyHttpRequest(sk,host,callback)
 87         self.conn.append(request)
 88         self.connection.append(request)
 89 
 90     def run(self): # 事件循环 while True
 91 
 92         while True:
 93             rlist,wlist,elist = select.select(self.conn,self.connection,self.conn,0.05) #connection 表示连接的列表 conn表示接受
 94             for w in wlist: #此时w是MyHttpRequest对象
 95                 print(w.host,'连接成功...')
 96                 # 只要能循环到,表示socket和服务器端已经连接成功
 97                 tpl = "GET / HTTP/1.0\r\nHost:%s\r\n\r\n"  %(w.host,)
 98                 w.socket.send(bytes(tpl,encoding='utf-8'))
 99                 self.connection.remove(w) # 已经链接成功 就移除
100             for r in rlist:
101                 # r,是MyHttpRequest
102                 recv_data = bytes()
103                 while True: # 循环接受数据
104                     try:
105                         data = r.socket.recv(8096)
106                         recv_data += data
107                     except Exception as e:
108                         break
109                 response = MyHttpResponse(recv_data)
110                 r.callback(response)
111                 r.socket.close()
112                 self.conn.remove(r)
113             if len(self.conn) == 0:
114                 break
115 
def f1(response):
    # simulated post-processing: would persist the result to a file
    print('保存到文件', response.header_dict)


def f2(response):
    # simulated post-processing: would persist the result to a database
    print('保存到数据库', response.header_dict)
121 
# one entry per target site: where to connect and what to do with the response
url_list = [
    {'host': 'www.baidu.com', 'callback': f1},
    {'host': 'cn.bing.com', 'callback': f2},
    {'host': 'www.cnblogs.com', 'callback': f2},
]

req = AsyncRequest()
for item in url_list:
    req.add_request(item['host'], item['callback'])

req.run()
select_IO多路复用+非阻塞socket

猜你喜欢

转载自www.cnblogs.com/tangkaishou/p/9236457.html
今日推荐