浏览器/爬虫都是socket 客户端
如何提高并发? 多线程: 适合 IO 密集型任务; 多进程: 适合计算密集型任务
一.IO多路复用
作用:检测多个socket 是否发生变化(是否连接成功/是否获取数据) (可读/可写)
requests 模块
ret=requests.get('https://www.baidu.com/s?wd=alex') # DNS resolution: the domain name is resolved to an IP; the part after it (wd=alex) is the search keyword
socket
# Send an HTTP request over a raw socket and read the whole response.
import socket

# The context manager closes the socket even if connect/send/recv raises.
with socket.socket() as client:
    # Connect to the same host that is named in the Host header below.
    client.connect(("www.baidu.com", 80))
    client.sendall(b'GET /s?wd=alex HTTP/1.0\r\nhost:www.baidu.com\r\n\r\n')

    lst = []
    while 1:
        # recv() must be called on every iteration; an empty result means
        # the server closed the connection, i.e. the response is complete.
        data = client.recv(8096)
        if not data:
            break
        lst.append(data)

# Join the received chunks into one bytes object, then decode it to text.
body = b"".join(lst)
text = body.decode("utf8")
b"".join(lst) 把列表中的多个 bytes 片段拼接成一个 bytes 对象, 之后再用 decode 转成字符串
2.多线程解决并发
import socket
import threading

import requests  # kept from the original snippet; not used below


def func(key):
    """Fetch the Baidu search page for one keyword over a raw socket.

    key: the search keyword, as str or bytes.
    Returns the complete HTTP response (headers and body) decoded as UTF-8.
    """
    # bytes %-formatting only accepts bytes-like values for %s.
    if isinstance(key, str):
        key = key.encode("utf8")

    # The context manager closes the socket even if an error is raised.
    with socket.socket() as client:
        # Connect to the same host that is named in the Host header below.
        client.connect(("www.baidu.com", 80))
        client.sendall(b'GET /s?wd=%s HTTP/1.0\r\nhost:www.baidu.com\r\n\r\n' % key)

        lst = []
        while 1:
            # recv() must be called on every iteration; an empty result
            # means the server closed the connection.
            data = client.recv(8096)
            if not data:
                break
            lst.append(data)

    return b"".join(lst).decode("utf8")


keywords = ["zq", "wa", "wd"]

for item in keywords:
    # args must be a tuple: "(item)" is just the string itself, which Thread
    # would unpack into one positional argument per character.
    t1 = threading.Thread(target=func, args=(item,))
    t1.start()
缺点: 每个线程仍然会阻塞等待: 客户端在 connect、recv 时必须等待服务端响应。怎样才能不等待呢?
IO多路复用+socket 实现并发请求 (一个线程100个请求)
这里用到 client.setblocking(False)
client = socket.socket()
# Switch the socket to non-blocking mode: calls that used to block now
# return at once and report "not finished yet" by raising an error.
client.setblocking(False)

# Connecting to Baidu is one of the calls that would normally block.
# Here connect() is still executed, but it raises BlockingIOError, which
# has to be caught so the program can carry on.
try:
    client.connect(('www.baidu.com', 80))
except BlockingIOError:
    pass
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
IO多路复用+socket 实现并发请求
import select
import socket

# Request to send to each host once its connection has been established.
REQUESTS = {
    'www.baidu.com': b'GET /s?wd=alex HTTP/1.0\r\nhost:www.baidu.com\r\n\r\n',
    'www.sogou.com': b'GET /web?query=fdf HTTP/1.0\r\nhost:www.sogou.com\r\n\r\n',
    'www.oldboyedu.com': b'GET /s?wd=alex HTTP/1.0\r\nhost:www.oldboyedu.com\r\n\r\n',
}

pending_request = {}  # socket -> request bytes that still have to be sent
received = {}         # socket -> list of response chunks read so far

for host, request in REQUESTS.items():
    client = socket.socket()
    client.setblocking(False)  # connect() below no longer waits
    try:
        client.connect((host, 80))
    except BlockingIOError:
        # Expected for a non-blocking socket: the connection is in progress.
        pass
    pending_request[client] = request
    received[client] = []

socket_list = list(received)       # sockets watched for "readable"
conn_list = list(pending_request)  # sockets watched for "writable"

# Loop until every socket has been finished and removed from socket_list.
while socket_list:
    # rlist: sockets that have data to read (or were closed by the peer).
    # wlist: sockets that became writable, i.e. the connection is ready.
    # []:    no sockets are watched for exceptional conditions.
    # 0.005: wait at most 0.005 seconds per check.
    rlist, wlist, elist = select.select(socket_list, conn_list, [], 0.005)

    for sk in wlist:
        # The request is sent only once, so stop watching for writability.
        conn_list.remove(sk)
        try:
            sk.sendall(pending_request.pop(sk))
        except OSError as e:
            # A failed connection must not abort the other downloads.
            print('request failed:', e)
            received.pop(sk, None)
            sk.close()
            socket_list.remove(sk)

    for sk in rlist:
        if sk.fileno() == -1:
            continue  # already closed by the error handling above

        finished = False
        while True:
            try:
                chunk = sk.recv(8096)
            except BlockingIOError:
                # Nothing more to read right now; the rest of the response
                # is collected when select() reports this socket again.
                break
            except OSError as e:
                print('receive failed:', e)
                finished = True
                break
            if not chunk:
                # Empty read: the server closed the connection, so the
                # response is complete.
                finished = True
                break
            received[sk].append(chunk)

        if finished:
            body = b''.join(received.pop(sk))
            # print(body.decode('utf-8'))
            print('------------>', body)
            sk.close()
            socket_list.remove(sk)
            if sk in conn_list:
                conn_list.remove(sk)
用面向对象做IO多路复用
import select
import socket


class Req(object):
    """One in-flight HTTP request: a socket, its callback and its buffer.

    sk:   the (non-blocking) socket used for this request.
    func: callback invoked with the complete response bytes.
    host: value of the Host header.
    path: request path including the query string.

    The defaults for host and path reproduce the request the original
    two-argument form always sent.
    """

    def __init__(self, sk, func, host='www.baidu.com', path='/s?wd=alex'):
        self.sock = sk
        self.func = func
        self.host = host
        self.path = path
        self.chunks = []  # response pieces collected across select() rounds

    def fileno(self):
        """Return the socket's file descriptor so select() accepts this object."""
        return self.sock.fileno()

    def request(self):
        """Return the HTTP/1.0 GET request for this object's host and path."""
        text = 'GET %s HTTP/1.0\r\nhost:%s\r\n\r\n' % (self.path, self.host)
        return text.encode('utf-8')


class Nb(object):
    """Runs several HTTP requests concurrently in one thread using select()."""

    def __init__(self):
        self.conn_list = []    # requests whose request bytes are not sent yet
        self.socket_list = []  # requests still waiting for their response

    def add(self, url, func, path='/s?wd=alex'):
        """Start a non-blocking connection to host *url* on port 80.

        url:  host name to connect to; also used as the Host header.
        func: callback that receives the complete response bytes.
        path: request path; defaults to the path the original code sent.
        """
        client = socket.socket()
        client.setblocking(False)  # non-blocking
        try:
            client.connect((url, 80))
        except BlockingIOError:
            # Expected for a non-blocking socket: connection in progress.
            pass
        obj = Req(client, func, host=url, path=path)
        self.conn_list.append(obj)
        self.socket_list.append(obj)

    def run(self):
        """Drive all added requests until each one has finished or failed."""
        while self.socket_list:
            rlist, wlist, elist = select.select(
                self.socket_list, self.conn_list, [], 0.005)

            # wlist holds the Req objects whose connection is ready.
            for req in wlist:
                # The request is sent only once.
                self.conn_list.remove(req)
                try:
                    req.sock.sendall(req.request())
                except OSError as e:
                    # One failed host must not abort the other downloads.
                    print('request failed:', req.host, e)
                    self._drop(req)

            for req in rlist:
                if req not in self.socket_list:
                    continue  # dropped by the error handling above
                if self._read_available(req):
                    body = b''.join(req.chunks)
                    # print(body.decode('utf-8'))
                    req.func(body)
                    self._drop(req)

    def _read_available(self, req):
        """Read what is available now; return True once the response has ended."""
        while True:
            try:
                chunk = req.sock.recv(8096)
            except BlockingIOError:
                # No more data for now; keep the socket open and wait for
                # select() to report it again instead of truncating.
                return False
            except OSError as e:
                print('receive failed:', req.host, e)
                return True
            if not chunk:
                # Empty read: the server closed the connection.
                return True
            req.chunks.append(chunk)

    def _drop(self, req):
        """Close a request's socket and stop watching it."""
        req.sock.close()
        if req in self.socket_list:
            self.socket_list.remove(req)
        if req in self.conn_list:
            self.conn_list.remove(req)


def baidu_repsonse(body):
    """Print the response downloaded from Baidu."""
    print('百度下载结果:', body)


def sogou_repsonse(body):
    """Print the response downloaded from Sogou."""
    print('搜狗下载结果:', body)


def oldboyedu_repsonse(body):
    """Print the response downloaded from oldboyedu."""
    print('老男孩下载结果:', body)


if __name__ == '__main__':
    t1 = Nb()
    t1.add('www.baidu.com', baidu_repsonse)
    t1.add('www.sogou.com', sogou_repsonse, path='/web?query=fdf')
    t1.add('www.oldboyedu.com', oldboyedu_repsonse)
    t1.run()