28 Apr 18
1. Asynchronous + callback mechanism
a. Problem introduction
question:
1) Task return values cannot be processed as they become available; processing must wait until all tasks have finished and is then done in one batch
2) The parsing process is executed serially. It takes 2s to parse once, and 18s to parse 9 times.
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
import os  # was 'import them': garbled translation of 'import os' (os.getpid is used below)
import requests
import time
import random


def get(url):
    """Download *url* and return the response body on HTTP 200, else None."""
    print('%s GET %s' % (os.getpid(), url))
    response = requests.get(url)
    time.sleep(random.randint(1, 3))  # simulate extra network latency
    if response.status_code == 200:
        return response.text


def pasrse(res):  # NOTE: misspelling of 'parse' kept -- the surrounding notes use this name
    """Parse the downloaded text (here: just report its length)."""
    print('%s parsing result is: %s' % (os.getpid(), len(res)))


if __name__ == '__main__':
    urls = [
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.python.org',
    ]
    pool = ProcessPoolExecutor(4)
    objs = []
    for url in urls:
        objs.append(pool.submit(get, url))
    # Problem being illustrated: we must wait for EVERY download to finish
    # before any result is parsed, and parsing then runs serially here.
    pool.shutdown(wait=True)
    for obj in objs:
        res = obj.result()
        pasrse(res)  # was 'spend(res)': undefined name, clearly meant pasrse
b. Improved solution: it solves the above two problems, but the information-fetching function `get` and the parsing function `pasrse` are coupled together
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
import requests
import os  # was 'import them': garbled translation of 'import os' (os.getpid is used below)
import time
import random


def get(url):
    """Download *url*; on HTTP 200 parse the body immediately in this worker.

    This fixes both earlier problems (results are handled as soon as they
    arrive, and parsing runs in the pool workers instead of serially in the
    main process) -- but it couples fetching and parsing together.
    """
    print('%s GET %s' % (os.getpid(), url))
    response = requests.get(url)
    time.sleep(random.randint(1, 3))  # simulate extra network latency
    if response.status_code == 200:
        pasrse(response.text)


def pasrse(res):
    """Parse the downloaded text (here: just report its length)."""
    print('%s parsing result is: %s' % (os.getpid(), len(res)))


if __name__ == '__main__':
    urls = [
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.python.org',
    ]
    pool = ProcessPoolExecutor(4)
    for url in urls:
        pool.submit(get, url)
c1. Final solution (process version): it solves the above two problems and also decouples the fetching function `get` from the parsing function `pasrse`
The main process acts as the executor of the callbacks
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
import requests
import os  # was 'import them': garbled translation of 'import os' (os.getpid is used below)
import time
import random


def get(url):
    """Download *url* and return the body on HTTP 200 (parsing happens in a callback)."""
    print('%s GET %s' % (os.getpid(), url))
    response = requests.get(url)
    time.sleep(random.randint(1, 3))  # simulate extra network latency
    if response.status_code == 200:
        # Do parsing work elsewhere -- just hand the text back.
        return response.text


def pasrse(obj):
    """Done-callback: receives the finished Future itself, so it must take exactly one argument."""
    res = obj.result()
    print('%s parsing result is: %s' % (os.getpid(), len(res)))


if __name__ == '__main__':
    urls = [
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.python.org',
    ]
    pool = ProcessPoolExecutor(4)
    for url in urls:
        obj = pool.submit(get, url)
        # With a ProcessPoolExecutor the callback runs in the MAIN process.
        obj.add_done_callback(pasrse)
    print('Main process', os.getpid())
c2. Final solution (thread version): it solves the above two problems and also decouples the fetching function `get` from the parsing function `pasrse`
Whichever pool thread is idle becomes the executor of the callback
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from threading import current_thread
import requests
import os  # was 'import them': garbled translation of 'import os'
import time
import random


def get(url):
    """Download *url* and return the body on HTTP 200 (parsing happens in a callback)."""
    print('%s GET %s' % (current_thread().name, url))
    response = requests.get(url)
    time.sleep(random.randint(1, 3))  # simulate extra network latency
    if response.status_code == 200:
        # Do parsing work elsewhere -- just hand the text back.
        return response.text


def pasrse(obj):
    """Done-callback: receives the finished Future; an idle pool thread runs it."""
    res = obj.result()
    print('%s parsing result is: %s' % (current_thread().name, len(res)))


if __name__ == '__main__':
    urls = [
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.baidu.com',
        'https://www.python.org',
    ]
    pool = ThreadPoolExecutor(4)
    for url in urls:
        obj = pool.submit(get, url)
        obj.add_done_callback(pasrse)
    print('Main thread', current_thread().name)
2. Thread queue
import queue

# Bounded FIFO queue: items come out in insertion order.
q = queue.Queue(3)
for item in (1, 2, 3):
    q.put(item)
# q.put(4)  # would block: the queue already holds maxsize items
for _ in range(3):
    print(q.get())

# Bounded LIFO queue (a stack): last in, first out.
q = queue.LifoQueue(3)
for item in ('a', 'b', 'c'):
    q.put(item)
for _ in range(3):
    print(q.get())

# Priority queue: store (priority, payload) tuples; the smallest
# priority number is retrieved first.
q = queue.PriorityQueue(3)
for entry in ((10, 'user1'), (-3, 'user2'), (-2, 'user3')):
    q.put(entry)
for _ in range(3):
    print(q.get())
3. Thread event
a. Case 1: After waiting for check to reset the value in the event, connect continues to run from event.wait()
from threading import Event, current_thread, Thread
import time

# One shared Event: a global flag that every thread can watch.
event = Event()


def check():
    """Simulate a health check, then flip the shared event flag."""
    print('%s is checking whether the service is normal....' % current_thread().name)
    time.sleep(3)
    event.set()  # wakes up every thread blocked in event.wait()


def connect():
    """Block until check() signals that the service is up, then proceed."""
    print('%s waiting for connection...' % current_thread().name)
    event.wait()  # event.wait(1) would instead give up after one second
    print('%s start connecting...' % current_thread().name)


if __name__ == '__main__':
    workers = [Thread(target=connect) for _ in range(3)]
    workers.append(Thread(target=check))
    for worker in workers:
        worker.start()
b. Case 2: Give up after three connection attempts
from threading import Event, current_thread, Thread
import time

event = Event()


def check():
    """Simulate a slow health check, then signal readiness."""
    print('%s is checking whether the service is normal....' % current_thread().name)
    time.sleep(5)
    event.set()


def connect():
    """Retry once per second; give up after three failed attempts."""
    for attempt in range(1, 4):
        if event.is_set():
            break
        print('%s trying to connect for the %sth time...' % (current_thread().name, attempt))
        event.wait(1)  # wait at most one second per attempt
    else:
        # Loop ran out of attempts without a break.
        if not event.is_set():
            print('Too many attempts, please try again later')
            return
    print('%s start connecting...' % current_thread().name)


if __name__ == '__main__':
    workers = [Thread(target=connect) for _ in range(3)]
    workers.append(Thread(target=check))
    for worker in workers:
        worker.start()
4. Coroutines
1. Implement concurrency under a single thread: coroutines (in order to improve efficiency; but not to say that all coroutines will improve efficiency)
Concurrency refers to multiple tasks that appear to be running at the same time; the essence of concurrent implementation: switching + saving state
An effective coroutine "deceives" the CPU to some extent: through its own internal scheduling it switches to another of its tasks as soon as one hits IO, so the CPU sees the program as continuously busy; the program therefore spends more time in the ready or running state and claims a larger share of CPU time.
2. Three ways to achieve concurrency:
a) Concurrency under a single thread; controlled by the program itself, relatively fast
b) Concurrency under multithreading; controlled by the operating system, relatively slow
c) Concurrency under multi-process; controlled by the operating system, relatively slow
3. Because yield saves state, two tasks can switch back and forth directly, giving the effect of concurrency (but yield will NOT switch automatically when it encounters a blocking call)
PS: If printing is added to each task, it is obvious that the printing of the two tasks is you and I, that is, concurrent execution.
import time


def consumer():
    """Task 1: receive data pushed in via .send() and (pretend to) process it."""
    while True:
        received = yield  # paused here between sends


def producer():
    """Task 2: produce data, handing every item to the consumer."""
    gen = consumer()
    next(gen)  # prime the generator so it is parked at its yield
    for item in range(10000000):
        gen.send(item)


start = time.time()
producer()  # measured around 1.02s: the cost is pure switching + state saving
stop = time.time()
print(stop - start)
# Two pure-computation tasks executed "concurrently" by switching on yield.
import time


def task1():
    """Running sum that hands control back to the caller after every step."""
    total = 1
    for step in range(1000000):
        total += step
        yield
    time.sleep(10000)  # yield does NOT skip blocking calls automatically
    print('task1')


def task2():
    """Running product that switches into task1 after every step."""
    gen = task1()
    product = 1
    for step in range(1000000):
        product *= step
        next(gen)
    print('task2')


start = time.time()
task2()
stop = time.time()
print(stop - start)
5. Implementing IO switching under a single thread
1. Use greenlet (encapsulate yield, not automatically cut when IO is encountered)
from greenlet import greenlet
import time


def eat(name):
    print('%s eat 1' % name)
    time.sleep(30)  # plain blocking call: greenlet will NOT switch away on its own
    player.switch('alex')  # an argument is only delivered on the very first switch
    print('%s eat 2' % name)
    player.switch()


def play(name):
    print('%s play 1' % name)
    eater.switch()
    print('%s play 2' % name)


eater = greenlet(eat)
player = greenlet(play)
eater.switch('egon')  # start the first coroutine, passing its argument
2. Using the gevent module (wraps greenlet; without any patching it switches automatically only on gevent's own IO, e.g. gevent.sleep)
import gevent  # was 'import guy': garbled translation of 'import gevent'


def eat(name):
    print('%s eat 1' % name)
    # gevent's own IO: triggers an automatic switch to another greenlet.
    # A plain time.sleep(5) here would NOT switch.
    gevent.sleep(5)
    print('%s eat 2' % name)


def play(name):
    print('%s play 1' % name)
    gevent.sleep(3)  # was 'vent.sleep(3)': typo for gevent.sleep
    print('%s play 2' % name)


g1 = gevent.spawn(eat, 'egon')
g2 = gevent.spawn(play, 'alex')
# gevent.sleep(100)   # alternative ways to keep the main greenlet alive:
# g1.join()
# g2.join()
gevent.joinall([g1, g2])  # was 'vent.joinall': typo for gevent.joinall
3. Using the gevent module (wraps greenlet; with monkey-patching it also switches automatically on ordinary blocking IO such as time.sleep)
# monkey.patch_all() must run before the other imports so that stdlib
# blocking calls (time.sleep, socket, ...) become gevent-aware.
from gevent import monkey; monkey.patch_all()
from threading import current_thread
import gevent  # was 'import guy': garbled translation of 'import gevent'
import time


def eat():
    print('%s eat 1' % current_thread().name)
    time.sleep(5)  # patched by monkey: now yields control like gevent.sleep
    print('%s eat 2' % current_thread().name)


def play():
    print('%s play 1' % current_thread().name)
    time.sleep(3)
    print('%s play 2' % current_thread().name)


g1 = gevent.spawn(eat)
g2 = gevent.spawn(play)
# gevent.sleep(100)   # alternative ways to wait for the greenlets:
# g1.join()
# g2.join()
print(current_thread().name)
gevent.joinall([g1, g2])  # was 'vent.joinall': typo for gevent.joinall