Python学习笔记之多进程

一、创建多进程

import multiprocessing
import os
import time
def work(num):
    """Print the current process's name and PID, then sleep for five seconds.

    :param num: value passed in by the caller; the body does not use it
    :return: None
    """
    current = multiprocessing.current_process()
    print("进程%s正在运行, pid=%s" % (current.name, os.getpid()))
    time.sleep(5)
def main():
    """Start ten named child processes running ``work`` and wait for all of them."""
    workers = []
    for idx in range(1, 11):
        proc = multiprocessing.Process(target=work, args=(idx,), name='进程%s' % idx)
        workers.append(proc)
        proc.start()

    # Block until every child has exited.
    for proc in workers:
        proc.join()


if __name__ == '__main__':
    main()

我们运行该程序，在程序运行结束之前，在shell中运行ps -aux | grep .py 命令，会看到由该脚本文件产生的十一个相同的进程（一个主进程和十个子进程）。

我们前面使用过继承threading.Thread类来实现自己的多线程类，同样，在多进程里面，我们也可以通过继承multiprocessing.Process类实现多进程。

class SubProcess(multiprocessing.Process):
    """Process subclass whose ``run`` prints its task number, process name and PID."""

    def __init__(self, name, num):
        """
        :param name: name given to the process
        :param num: task number printed by ``run``
        """
        # Forward ``name`` to the base class. It used to be accepted but
        # dropped, so the process always ran under an auto-generated name.
        super(SubProcess, self).__init__(name=name)
        self.num = num

    def run(self):
        """Print the task number, then the running process's name and PID."""
        print('work-%s' %(self.num))
        # Look up the name of the process this code is running in.
        print("进程%s正在运行, pid=%s" %(multiprocessing.current_process().name,
                                   os.getpid()))

我们通过自己的类的实例化对象就可以创建多进程了。

二、进程间的通信

我们通过队列去实现进程间的通信。

import multiprocessing
import os
import random
import time

class SubProcess1(multiprocessing.Process):
    """Producer process: once per second, tries to enqueue a random integer."""

    def __init__(self,que):
        """
        :param que: queue object that ``run`` fills; must offer ``full`` and ``put``
        """
        super(SubProcess1,self).__init__()
        self.que = que

    def run(self):
        """Loop forever: sleep, report the process, then enqueue or report a full queue."""
        while True:
            time.sleep(1)
            proc_name = multiprocessing.current_process().name
            print('进程%s正在运行，pid = %s' % (proc_name, os.getpid()))
            if self.que.full():
                print('队列满')
                continue
            value = random.randint(1, 10)
            self.que.put(value)
            print('入队%s' % (value))
class SubProcess2(multiprocessing.Process):
    """Consumer process: once per second, tries to dequeue and print one item."""

    def __init__(self,que):
        """
        :param que: queue object that ``run`` drains; must offer ``empty`` and ``get``
        """
        super(SubProcess2,self).__init__()
        self.que = que

    def run(self):
        """Loop forever: report the process, dequeue or report an empty queue, sleep."""
        while True:
            proc_name = multiprocessing.current_process().name
            print('进程%s正在运行，pid = %s' % (proc_name, os.getpid()))
            if self.que.empty():
                print('队列空')
            else:
                print('出队%d' % (self.que.get()))
            time.sleep(1)

# Queue shared by the producer and the consumer; holds at most four items.
que = multiprocessing.Queue(4)


def main():
    """Run one producer and one consumer process against the shared queue."""
    # Construct the producer first: auto-generated process names depend on
    # the order in which Process objects are created.
    producer = SubProcess1(que)
    consumer = SubProcess2(que)
    for proc in (consumer, producer):
        proc.start()
    for proc in (producer, consumer):
        proc.join()
    print('主进程结束')


if __name__ == '__main__':
    main()

三、创建进程池

我们可以通过multiprocessing模块提供的Pool类创建进程池，并使用Manager提供的Queue类创建队列，去实现进程池中各进程之间的通信。

import os
from multiprocessing import Manager
from multiprocessing import Pool
import time
import random



def get_queue(que):
    """Consume from *que* forever, polling once per second.

    :param que: queue object offering ``empty`` and ``get``
    :return: never returns
    """
    while True:
        time.sleep(1)
        if que.empty():
            print('队列空', '   PID:', os.getpid())
            continue
        print('出队%d' % (que.get()), '   PID:', os.getpid())

def put_queue(que):
    """Produce into *que* forever, adding one random integer per second.

    :param que: queue object offering ``full`` and ``put``
    :return: never returns
    """
    while True:
        time.sleep(1)
        if que.full():
            print('队列满', '   PID:', os.getpid())
            continue
        value = random.randint(1, 10)
        que.put(value)
        print('入队%s' % (value), '  PID:', os.getpid())

def main():
    """Run ``put_queue`` and ``get_queue`` as pool tasks sharing one managed queue."""
    # Create the queue through a Manager.
    shared_queue = Manager().Queue()
    pool = Pool()
    for task in (put_queue, get_queue):
        pool.apply_async(task, (shared_queue,))
    # Close the pool so that it accepts no new tasks, then wait for the workers.
    pool.close()
    pool.join()


if __name__ == '__main__':
    main()

使用concurrent.futures 模块中的ProcessPoolExecutor类去快速创建进程池，使用进程池完成指定目录下的所有文件的拷贝工作。

import os
import time
from concurrent.futures import ProcessPoolExecutor
def SearchFileAbsPath(dirname):
    """Collect the absolute path of every file found under *dirname*.

    :param dirname: directory to scan; converted to an absolute path first
    :return: list of file paths gathered from *dirname* and all its subdirectories
    """
    base = os.path.abspath(dirname)
    # os.walk descends into every subdirectory; topdown=False yields the
    # deepest directories before their parents.
    return [
        folder + os.path.sep + fname
        for folder, _subdirs, fnames in os.walk(base, topdown=False)
        for fname in fnames
    ]

def cpyfile(dirname,filename):
    """Copy one file into *dirname*, keeping its base name.

    The file is copied as raw bytes, so binary files and text in any encoding
    are reproduced exactly; a text-mode copy would raise or alter the data
    for such files.

    :param dirname:  directory that receives the copy
    :param filename: path of the file to copy
    :return: None
    """
    target = os.path.join(dirname, os.path.basename(filename))
    # Read the source completely before opening the target, so copying a
    # file onto itself cannot truncate it first.
    with open(filename, 'rb') as src:
        data = src.read()
    with open(target, 'wb') as dst:
        dst.write(data)
    print(filename, '复制成功!!!')
    # One-second pause after each copy; presumably there to make the pool's
    # scheduling visible in the demo.
    time.sleep(1)
def main():
    """Copy every file under the source directory into the target directory.

    The copies run in a pool of three worker processes. Failures raised inside
    a worker are reported instead of being silently discarded.
    """
    source_dir = '/root/PycharmProjects/day13'
    target_dir = '/root/Desktop/hello'
    filenames = SearchFileAbsPath(source_dir)
    # Without the target directory every copy would fail inside the workers.
    os.makedirs(target_dir, exist_ok=True)
    # The with-block shuts the pool down and waits for all submitted tasks.
    with ProcessPoolExecutor(max_workers=3) as pool:
        # Hand one copy task per file to the pool.
        futures = [(filename, pool.submit(cpyfile, target_dir, filename))
                   for filename in filenames]
    # Exceptions raised in a worker are stored on its future; surface them.
    for filename, future in futures:
        error = future.exception()
        if error is not None:
            print(filename, '复制失败:', error)


if __name__ == '__main__':
    main()

四、as_completed()的应用

as_completed可以配合并发执行的多个进程、线程、协程使用：它把这些任务的返回结果变为一个生成器，每次返回的都是一个已经执行完成的任务的结果，不会等待所有的线程、进程、协程执行完才返回。

import time_it
import re
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import as_completed
from urllib import request
def get_url_content(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    :param url: address to open with ``urllib.request.urlopen``
    :return: decoded body as ``str``
    """
    with request.urlopen(url) as response:
        raw_body = response.read()
    text = raw_body.decode('utf-8')
    return text


def get_url_page(pattern,str_infor):
    """Return the first ``re.findall`` match of *pattern* in *str_infor*.

    :param pattern: regular expression to search for
    :param str_infor: text to search
    :return: first element of the ``findall`` result
    :raises IndexError: when the pattern does not match at all
    """
    # Example of the markup this is used on:
    # <a href="/p/2314539885?pn=31">尾页</a>
    matches = re.compile(pattern).findall(str_infor)
    return matches[0]


def get_email_as_compeletd(url):
    """Download *url* and return every ``.com`` e-mail-like string found in it.

    :param url: page address to fetch
    :return: list of matched strings as produced by ``re.findall``
    """
    email_pattern = r'[a-zA-Z0-9]+@\w+\.com'
    page_text = get_url_content(url)
    return re.findall(email_pattern, page_text)


# Baseline without any threads
@time_it.timeit
def no_usethread(url_list):
    """Process every URL one after another in the calling thread.

    :param url_list: iterable of page URLs
    :return: None; the extracted results are discarded
    """
    for page_url in url_list:
        get_email_as_compeletd(page_url)


# Using submit without as_completed
@time_it.timeit
def usesumbitthread(url_list):
    """Submit each URL to a five-thread pool and wait for it straight away.

    :param url_list: iterable of page URLs
    :return: None; the extracted results are discarded
    """
    with ThreadPoolExecutor(max_workers=5) as executor:
        for page_url in url_list:
            pending = executor.submit(get_email_as_compeletd, page_url)
            # Waiting here, before the next submit, means only one task is
            # ever in flight.
            pending.result()


# Using submit together with as_completed
@time_it.timeit
def use_as_completed(url_list):
    """Submit every URL to a five-thread pool and collect results as they finish.

    :param url_list: iterable of page URLs
    :return: None; the extracted results are discarded
    """
    with ThreadPoolExecutor(max_workers=5) as pool:
        # Queue all tasks first so the pool can run them concurrently.
        future_url = [pool.submit(get_email_as_compeletd, urls) for urls in url_list]
        # Iterate inside the with-block: leaving it waits for every task, so
        # a loop placed after it would only ever see already-finished futures.
        for future in as_completed(future_url):
            future.result()
def main():
    """Build the list of page URLs for one thread and time the three crawl variants."""
    url = 'http://tieba.baidu.com/p/2314539885'
    pattern = r'<a href="/p/.*pn=(\d+)">尾页</a>'
    # The "last page" link carries the number of the final page.
    page = int(get_url_page(pattern,get_url_content(url)))
    # Page numbers appear to be 1-based (the last-page link uses pn=<last>),
    # so cover 1..page inclusive; range(page) would never request the last page.
    url_list = [url + '?pn=%s' % (i) for i in range(1, page + 1)]
    usesumbitthread(url_list)
    use_as_completed(url_list)
    no_usethread(url_list)


if __name__ == '__main__':
    main()

我们知道当我们使用pool.submit函数将任务加到线程池中时，pool.submit()函数将会返回一个future对象，当我们的任务函数有返回值的时候，我们需要使用future.result()方法获取返回值。future.result()被调用时python解释器会一直阻塞等待该任务被执行完毕并返回结果，如果每提交一个任务就立即调用future.result()，任务就只能一个接一个地执行，这样就会导致多线程变为单线程。

上面的代码中，我们使用了自己写的time_it模块中的装饰器timeit来求函数运行的时间。我们使用5个线程去爬取贴吧中某个帖子的所有邮箱账号，并使用了三种方式：一种使用多线程但是没有使用as_completed函数，一种使用多线程和as_completed函数，最后还有一种没有使用任何多线程。我们看一下每个函数的运行时间：

输出：

usesumbitthread函数的运行时间为22.565438270568848 s
use_as_completed函数的运行时间为4.77132248878479 s
no_usethread函数的运行时间为26.670533418655396 s

正如上面我们所说，每提交一个任务就立即使用future.result()求取任务函数的返回值，会导致每一个任务都被等待，多线程被迫变为了单线程。没有使用as_completed的版本和没有使用多线程的版本运行时间几乎一样。

注：

当我们需要终止进程时可以使用Process.terminate()方法。