用 gevent 创建协程前,我们一般先打猴子补丁(monkey.patch_all),把标准库的阻塞操作替换为协作式版本,这样遇到延时操作时协程能自动切换而不会阻塞。但如果和进程池混合使用,就会在创建池子时程序卡住,不再往下执行。
此时把补丁去掉,并且不要用 gevent.sleep 之类的延时操作,程序就能继续执行;下载这种耗时的 I/O 过程即使不打补丁也不会报错,只是阻塞时不会切换协程。
下面是爬取图片的代码,代码A用的进程加协程爬取:速度明显高于只用协程
代码B用的是协程,时间长了好多
代码A
进程池与协程
from multiprocessing import Pool,Manager
import gevent
import time
# from gevent import monkey
import urllib.request
import re
# 打补丁
# monkey.patch_all() # 有进程池,所以把补丁去掉,要不在创建池子时卡住
def my_download(name, url):
    """Download *url* and save the response body to file *name*.

    The file is written under the hard-coded directory
    /home/python/Desktop/mi_zi_1/.
    """
    # Use context managers so both the HTTP response and the output
    # file are closed deterministically (the original leaked the response).
    with urllib.request.urlopen(url) as resp:
        data = resp.read()
    with open("/home/python/Desktop/mi_zi_1/" + name, "wb") as f:
        f.write(data)
def get_html(q, url, k):
    """Fetch page *url*, extract image URLs, and download them with gevent.

    Puts the number of images found onto queue *q*.  *k* is the page
    index used as the filename prefix ("<k>-<i>.jpg").
    """
    # Fetch the page source; close the response when done.
    with urllib.request.urlopen(url) as resp:
        data = resp.read()
    # Match protocol-relative "large" .jpg links in the page source.
    img_urls = re.findall(r"//.*?large.*?\.jpg", data.decode("utf-8"))
    # Report this page's image count to the caller via the shared queue.
    q.put(len(img_urls))
    # Spawn one greenlet per image.  enumerate replaces the manual counter,
    # and a distinct name avoids shadowing the `url` parameter.
    tasks = [
        gevent.spawn(my_download, str(k) + "-" + str(i) + ".jpg", "http:" + img_url)
        for i, img_url in enumerate(img_urls, start=1)
    ]
    # Wait for every download greenlet to finish.
    gevent.joinall(tasks)
def main():
    """Crawl pages 12-17 of https://jandan.net/ooxx with a process pool.

    Each worker process fetches one page and downloads its images with
    gevent greenlets; the per-page image counts are summed at the end.
    """
    # Build the list of page URLs (comprehension replaces append loop).
    urls = ["https://jandan.net/ooxx/page-" + str(j) + "#comments"
            for j in range(12, 18)]
    # Pool of 3 workers; a Manager queue is shareable across processes.
    po = Pool(3)
    q = Manager().Queue()
    # enumerate supplies the page index k used in image filenames.
    for k, url in enumerate(urls, start=1):
        print(url)
        po.apply_async(get_html, args=(q, url, k))
    # No more tasks; wait for all workers to finish.
    po.close()
    po.join()
    # Sum the per-page image counts reported through the queue.
    count = 0
    while not q.empty():
        count += q.get()
    print(count)
if __name__ == '__main__':
    # Print wall-clock time before and after to measure the total runtime.
    print(time.ctime())
    main()
    print(time.ctime())
代码B
只用协程,代码在上面的基础上改的
from multiprocessing import Pool,Manager
import gevent
import time
from gevent import monkey
import urllib.request
import re
# 打补丁
monkey.patch_all()
def my_download(name, url):
    """Download *url* and save the response body to file *name*.

    The file is written under the hard-coded directory
    /home/python/Desktop/mi_zi_1/.
    """
    # Use context managers so both the HTTP response and the output
    # file are closed deterministically (the original leaked the response).
    with urllib.request.urlopen(url) as resp:
        data = resp.read()
    with open("/home/python/Desktop/mi_zi_1/" + name, "wb") as f:
        f.write(data)
def get_html(q, url, k):
    """Fetch page *url*, extract image URLs, and download them with gevent.

    Puts the number of images found onto queue *q*.  *k* is the page
    index used as the filename prefix ("<k>-<i>.jpg").
    """
    # Fetch the page source; close the response when done.
    with urllib.request.urlopen(url) as resp:
        data = resp.read()
    # Match protocol-relative "large" .jpg links in the page source.
    img_urls = re.findall(r"//.*?large.*?\.jpg", data.decode("utf-8"))
    # Report this page's image count to the caller via the shared queue.
    q.put(len(img_urls))
    # Spawn one greenlet per image.  enumerate replaces the manual counter,
    # and a distinct name avoids shadowing the `url` parameter.
    tasks = [
        gevent.spawn(my_download, str(k) + "-" + str(i) + ".jpg", "http:" + img_url)
        for i, img_url in enumerate(img_urls, start=1)
    ]
    # Wait for every download greenlet to finish.
    gevent.joinall(tasks)
def main():
    """Crawl pages 12-17 of https://jandan.net/ooxx using coroutines only.

    Pages are processed sequentially in this process; within each page,
    gevent greenlets overlap the image downloads (monkey.patch_all makes
    urllib's blocking I/O cooperative).
    """
    # Build the list of page URLs (comprehension replaces append loop).
    urls = ["https://jandan.net/ooxx/page-" + str(j) + "#comments"
            for j in range(12, 18)]
    # Manager queue kept for interface parity with the pool version.
    q = Manager().Queue()
    # enumerate supplies the page index k used in image filenames;
    # the dead commented-out Pool code has been removed.
    for k, url in enumerate(urls, start=1):
        print(url)
        get_html(q, url, k)
    # Sum the per-page image counts reported through the queue.
    count = 0
    while not q.empty():
        count += q.get()
    print(count)
if __name__ == '__main__':
    # Print wall-clock time before and after to measure the total runtime.
    print(time.ctime())
    main()
    print(time.ctime())