当运行代码时, 抛出如下异常:
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/connection.py", line 206, in send
self._send_bytes(_ForkingPickler.dumps(obj))
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/reduction.py", line 51, in dumps
cls(buf, protocol).dump(obj)
TypeError: can't pickle _thread.lock objects
def done():
    """Fan one task per mongo collection out to a 3-process pool (failing version).

    This is the variant that raises
    ``TypeError: can't pickle _thread.lock objects``: the collection handle
    ``mg_cursor`` is passed as a task argument, and the pool has to pickle
    every argument before handing it to a worker process.
    """
    # Load the company matching data through the cache layer.
    data_df = redis_cache()
    # Get (1) the mongo database object and (2) the news collection names.
    mg_db, coll_names_list = get_mongo_coll_names()
    # MySQL write target; neither `uri` nor `post` is used again in this function.
    uri = "数据库地址"
    post = {
        column: []
        for column in ('id', 'title', 'ctime', 'gtime', 'url', 'contents', 'company',
                       'collection_name')
    }
    worker_count = 3
    # Process pool.
    from multiprocessing import Pool
    pool = Pool(worker_count)
    ret_list = []
    for coll_name in coll_names_list:
        mg_cursor = mg_db[coll_name]
        # NOTE: handing `mg_cursor` to the pool is what triggers the pickling error.
        ret_list.append(pool.apply_async(func=process_task, args=(mg_cursor, data_df)))
    # No more tasks will be submitted; wait for the workers to finish.
    pool.close()
    pool.join()
解决方式:
后来查看发现, 进程池在把任务发送给子进程之前, 会使用pickle模块(用于在python对象和字节流之间进行序列化/反序列化)对参数进行封装处理: 从上面的异常栈可以看到, 它通过 _ForkingPickler.dumps(obj) 调用了 dump(obj) 方法.
出现这个问题是因为我在参数传递中传递了我自定义的数据库存储类对象mg_cursor. 从报错信息看, 该对象内部包含线程锁(_thread.lock), 而线程锁无法被pickle序列化, 造成进程池内部处理封装过程无法对其进行处理.
解决办法: 在参数传递中去掉自定义的类对象mg_cursor, 只传递集合名列表和索引等普通数据, 而是将自定义类对象的创建(连接mongo并获取集合)放到处理函数中去.
def progress_task(coll_names_list, n, data_df):
    """
    Multi-process task: process every document of one mongo collection.

    The mongo client is created here, inside the worker process, instead of
    receiving a collection object as an argument, so the process pool never
    has to pickle it.

    :param coll_names_list: mongo collection names
    :param n: index in ``coll_names_list`` of the collection to process
    :param data_df: company matching list, passed to ``thread_task`` with each document
    """
    mg_cli = pymongo.MongoClient(MONGO_URL)
    try:
        mg_db = mg_cli[MONGO_DB]
        mg_cursor = mg_db[coll_names_list[n]]
        with ThreadPoolExecutor(max_workers=15) as pool:
            # One thread task per document returned by find().
            all_task = [pool.submit(thread_task, data, data_df) for data in mg_cursor.find()]
            for task in as_completed(all_task):
                # result() re-raises any exception raised inside thread_task.
                task.result()
    finally:
        # Pool workers are reused across tasks; close the client so each task
        # does not leave its connections open behind it.
        mg_cli.close()
def done():
    """Run ``progress_task`` once per mongo collection in a 3-process pool.

    Only the collection-name list, an index and ``data_df`` are passed to the
    workers; each worker opens its own mongo connection.

    :raises Exception: re-raises the first exception raised by a worker task
    """
    # Load the company matching data through the cache layer.
    data_df = redis_cache()
    # Get the news mongo collection names.
    coll_names_list = get_mongo_coll_names()
    # Process pool.
    pool = Pool(3)
    ret_list = [
        pool.apply_async(func=progress_task, args=(coll_names_list, n, data_df))
        for n in range(len(coll_names_list))
    ]
    # No more tasks will be submitted; wait for all workers to finish.
    pool.close()
    pool.join()
    # apply_async stores worker exceptions in the AsyncResult; without get()
    # a failed task would go unnoticed.
    for pool_ret in ret_list:
        pool_ret.get()