Bulk-writing documents from MongoDB into Elasticsearch

import pymongo
import math
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import time

HOST = ['ip:端口']  # ES hosts; may be a list ["ip:port", "ip:port"] for a cluster
es = Elasticsearch(HOST, timeout=3600)  # connect to Elasticsearch
client = pymongo.MongoClient("")  # connect to MongoDB (fill in the URI)
db = client["blue_book_news_dev"]["blue_book_news"]

PAGE_SIZE = 500  # documents fetched from Mongo / sent to ES per bulk request

# Count the documents so we can page through the collection.
# NOTE: Collection.count() is deprecated/removed in modern pymongo;
# count_documents({}) is the supported equivalent.
nums = db.count_documents({})
print(nums)
pages = math.ceil(nums / PAGE_SIZE)
_index = "ai51_main_prod"
start_time = time.time()
for i in range(pages):
    n = PAGE_SIZE * i
    print("第{}多少个500,第{}条".format(i, n))
    # Fetch one page, projecting only the fields we actually index.
    docs = list(
        db.find(
            {},
            projection={'_id': False, 'news_url': True, "content": True,
                        "title": True, "publish_time": True},
        ).skip(n).limit(PAGE_SIZE)
    )
    # Accumulate ALL actions for this page, then issue ONE bulk request.
    # (The original reset `actions` and called helpers.bulk() inside the
    # per-document loop, sending one-element "bulk" requests and defeating
    # batching entirely.)
    actions = []
    for line in docs:
        if line.get("news_url"):  # skip documents without a usable id
            actions.append({
                "_index": _index,
                "_type": "sources",  # mapping type
                "_id": line["news_url"],  # use the URL as a stable doc id (idempotent re-runs)
                "_source": {
                    "page_category": None,
                    "url": line.get("news_url"),
                    "article_title": line.get("title"),
                    "article_content": line.get("content"),
                    "publish_time_raw": line.get("publish_time"),
                    "publish_time_nomalized": None,
                    "summary": None,
                },
            })
    if actions:
        helpers.bulk(es, actions)  # one bulk write per page

end_time = time.time()
# Elapsed seconds (the original printed start - end, a negative number).
print(end_time - start_time)

  

Reprinted from www.cnblogs.com/wang102030/p/11950531.html