#-*- coding:utf8 -*-
from elasticsearch import Elasticsearch, helpers
import json
import pdb
class ElasticsearchService:
    """Thin convenience wrapper around an ``elasticsearch.Elasticsearch`` client.

    Every method catches ``Exception`` raised by the underlying client call,
    prints the error and returns an empty/zero fallback value instead of
    raising.  ``KeyboardInterrupt`` and ``SystemExit`` are deliberately NOT
    caught so the process can still be interrupted.
    """

    def __init__(self, hosts):
        """Create the underlying client.

        :param hosts: host list forwarded unchanged to ``Elasticsearch``.
        """
        self.__elasticsearch = Elasticsearch(
            hosts,
            sniff_on_start=True,
            sniff_on_connection_fail=True,
            sniffer_timeout=60,
            timeout=30,
            retry_on_timeout=True,
            max_retries=5
        )

    def search_scroll(self, index, doc_type, query):
        """Run the initial search of a scroll session (scroll kept for "1m").

        :param index: index name to search.
        :param doc_type: unused; kept only so existing callers keep working.
        :param query: complete request body (e.g. ``query``, ``_source``,
            ``size`` keys), sent as the request ``body``.
        :return: the client's response, or ``{}`` if the request failed.
        """
        try:
            # The caller hands over a whole request body, so it must go in
            # ``body=`` (as in delete_by_query / update_by_query), not ``query=``.
            return self.__elasticsearch.search(
                index=index,
                body=query,
                search_type="query_then_fetch",
                scroll="1m"
            )
        except Exception as e:
            print(str(e))
        return {}

    def scroll_scan(self, scroll, scroll_id):
        """Fetch the next page of an open scroll session.

        :param scroll: keep-alive for the scroll context (e.g. ``"1m"``).
        :param scroll_id: scroll id taken from the previous response.
        :return: the client's response, or ``{}`` if the request failed.
        """
        try:
            # Keyword arguments: the positional order of ``scroll`` differs
            # between client versions and never starts with ``scroll``.
            return self.__elasticsearch.scroll(scroll_id=scroll_id, scroll=scroll)
        except Exception as e:
            print(str(e))
        # Empty dict (not list) so callers can use ``.get`` just like on the
        # result of search_scroll.
        return {}

    def delete_by_query(self, index, query):
        """Delete the documents matching ``query``.

        :param index: index name.
        :param query: request body passed as ``body``.
        :return: the ``deleted`` field of the response, or ``0`` on failure.
        """
        try:
            response = self.__elasticsearch.delete_by_query(index=index, body=query)
            return response.get('deleted')
        except Exception as e:
            print('delete fail: ' + str(e))
        return 0

    def update_by_query(self, index, query):
        """Update the documents matching ``query``.

        :param index: index name.
        :param query: request body passed as ``body``.
        :return: the ``updated`` field of the response, or ``0`` on failure.
        """
        try:
            response = self.__elasticsearch.update_by_query(index=index, body=query)
            return response.get('updated')
        except Exception as e:
            # Previously swallowed silently; report it like the other methods.
            print(str(e))
        return 0

    def insert_bulk(self, data_lst):
        """Bulk-index ``data_lst`` through ``helpers.bulk`` (60s request timeout).

        :param data_lst: iterable of bulk actions handed to ``helpers.bulk``.
        :return: whatever ``helpers.bulk`` returns, or ``[0, []]`` on failure.
        """
        try:
            return helpers.bulk(self.__elasticsearch, data_lst, request_timeout=60)
        except Exception as e:
            # Previously swallowed silently; report it like the other methods.
            print(str(e))
        return [0, []]
if __name__ == '__main__':
    # Demo: walk every document of an index with the scroll API and print
    # the ``url`` field of each hit.
    es_hosts = ["IP:PORT"]
    baike_all_index = 'baike_index'
    baike_all_type = 'baike_all'
    elastic_service = ElasticsearchService(es_hosts)
    # Initial query: ``size`` in the body sets the batch size.  The response
    # carries the matching documents plus a ``_scroll_id``, which can be
    # thought of as the starting position for the next request.
    res = elastic_service.search_scroll(
        baike_all_index,
        baike_all_type,
        {"query": {"match_all": {}}, "_source": ["url"], "size": 10000}
    )
    # ``res`` is empty when a request failed, which ends the loop.
    while res:
        # Loop on the current page, not on ``hits.total``: the total is the
        # overall match count, so it never signals that the scroll is done.
        page = (res.get('hits') or {}).get('hits') or []
        if not page:
            break
        for hit in page:
            print(hit['_source']['url'])  # pull out the field this demo cares about
        scroll_id = res.get('_scroll_id')
        if not scroll_id:
            break
        # Every follow-up request must carry the scroll id from the previous
        # response.
        res = elastic_service.scroll_scan(scroll='1m', scroll_id=scroll_id)
elasticsearch 遍历索引数据
猜你喜欢
转载自blog.csdn.net/sslfk/article/details/132429803
今日推荐
周排行