Elasticsearch Rest-Client Tutorial

Elasticsearch curl Commands

-XGET specifies the HTTP request method

-d sends the parameters POST-style, in the request body

?pretty=true pretty-prints the returned JSON

curl -XGET http://localhost:9200/_cluster/health?pretty  -- check the health of the Elasticsearch cluster

curl -XGET http://localhost:9200/  -- show basic information about the instance

curl -XGET http://localhost:9200/_cluster/nodes/  -- get information about the nodes in the cluster

curl -XPOST http://localhost:9200/_cluster/nodes/_shutdown  -- shut down the whole cluster

curl -XPOST http://localhost:9200/_cluster/nodes/aaaa/_shutdown  -- shut down the node named aaaa

curl -XPOST http://localhost:9200/test  -- create an index named test

curl -XDELETE http://localhost:9200/test  -- delete the index named test

curl -XGET 'http://10.10.110.2:19200/benlaitest/_search?pretty=true' -d '{"query":{"multi_match":{"query":"法国","fields":["firstname","lastname"]}}}'  -- search documents (match against firstname and lastname)

curl http://10.10.110.160:9200/benlaitest/_analyze?analyzer=standard -d '我爱你中国'  -- analyze text with the standard analyzer

APIs to run from Postman:
http://10.10.110.160:9200/_cat/indices?v  -- GET request, lists the indices

http://10.10.110.160:9200/benlaitest/_analyze?analyzer=standard  -- inspect how text is tokenized

Introduction to the DSL

Elasticsearch provides a rich and flexible query language called the Query DSL, which lets you build far more complex and powerful queries. The DSL (Domain Specific Language) takes the form of a JSON request body.

Basic DSL Usage

  • Query all products:
GET /product_index/product/_search
{
  "query": {
    "match_all": {}
  }
}
  • Query products whose name contains milk, sorted by price in descending order:
GET /product_index/product/_search
{
 "query": {
   "match": {
     "product_name": "milk"
   }
 },
 "sort": [
   {
     "price": "desc"
   }
 ]
}
  • Paged query that returns only selected fields:
GET /product_index/product/_search
{
  "query": {
    "match_all": {}
  },
  "_source": [
    "product_name",
    "price"
  ],
  "from": 0, ## 从第几个商品开始查,最开始是 0
  "size": 1  ## 要查几个结果
}
  • range queries, for numeric or date intervals:
GET /product_index/product/_search
{
  "query": {
    "range": {
      "price": {
        "gte": 30.00
      }
    }
  }
}

Combining Multiple Search Conditions (most common)

  • bool can contain: must (must match, like = in a database), must_not (must not match,
    like !=), should (optional match, like or in a database), and filter (filters without scoring)
GET /product_index/product/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "match": {
            "product_name": "pure milk"
          }
        }
      ],
      "should": [
        {
          "match": {
            "product_desc": "常温"
          }
        }
      ],
      "must_not": [
        {
          "match": {
            "product_name": "蒙牛"
          }
        }
      ],
      "filter": {
        "range": {
          "price": {
            "gte": 33.00
          }
        }
      }
    }
  }
}     

Rest-Client

Elastic is developing a high-level client that works on top of the REST client and lets you send DSL queries and more. The case studies below use the low-level REST client directly.
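As a minimal sketch (the host, index, and 5.x-style client API are assumptions based on the code in this article, not a definitive setup), this is how a low-level RestClient can be built and used to send the bool query from the previous section:

import java.util.Collections;

import org.apache.http.HttpHost;
import org.apache.http.entity.ContentType;
import org.apache.http.nio.entity.NStringEntity;
import org.apache.http.util.EntityUtils;
import org.elasticsearch.client.Response;
import org.elasticsearch.client.RestClient;

public class RestClientDemo {

    public static void main(String[] args) throws Exception {
        // Build a low-level client against a single (hypothetical) node
        RestClient restClient = RestClient.builder(
                new HttpHost("localhost", 9200, "http")).build();
        try {
            // The bool query from the previous section as a JSON request body
            String dsl = "{\"query\":{\"bool\":{"
                    + "\"must\":[{\"match\":{\"product_name\":\"pure milk\"}}],"
                    + "\"filter\":{\"range\":{\"price\":{\"gte\":33.00}}}}}}";
            NStringEntity entity = new NStringEntity(dsl, ContentType.APPLICATION_JSON);
            // Same performRequest signature that the utility class below relies on
            Response response = restClient.performRequest(
                    "GET", "/product_index/product/_search",
                    Collections.emptyMap(), entity);
            System.out.println(EntityUtils.toString(response.getEntity()));
        } finally {
            restClient.close();
        }
    }
}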

Case Studies

  1. Requirement: assemble the raw crawler data into bulk DSL statements and write it to es in batches of 5000 documents (see the note on the bulk body format after the code)
	// Save raw crawler data
	public void saveCrawlerInstanceData(List<CrawlerInstanceData> list) {
		// Bulk insert
		try {
			String pre = "raw crawler data";

			StringBuilder bulkRequestBody = new StringBuilder();
			// 1-based counter over the list
			int count = 1;
			for (CrawlerInstanceData data : list) {

				// Guard against per-document failures
				try {

					Map map = BeanUtil.transBean2Map(data);
					map.put("ctime", data.getCtime().getTime());
					// Index name
					String esIndex = getESIndex(4);

					String requestJson = JSON.toJSONString(map, WriteMapNullValue);
					// Bulk action-metadata line: target index, type, and document id
					String actionMetaData = String.format("{ \"index\" : { \"_index\" : \"%s\", \"_type\" : \"%s\" ,\"_id\" : \"%s\"} }%n",
							indexPrefix + esIndex, "crawlerdata", data.getId());
					// Append the action line, the source line, and the newline _bulk requires
					bulkRequestBody.append(actionMetaData);
					bulkRequestBody.append(requestJson);
					bulkRequestBody.append("\n");

					String esPath = String.format("/%s/%s/%s", indexPrefix + esIndex, "crawlerdata", "_bulk");

					// Flush every 5000 documents, and once more for the final partial batch
					if (count % 5000 == 0 || count == list.size()) {
						String resultJson = RestClientUtil.getESDtats(restClient, bulkRequestBody.toString(), esPath, "POST");
						if (StringUtils.isBlank(resultJson)) {
							logger.error("{} es POST bulk insert failed ", pre);
							throw new Exception("es POST bulk insert failed");
						} else {
							bulkRequestBody = new StringBuilder();
						}
					}
					count++;
				} catch (Exception e) {
					logger.error("failed to add document to es", e);
				}

			}
		} catch (Exception e) {
			logger.error("failed to save raw crawler data to es", e);
		}
	}
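For reference, the body assembled above follows the NDJSON format that _bulk expects: one action-metadata line followed by one source line per document, with a trailing newline at the end of the request. With a made-up index name and values, it looks like this:

{ "index" : { "_index" : "crawler_20181101", "_type" : "crawlerdata", "_id" : "1" } }
{ "crawler_instance_id": 100, "crawler_cnt": 50, "update_cnt": 48, "ctime": 1541001600000 }
{ "index" : { "_index" : "crawler_20181101", "_type" : "crawlerdata", "_id" : "2" } }
{ "crawler_instance_id": 100, "crawler_cnt": 60, "update_cnt": 59, "ctime": 1541001700000 }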

Wrapping DSL execution in a RestClient utility

public static String getESDtats(RestClient restClient, String sql, String esPath, String requestType) {
        if (null != restClient && !StringUtil.isBlank(sql) && !StringUtil.isBlank(esPath)) {
            HttpEntity entity = new NStringEntity(sql, ContentType.APPLICATION_JSON);
            String result = null;

            try {
                // Default to GET when no request type is given
                Response indexResponse = restClient.performRequest(null == requestType ? "GET" : requestType, esPath, Collections.emptyMap(), entity, new Header[0]);
                result = EntityUtils.toString(indexResponse.getEntity());
            } catch (IOException e) {
                LOGGER.error(ExceptionUtils.getStackTrace(e));
            }

            return result;
        } else {
            return null;
        }
    }
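A hypothetical call site (the index path and query body are made up for illustration):

// Fetch only the hit count of a hypothetical index with a match_all search
String body = "{\"size\":0,\"query\":{\"match_all\":{}}}";
String json = RestClientUtil.getESDtats(restClient, body, "/crawler_20181101/crawlerdata/_search", "GET");
if (json != null) {
    System.out.println(json);
}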
  2. Requirement: for the crawler instance identified by crawler_instance_id, aggregate the number of successfully crawled records crawler_cnt and the number actually converted into business data update_cnt
	public Map<String, Object> lastTaskDataCount(String crawlerInstanceId) {

		String esIndex = getESIndex(4);

		// Assemble the DSL JSON
		StringBuilder requestBody = new StringBuilder();
		requestBody.append("{\"size\":10,\"_source\":[\"\"],\"from\": 0,");
		requestBody.append("\"query\":{\"term\":{\"crawler_instance_id\":").append(crawlerInstanceId).append("}},");
		// Sum of records crawled and of records converted to business data
		requestBody.append("\"aggs\": {\"crawlTotalAgg\": {\"sum\": {\"field\":\"crawler_cnt\"}},");
		requestBody.append("\"updateTotalAgg\":{ \"sum\": {\"field\":\"update_cnt\"}}}}");

		// es path
		String esPath = String.format("/%s/%s/%s", indexPrefix + esIndex, "crawlerdata", "_search");
		String result = RestClientUtil.getESDtats(restClient, requestBody.toString(), esPath, "GET");

		Map<String, Object> map = new HashMap<String, Object>();

		JSONObject jsonObject = JSONObject.parseObject(result);
		if (null != jsonObject) {
			String aggregations = jsonObject.getString("aggregations");
			JSONObject aggregationsObject = JSONObject.parseObject(aggregations);
			if (aggregationsObject != null) {
				map.put("crawl_total",  (int)Double.parseDouble(JSONObject.parseObject(aggregationsObject.getString("crawlTotalAgg")).getString("value")));
				map.put("update_total", (int)Double.parseDouble(JSONObject.parseObject(aggregationsObject.getString("updateTotalAgg")).getString("value")));
			}
			JSONObject hitsObject = JSONObject.parseObject(String.valueOf(jsonObject.get("hits")));
			if (null != hitsObject) {
				map.put("data_total",Integer.parseInt(String.valueOf(hitsObject.get("total"))));
			}
		}

		return map;
	}

Aggregation DSL: _source is left empty so that only the aggregation results are returned, which speeds up the query.

{
	"size": 10,
	"_source": [""],
	"from": 0,
	"query": {
		"term": {
			"crawler_instance_id": crawler_instance_id
		}
	},
	"aggs": {
		"crawlTotalAgg": {
			"sum": {
				"field": "crawler_cnt"
			}
		},
		"updateTotalAgg": {
			"sum": {
				"field": "update_cnt"
			}
		}
	}
}
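The Java code above unpacks the response by hand; the parts it touches have roughly this shape (values are made up):

{
	"hits": {
		"total": 3
	},
	"aggregations": {
		"crawlTotalAgg": {
			"value": 150.0
		},
		"updateTotalAgg": {
			"value": 147.0
		}
	}
}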
  3. Requirement: a multi-level aggregation that, for a given site media_id and crawler type crawler_code, reports for each of the last 7 crawler instances crawler_instance_id the number of successfully crawled records crawler_cnt and the number actually converted into business data update_cnt
	// Data for the last 7 crawler runs
	public List<Map<String, Object>> lastSevenTaskCount(String crawlerCode, String mediaId) {

		String esIndex = getESIndex(4);


		List<Map<String, Object>> reList = new ArrayList<Map<String, Object>>();

		// Assemble the DSL JSON
		StringBuilder requestBody = new StringBuilder();
		requestBody.append("{\"size\":7,\"_source\":[\"id\"],\"from\": 0,");
		requestBody.append("\"query\":{\"bool\":{\"must\":[{\"term\":{\"crawler_code.keyword\":\"").append(crawlerCode).append("\"}},{");
		requestBody.append("\"term\": {\"media_id\":").append(mediaId).append("}}]}},");
		requestBody.append("\"aggs\": {\"crawler_instance_id_agg\": {\"terms\": {\"field\": \"crawler_instance_id\",\"size\":7,\"order\":{\"_term\": \"desc\"}},");
		requestBody.append("\"aggs\": {\"crawler_code_agg\": {\"terms\": {\"field\":\"crawler_code\"},");
		requestBody.append("\"aggs\": {\"media_id_agg\": {\"terms\": {\"field\":\"media_id\"},");
		// Sum of records converted and of records crawled
		requestBody.append("\"aggs\": {\"update_sum_agg\": {\"sum\": {\"field\": \"update_cnt\"}},\"crawler_cnt_agg\": {\"sum\": {\"field\": \"crawler_cnt\"").append("}}}}}}}}},");
		requestBody.append("\"sort\": [{\"id\": {\"order\": \"desc\"}}]}");
		// es path
		String esPath = String.format("/%s/%s/%s", indexPrefix + esIndex, "crawlerdata", "_search");
		String result = RestClientUtil.getESDtats(restClient, requestBody.toString(), esPath, "GET");
		List<Map> crawlerInstanceList = RestClientUtil.getAggregationsListByResult(result,"crawler_instance_id_agg");

		crawlerInstanceList.stream().forEach(x->{
			JSONObject crawlerCodeObject = (JSONObject)x.get("crawler_code_agg");
			List<Map> crawlerCodeList =bucketsObject(crawlerCodeObject);
			Map<String, Object> map = new HashMap<String, Object>();
			map.put("crawler_instance_id", x.get("key"));
			crawlerCodeList.stream().forEach(y->{
				map.put("crawler_code", y.get("key"));
				JSONObject media = (JSONObject)y.get("media_id_agg");
				List<Map> mediaList =bucketsObject(media);
				mediaList.stream().forEach(z->{
					JSONObject updateSum = (JSONObject)z.get("update_sum_agg");

					map.put("update_sum",  (int)Double.parseDouble(updateSum.getString("value")));
					JSONObject crawlTotal = (JSONObject)z.get("crawler_cnt_agg");
					map.put("crawl_total", (int)Double.parseDouble(crawlTotal.getString("value")));

				});
			});
			reList.add(map);
		});
		return reList;
	}

The aggregation DSL

{
	"size": 7,
	"_source": ["id"],
	"from": 0,
	"query": {
		"bool": {
			"must": [{
				"term": {
					"crawler_code.keyword": "新闻"//爬虫类型
				}
			}, {
				"term": {
					"media_id": 4//网站编号
				}
			}]
		}
	},
	"aggs": {
		"crawler_instance_id_agg": {
			"terms": {
				"field": "crawler_instance_id",
				"size": 7,
				"order": {
					"_term": "desc"
				}
			},
			"aggs": {
				"crawler_code_agg": {
					"terms": {
						"field": "crawler_code"
					},
					"aggs": {
						"media_id_agg": {
							"terms": {
								"field": "media_id"
							},
							"aggs": {
								"update_sum_agg": {
									"sum": {
										"field": "update_cnt"
									}
								},
								"crawler_cnt_agg": {
									"sum": {
										"field": "crawler_cnt"
									}
								}
							}
						}
					}
				}
			}
		}
	},
	"sort": [{
		"id": {
			"order": "desc"
		}
	}]
}

An additional range clause such as {"range": {"update_cnt": {"gt": 0}}} can also be appended to the must array to keep only runs that actually produced data.
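The helpers getAggregationsListByResult and bucketsObject used above unwrap the nested terms buckets of the response, which has roughly this shape (values are made up):

{
	"aggregations": {
		"crawler_instance_id_agg": {
			"buckets": [{
				"key": 1024,
				"doc_count": 1,
				"crawler_code_agg": {
					"buckets": [{
						"key": "新闻",
						"doc_count": 1,
						"media_id_agg": {
							"buckets": [{
								"key": 4,
								"doc_count": 1,
								"update_sum_agg": {
									"value": 59.0
								},
								"crawler_cnt_agg": {
									"value": 60.0
								}
							}]
						}
					}]
				}
			}]
		}
	}
}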
  4. Requirement: migrate a large volume of Elasticsearch data using the scroll API. With ordinary from/size paging in the DSL, from becomes extremely slow, or even errors out, once you page into the tens of millions of documents; scroll avoids this by keeping a cursor on the server (a sketch of the full migration loop follows the code)
	// Build the term query on the configured field/value pair
    public Map<String, Object> getEsQueryInfo() {

        Map<String, Object> infoMap = new HashMap<>(16);

        StringBuilder requestBody = new StringBuilder();
        requestBody.append("{\"size\":\"").append(esSize).append("\",");
        requestBody.append("\"query\": {\"bool\":{\"must\":{\"term\":{\"").append(field).append("\":\"").append(value);
        requestBody.append("\"}}}}}");
        infoMap.put("esQuery", requestBody.toString());
        infoMap.put("oldEsUrl", String.format("/%s/_search?scroll=%s", oldEsUrl, esScrollTime));

        LOGGER.info("configuration {}", JSON.toJSON(infoMap));

        return infoMap;
    }

    // Build the range query (gt over the configured field)
    public Map<String, Object> getRangeEsQueryInfo(long time) {

        Map<String, Object> infoMap = new HashMap<>(16);

        StringBuilder requestBody = new StringBuilder();
        requestBody.append("{\"size\":\"").append(esSize).append("\",");
        // range needs a field name to be valid; use the configured field here
        requestBody.append("\"query\": {\"bool\":{\"must\":{\"range\":{\"").append(field).append("\":{\"gt\":").append(time);
        requestBody.append("}}}}}}");

        infoMap.put("esQuery", requestBody.toString());
        infoMap.put("oldEsUrl", String.format("/%s/_search?scroll=%s", oldEsUrl, esScrollTime));

        LOGGER.info("configuration {}", JSON.toJSON(infoMap));

        return infoMap;
    }
	// Query es: initial search first, then scroll pages
    public List<Map> query( Map<String, Object> infoMap) {

        List<Map> resultList = new ArrayList<>();
        String esQuery = MapUtils.getString(infoMap, "esQuery");
        String oldEsUrl = MapUtils.getString(infoMap, "oldEsUrl");

        // Continue an existing scroll if we already have a scroll id
        if (StringUtils.isNotBlank(esScrollId)) {

            String scrollQueryUrl = "/_search/scroll";
            String scrollQuerySql = String.format("{\"scroll\":\"%s\",\"scroll_id\":\"%s\"}", esScrollTime, esScrollId);
            LOGGER.info("es scroll query sql:{} path:{} ", scrollQuerySql, scrollQueryUrl);
            String scrollDateResult = RestClientUtil.getESDtats(restClient, scrollQuerySql, scrollQueryUrl, "GET");
            resultList = RestClientUtil.getHitWithIdListByResult(scrollDateResult);
            if (resultList.size()==0) {
                esScrollId = "";
            }
        }
        else {
            LOGGER.info("es initial query sql:{} path:{}", esQuery, oldEsUrl);
            String dateResult = RestClientUtil.getESDtats(restClient, esQuery, oldEsUrl, "GET");
            resultList = RestClientUtil.getHitWithIdListByResult(dateResult);
            if (resultList != null && !resultList.isEmpty()) {
                // Remember the scroll id for the follow-up scroll requests
                esScrollId = RestClientUtil.getScrollIdByResult(dateResult);
            }
        }

        return resultList;
    }

    // Bulk-index the documents into the new cluster
    public void save(List<Map> list) {
        String pre = "es data migration";
        // 1-based counter over the list
        int count = 1;
        StringBuilder bulkRequestBody = new StringBuilder();
        for (Map<String, Object> map : list) {

            String actionMetaData = String.format("{ \"index\" : { \"_index\" : \"%s\", \"_type\" : \"%s\" ,\"_id\" : \"%s\"} }%n",
                    newEsUrl, type, map.get("_id"));
            map.remove("_id");
            map.remove("_type");
            map.remove("_index");
            String requestJson = JSON.toJSONString(map, WriteMapNullValue);
            // Append the action line, the source line, and the newline _bulk requires
            bulkRequestBody.append(actionMetaData);
            bulkRequestBody.append(requestJson);
            bulkRequestBody.append("\n");

            String esPath = String.format("/%s/%s/%s", newEsUrl,type, "_bulk");

            // Flush every 5000 documents, and once more for the final partial batch
            if (count % 5000 == 0 || count == list.size()) {
                String resultJson = RestClientUtil.getESDtats(newRestClient, bulkRequestBody.toString(), esPath, "PUT");
                if (StringUtils.isBlank(resultJson)) {
                    LOGGER.error("{} es PUT bulk insert failed ", pre);
                } else {
                    bulkRequestBody = new StringBuilder();
                }
            }
            count++;
        }
    }
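A minimal driver for the migration, assuming the three methods above live on the same bean (the method name migrate is made up):

    // Drive the scroll migration: the first query() call runs the initial
    // search, later calls page through the scroll until it is exhausted
    public void migrate() {
        Map<String, Object> info = getEsQueryInfo();
        List<Map> batch = query(info);
        while (batch != null && !batch.isEmpty()) {
            save(batch);          // bulk-write this batch into the new cluster
            batch = query(info);  // query() reuses esScrollId internally
        }
    }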

Summary

NodeClient and TransportClient are the basic Java clients; they cannot be used from Perl, Python, Ruby, and so on. The REST HTTP client exposes the same functionality over plain HTTP, so any language with a REST client can work with ES.
Our crawler project previously used TransportClient to operate ES. A recent business requirement moved us to Alibaba Cloud ES, and to make that switch, ease future migrations, and stay compatible with newer ES versions, we began operating ES with the RestClient and raw DSL statements. That is how I started learning ES, picking things up bit by bit as the project demanded. Sharing these notes and recording my own progress is a pleasure in itself; please forgive the article's shortcomings, and I hope we can improve together.

Reposted from blog.csdn.net/qq_34462387/article/details/83628971