First, here are the pom.xml dependencies you will need (adjust the versions to match your own environment):
<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.7</maven.compiler.source>
    <maven.compiler.target>1.7</maven.compiler.target>
    <spark.version>2.3.2</spark.version>
</properties>
<dependencies>
    <!-- Dependency management: children inherit these selectively. -->
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.10</version>
    </dependency>
    <!-- NOTE(review): transport 6.2.0 vs elasticsearch-hadoop 6.2.4 below —
         consider aligning both to the same 6.2.x patch version. -->
    <dependency>
        <groupId>org.elasticsearch.client</groupId>
        <artifactId>transport</artifactId>
        <version>6.2.0</version>
    </dependency>
    <!-- Fix: the Spark artifacts below carry the _2.11 suffix (built for
         Scala 2.11), so the Scala library must be a 2.11.x release —
         2.10.3 is binary-incompatible with them. -->
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-library</artifactId>
        <version>2.11.8</version>
    </dependency>
    <dependency>
        <groupId>org.json</groupId>
        <artifactId>json</artifactId>
        <version>20180813</version>
    </dependency>
    <dependency>
        <groupId>org.elasticsearch</groupId>
        <artifactId>elasticsearch-hadoop</artifactId>
        <version>6.2.4</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
</dependencies>
1. Create the ES programming entry point
This section provides a utility class that creates the ES client entry point by reading a configuration file.
# elasticsearch.conf   (filename must match what the code loads below)
elastic.host=192.168.130.131
elastic.port=9300
elastic.cluster.name=zzy-application
#Constants
/**
 * Property keys used to read the Elasticsearch connection settings
 * from the elasticsearch.conf file (see ElasticSearchUtil).
 */
public interface Constants {
// Key for the ES node host/IP (e.g. 192.168.130.131 in the sample config).
String ELASTIC_HOST = "elastic.host";
// Key for the ES transport port (9300 in the sample config).
String ELASTIC_PORT="elastic.port";
// Key for the cluster name; must match the remote cluster's configured name.
String ELASTIC_CLUSTER_NAME = "elastic.cluster.name";
}
#ElasticSearchUtil
import com.zy.es.constant.Constants;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.transport.client.PreBuiltTransportClient;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetAddress;
import java.util.Properties;
/**
 * Utility that builds a shared {@link TransportClient} from the
 * "elasticsearch.conf" properties file on the classpath.
 *
 * Utility classes like this are usually singletons whose methods are static.
 *
 * If the configured cluster name does not match the real cluster, requests
 * later fail with:
 * NoNodeAvailableException[None of the configured nodes are available: ...]
 */
public class ElasticSearchUtil {
    private static TransportClient client;
    private static Properties ps;

    static {
        // Load connection settings once, when the class is first used.
        // try-with-resources fixes the original's InputStream leak.
        try (InputStream in = ElasticSearchUtil.class.getClassLoader()
                .getResourceAsStream("elasticsearch.conf")) {
            if (in == null) {
                // Fix: the original would NPE inside Properties.load() here.
                throw new IOException("elasticsearch.conf not found on classpath");
            }
            ps = new Properties();
            ps.load(in);
            String host = ps.getProperty(Constants.ELASTIC_HOST);
            int port = Integer.parseInt(ps.getProperty(Constants.ELASTIC_PORT));
            String clusterName = ps.getProperty(Constants.ELASTIC_CLUSTER_NAME);
            // cluster.name must match the remote cluster or the connection fails.
            Settings settings = Settings.builder()
                    .put("cluster.name", clusterName)
                    .build();
            client = new PreBuiltTransportClient(settings);
            // Several addresses can be registered here for a multi-node cluster:
            // addTransportAddresses(TransportAddress... transportAddress) is varargs.
            TransportAddress ta = new TransportAddress(
                    InetAddress.getByName(host),
                    port
            );
            client.addTransportAddresses(ta);
        } catch (IOException e) {
            // Leaves `client` null; getTransportClient() will then return null.
            e.printStackTrace();
        }
    }

    /** Returns the shared client (null if static initialization failed). */
    public static TransportClient getTransportClient() {
        return client;
    }

    /** Closes the given client if non-null; safe to call with null. */
    public static void close(TransportClient client) {
        if (client != null) {
            client.close();
        }
    }
}
2. Create an index
Four ways of creating documents are shown below: JSON, Map, JavaBean, and XContentBuilder.
import java.util
import com.zy.es.pojo.Book
import com.zy.es.utils.ElasticSearchUtil
import org.elasticsearch.action.index.IndexResponse
import org.elasticsearch.cluster.metadata.MetaData.XContentContext
import org.elasticsearch.common.xcontent.{XContentBuilder, XContentType}
import org.elasticsearch.common.xcontent.json.JsonXContent
import org.json.JSONObject
object createIndex {
  // Target index (database-like) and type (table-like) for every request below.
  // Fix: these never change, so they are vals, not vars.
  private val index = "library"
  private val `type` = "books"
  // Shared transport client built by the utility class from elasticsearch.conf.
  private val client = ElasticSearchUtil.getTransportClient()

  def main(args: Array[String]): Unit = {
    createIndexByJson()
    //createIndexByMap()
    //createIndexByBean()
    //createIndexByXContentBuilder()
    // Release the ES connection object.
    ElasticSearchUtil.close(client)
  }

  /**
   * 1. Create a document from a JSON string.
   *
   * On ES 5.x and later, setSource(json.toString, XContentType.JSON) MUST
   * include the second argument; a bare string triggers:
   * java.lang.IllegalArgumentException: The number of object passed must be even but was [1]
   */
  def createIndexByJson(): Unit = {
    val json = new JSONObject
    json.put("name", "我爱你中国")
    json.put("author", "周迅")
    json.put("date", "2018-6-6")
    // Index the document and report the resulting version.
    val response: IndexResponse = client.prepareIndex(index, `type`, "9")
      .setSource(json.toString, XContentType.JSON).get()
    println(response.getVersion)
  }

  /**
   * 2. Create a document from a java.util.Map.
   */
  def createIndexByMap(): Unit = {
    val sourceMap = new util.HashMap[String, String]()
    sourceMap.put("name", "朝花夕拾")
    sourceMap.put("author", "鲁迅")
    sourceMap.put("date", "2009-4-5")
    val response: IndexResponse = client.prepareIndex(index, `type`, "2")
      .setSource(sourceMap)
      .get()
    println(response.getVersion)
  }

  /**
   * 3. Create a document from a plain JavaBean, serialized through org.json.
   */
  def createIndexByBean(): Unit = {
    val book: Book = new Book("斗破苍穹", "天蚕土豆", "2012-2-6")
    val json = new JSONObject(book)
    val response: IndexResponse = client.prepareIndex(index, `type`, "3")
      .setSource(json.toString, XContentType.JSON).get()
    println(response.getVersion)
  }

  /**
   * 4. Create a document with an XContentBuilder.
   */
  def createIndexByXContentBuilder(): Unit = {
    val builder: XContentBuilder = JsonXContent.contentBuilder()
    builder.startObject()
      .field("name", "西游记")
      .field("author", "吴承恩")
      .field("version", "1.0")
      .endObject()
    val response: IndexResponse = client.prepareIndex(index, `type`, "4")
      .setSource(builder)
      .get()
    println(response.getVersion)
  }
}
3. Delete, update, and bulk operations
This section shows deleting data, updating data, and batch operations.
import java.util
import com.zy.es.utils.ElasticSearchUtil
import org.elasticsearch.action.bulk.BulkResponse
import org.elasticsearch.action.delete.DeleteResponse
import org.elasticsearch.action.update.{UpdateRequestBuilder, UpdateResponse}
import org.elasticsearch.common.xcontent.{XContentBuilder, XContentType}
import org.elasticsearch.common.xcontent.json.JsonXContent
import org.json.JSONObject
object ElasticsearchCRUD {
  // Fix: constants were declared as vars; they never change, so use vals.
  private val index = "library"
  private val `type` = "books"
  private val client = ElasticSearchUtil.getTransportClient()

  def main(args: Array[String]): Unit = {
    // Delete a document.
    testDelete()
    // Update a document.
    //testUpdate()
    // Bulk operations.
    //testBulk()
    // Release the connection object.
    ElasticSearchUtil.close(client)
  }

  /** Delete the document with id "2" and print the resulting version. */
  def testDelete(): Unit = {
    val response: DeleteResponse = client.prepareDelete(index, `type`, "2").get()
    println("version:" + response.getVersion)
  }

  /** Partially update document id "4": setDoc merges only the given fields. */
  def testUpdate(): Unit = {
    val builder: XContentBuilder = JsonXContent.contentBuilder()
    builder.startObject()
      .field("version", "3.0")
      .endObject()
    val response: UpdateResponse = client.prepareUpdate(index, `type`, "4")
      .setDoc(builder).get()
    println("version:" + response.getVersion)
  }

  /** Bulk-index two documents (one from a Map, one from JSON) in one request. */
  def testBulk(): Unit = {
    val map = new util.HashMap[String, String]()
    map.put("name", "无双")
    map.put("author", "周润发")
    map.put("version", "2")
    val json = new JSONObject
    json.put("name", "红楼梦")
    json.put("author", "曹雪芹")
    json.put("version", "1.0")
    val responses: BulkResponse = client.prepareBulk()
      .add(client.prepareIndex(index, `type`, "7").setSource(map))
      .add(client.prepareIndex(index, `type`, "8").setSource(json.toString(), XContentType.JSON))
      .get()
    // Print the version of every item in the bulk response.
    for (response <- responses.getItems) {
      print(response.getVersion)
    }
  }
}
4. Full-text search, paged search, and highlighting
import java.util
import com.zy.es.utils.ElasticSearchUtil
import org.elasticsearch.action.search.{SearchResponse, SearchType}
import org.elasticsearch.index.query.QueryBuilders
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder
import org.elasticsearch.search.{SearchHit, SearchHits}
import org.json.JSONObject
import scala.collection.JavaConversions
object testSearch {
  // Fix: constants were vars; they never change, so use vals.
  private val index = "library"
  private val `type` = "books"
  private val client = ElasticSearchUtil.getTransportClient()

  def main(args: Array[String]): Unit = {
    // Full-text search.
    //fullTextSearch()
    // Paged search.
    //pagingSearch()
    // Highlighted search.
    highlightSearch()
    // Fix: the original never released the client; close it like the other demos.
    ElasticSearchUtil.close(client)
  }

  /** Full-text match query on "author"; prints each hit as a JSON document. */
  def fullTextSearch(): Unit = {
    val response = client.prepareSearch(index)
      .setSearchType(SearchType.DEFAULT) // search type
      .setQuery(QueryBuilders.matchQuery("author", "天蚕土豆")) // match query
      .get()
    val hits = response.getHits // the search result
    println("totals:" + hits.getTotalHits)  // number of matching documents
    println("maxSource" + hits.getMaxScore) // best score among the hits
    // Print the concrete content of every hit.
    for (hit <- hits.getHits) {
      // Fix: build a fresh JSONObject per hit instead of mutating one shared
      // instance across iterations (same output here, but no field leakage
      // if the hit schema ever varies).
      val json = new JSONObject()
      json.put("_index", hit.getIndex)
      json.put("_id", hit.getId)
      json.put("_type", hit.getType)
      json.put("_score", hit.getScore)
      json.put("_source", new JSONObject(hit.getSourceAsString))
      println(json.toString())
    }
  }

  /**
   * Paged search: page `num` with `count` rows corresponds to
   * from = pageSize * (num - 1), size = count.
   */
  def pagingSearch(from: Int = 0, size: Int = 10): Unit = {
    val response: SearchResponse = client.prepareSearch(index)
      .setSearchType(SearchType.QUERY_THEN_FETCH)
      .setQuery(QueryBuilders.matchQuery("name", "西游记"))
      .setFrom(from)
      .setSize(size)
      .get()
    val myhits: SearchHits = response.getHits
    val total = myhits.totalHits
    println("zzy为您查询出" + total + "记录:")
    val hits: Array[SearchHit] = myhits.getHits
    for (hit <- hits) {
      val map: util.Map[String, AnyRef] = hit.getSourceAsMap
      val author = map.get("author")
      val name = map.get("name")
      val version = map.get("version")
      print(
        s"""
           |author:${author}
           |name:${name}
           |version:${version}
        """.stripMargin)
    }
  }

  /** Highlighted search: wraps matches in "author" with a font tag. */
  def highlightSearch(): Unit = {
    val response = client.prepareSearch(index)
      .setSearchType(SearchType.DEFAULT)
      .setQuery(QueryBuilders.matchQuery("author", "周润发"))
      .highlighter(new HighlightBuilder()
        .field("author")                          // field to highlight
        .preTags("<font color='red' size='20px'>") // opening tag
        .postTags("</font>"))                      // closing tag
      .get()
    val myHits = response.getHits
    val total = myHits.totalHits
    println("zzy为您查询出" + total + "记录:")
    val hits: Array[SearchHit] = myHits.getHits
    for (hit <- hits) {
      // Highlighted text is only available through getHighlightFields,
      // not through the plain source.
      val hlFields = hit.getHighlightFields
      // NOTE(review): JavaConversions is deprecated; consider
      // scala.collection.JavaConverters (.asScala) when upgrading.
      for ((field, highlight) <- JavaConversions.mapAsScalaMap(hlFields)) {
        // Concatenate all highlighted fragments of this field.
        val text = highlight.getFragments.map(_.toString).mkString
        print(text)
      }
    }
  }
}
5. Chinese word segmentation
(1) A failing demo
First, add some data of our own to the ES cluster:
#创建索引库
curl -H "Content-Type: application/json" -XPUT 'http://192.168.130.131:9200/chinese'
#添加数据
curl -H "Content-Type: application/json" -XPOST http://192.168.130.131:9200/chinese/fulltext/1 -d'{"content":"美国留给伊拉克的是个烂摊子吗"}'
curl -H "Content-Type: application/json" -XPOST http://192.168.130.131:9200/chinese/fulltext/2 -d'{"content":"公安部:各地校车将享最高路权"}'
curl -H "Content-Type: application/json" -XPOST http://192.168.130.131:9200/chinese/fulltext/3 -d'{"content":"中韩渔警冲突调查:韩警平均每天扣1艘中国渔船"}'
curl -H "Content-Type: application/json" -XPOST http://192.168.130.131:9200/chinese/fulltext/4 -d'{"content":"中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"}'
# Then run a query and look at the results:
import com.zy.es.utils.ElasticSearchUtil
import org.elasticsearch.action.search.{SearchResponse, SearchType}
import org.elasticsearch.index.query.QueryBuilders
object ChineseParticipleSearch {
  // Fix: constants were vars; they never change, so use vals.
  private val index = "chinese"
  private val `type` = "fulltext"
  private val client = ElasticSearchUtil.getTransportClient()

  /** Match-query "中国" against the content field and print every hit's source. */
  def main(args: Array[String]): Unit = {
    val response: SearchResponse = client.prepareSearch(index)
      .setSearchType(SearchType.QUERY_THEN_FETCH)
      .setQuery(QueryBuilders.matchQuery("content", "中国"))
      .get()
    for (hit <- response.getHits.getHits) {
      println(hit.getSourceAsString)
    }
    // Fix: the original leaked the client; close it like the other demos do.
    ElasticSearchUtil.close(client)
  }
}
Note: we used a match query for the term "中国" (China).
Look at the results: why does the document about the United States come back as well?
This happens because the default analyzer splits "中国" into single characters at index and query time, so the query also matches documents that merely contain either character.
To retrieve only content genuinely about China, we need a Chinese analyzer (word segmenter).
(2) Configuring a Chinese analyzer in ES
Common Chinese analyzer plugins include IK and Paoding; here we use the IK analyzer.
① Download https://github.com/medcl/elasticsearch-analysis-ik — choose the release matching your ES version.
② Build the source with Maven (run in IK_HOME): mvn clean install -DskipTests
③ Copy the compiled target/releases zip into ES_HOME/plugins/analysis-ik and unzip it; if versions differ, edit the plugin-descriptor.properties and plugin-security.policy files inside to match your ES version.
④ Edit ES_HOME/config/elasticsearch.yml and add (not needed on ES 6.x and above): index.analysis.analyzer.default.type: ik
⑤ Restart the ES service.
Here we restart it the crude way:
# ps--aux | grep elasticsearch
# kill -9 pid
# / ES_HOME / bin / elasticsearch -d start
(3) Re-test
Step one: delete the data inserted earlier.
# Delete the four test documents (they were indexed under chinese/fulltext/<id>).
curl -XDELETE 'http://192.168.130.131:9200/chinese/fulltext/1'
curl -XDELETE 'http://192.168.130.131:9200/chinese/fulltext/2'
curl -XDELETE 'http://192.168.130.131:9200/chinese/fulltext/3'
curl -XDELETE 'http://192.168.130.131:9200/chinese/fulltext/4'
Step two: reload the data, configuring the IK analyzer first.
# set the IK analyzer mapping
# Apply the IK analyzer to the "content" field of chinese/fulltext.
# Fixes the garbled header and the wrongly capitalized JSON keys.
curl -XPOST http://192.168.130.131:9200/chinese/fulltext/_mapping -H 'Content-Type: application/json' -d '
{
  "properties": {
    "content": {
      "type": "text",
      "analyzer": "ik_max_word",
      "search_analyzer": "ik_max_word"
    }
  }
}'
#添加数据
curl -H "Content-Type: application/json" -XPOST http://192.168.130.131:9200/chinese/fulltext/1 -d'{"content":"美国留给伊拉克的是个烂摊子吗"}'
curl -H "Content-Type: application/json" -XPOST http://192.168.130.131:9200/chinese/fulltext/2 -d'{"content":"公安部:各地校车将享最高路权"}'
curl -H "Content-Type: application/json" -XPOST http://192.168.130.131:9200/chinese/fulltext/3 -d'{"content":"中韩渔警冲突调查:韩警平均每天扣1艘中国渔船"}'
curl -H "Content-Type: application/json" -XPOST http://192.168.130.131:9200/chinese/fulltext/4 -d'{"content":"中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"}'
Step three:
Re-run the code above and look at the results.
6. Elasticsearch on Spark
Integration prerequisites:
ES official documentation:
https://www.elastic.co/guide/en/elasticsearch/hadoop/current/install.html
Maven dependency: https://mvnrepository.com/artifact/org.elasticsearch/elasticsearch-hadoop/6.2.4
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-hadoop</artifactId>
<version>6.2.4</version>
</dependency>
// To read ES data from Spark, import the implicit conversions (org.elasticsearch.spark._).
import java.util.Date
import com.zy.es.utils.ElasticSearchUtil
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.elasticsearch.cluster.metadata.MetaData.XContentContext
import org.elasticsearch.common.xcontent.XContentType
import org.elasticsearch.spark._
/**
 * Spark + Elasticsearch integration: read documents from ES as an RDD,
 * then write the results back into another ES index.
 */
object EsOnSpark {
  private val client = ElasticSearchUtil.getTransportClient()

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("EsOnSpark")
      .setMaster("local[2]")
      .set("es.index.auto.create", "true") // auto-create the target index on write
      .set("es.nodes", "192.168.130.131")  // ES cluster node(s)
      .set("es.port", "9200")              // ES HTTP port
    val sc = new SparkContext(conf)
    // esJsonRDD yields (documentId, sourceJson) pairs for the given "index/type".
    // Fix: these were vars; they are never reassigned, so use vals.
    val esRDD: RDD[(String, String)] = sc.esJsonRDD("library/books")
    val index = "es-spark"
    val `type` = "book"
    // NOTE(review): `client` is a field of this singleton object; this works in
    // local[2] mode (single JVM), but on a real cluster each executor would
    // re-initialize the object and its client — confirm before running distributed.
    esRDD.foreach { case (id, json) =>
      // Uses the current timestamp as the new document id.
      client.prepareIndex(index, `type`, new Date().getTime.toString)
        .setSource(json, XContentType.JSON).get()
      println(id + "" + json)
    }
    sc.stop()
    // Fix: release the transport client (the original leaked it).
    ElasticSearchUtil.close(client)
  }
}
These were just some common API operations. ES's greatest strength is its query capability, so a later post will cover ElasticSearch's powerful query API in more depth.