从零学Elasticsearch系列——集成中文分词器IK

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/qq_31871785/article/details/86142572

系列文章:


参考资料:https://github.com/medcl/elasticsearch-analysis-ik

安装

[es@localhost root]$ cd /usr/elasticsearch-6.4.0/
[es@localhost elasticsearch-6.4.0]$ ./bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v6.4.0/elasticsearch-analysis-ik-6.4.0.zip
-> Downloading https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v6.4.0/elasticsearch-analysis-ik-6.4.0.zip
[=================================================] 100%  
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@     WARNING: plugin requires additional permissions     @
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
* java.net.SocketPermission * connect,resolve
See http://docs.oracle.com/javase/8/docs/technotes/guides/security/permissions.html
for descriptions of what these permissions allow and the associated risks.

Continue with installation? [y/N]y
-> Installed analysis-ik
[es@localhost elasticsearch-6.4.0]$ ll plugins/
total 0
drwxr-xr-x. 2 es es 229 Jan  3 10:04 analysis-ik

# 重新启动es的服务
[root@localhost ~]# jps
2673 Elasticsearch
46151 Jps
40921 PluginCli
[root@localhost ~]# kill -9 2673
[root@localhost ~]# cd /usr/elasticsearch-6.4.0/
[root@localhost elasticsearch-6.4.0]# su es
[es@localhost elasticsearch-6.4.0]$ bin/elasticsearch

测试

创建测试索引

PUT /news

创建类型映射

POST /news/international/_mapping
{
  "properties": {
    "content":{
      "type": "text",
      "analyzer": "ik_max_word",  # 会将文本做最细粒度的拆分
      "search_analyzer": "ik_max_word"
    }
  }
}

插入测试数据

POST /news/international/_bulk
{"index":{"_id":1}}
{"content":"美国留给伊拉克的是个烂摊子吗"}
{"index":{"_id":2}}
{"content":"公安部:各地校车将享最高路权"}
{"index":{"_id":3}}
{"content":"中韩渔警冲突调查:韩警平均每天扣1艘中国渔船"}
{"index":{"_id":4}}
{"content":"中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"}

根据关键词高亮查询

GET /news/international/_search
{
  "query": {
    "match": {
      "content": "中国"
    }
  },
  "highlight": {
    "fields": {"content": {}}
  }
}

-------------------------------------------------------------------
{
  "took": 177,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 2,
    "max_score": 0.6489038,
    "hits": [
      {
        "_index": "news",
        "_type": "international",
        "_id": "4",
        "_score": 0.6489038,
        "_source": {
          "content": "中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"
        },
        "highlight": {
          "content": [
            "<em>中国</em>驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"
          ]
        }
      },
      {
        "_index": "news",
        "_type": "international",
        "_id": "3",
        "_score": 0.2876821,
        "_source": {
          "content": "中韩渔警冲突调查:韩警平均每天扣1艘中国渔船"
        },
        "highlight": {
          "content": [
            "中韩渔警冲突调查:韩警平均每天扣1艘<em>中国</em>渔船"
          ]
        }
      }
    ]
  }
}

IK词典配置

[root@localhost analysis-ik]# vim /usr/elasticsearch-6.4.0/config/analysis-ik/IKAnalyzer.cfg.xml

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
        <comment>IK Analyzer 扩展配置</comment>
        <!--用户可以在这里配置自己的扩展字典 -->
        <entry key="ext_dict">mydict.dic;</entry>
         <!--用户可以在这里配置自己的扩展停止词字典-->
        <entry key="ext_stopwords"></entry>
        <!--用户可以在这里配置远程扩展字典 -->
        <!--<entry key="remote_ext_dict">location</entry>-->
        <!--用户可以在这里配置远程扩展停止词字典-->
        <!--<entry key="remote_ext_stopwords">http://xxx.com/xxx.dic</entry>-->
</properties>

自定义扩展词测试

POST /_analyze
{
  "analyzer": "ik_max_word",
  "text": ["抖音视频真的好火啊"]
}
# 未添加扩展词 效果如下
---------------------------------------------------
{
  "tokens": [
    {
      "token": "抖",
      "start_offset": 0,
      "end_offset": 1,
      "type": "CN_CHAR",
      "position": 0
    },
    {
      "token": "音视频",
      "start_offset": 1,
      "end_offset": 4,
      "type": "CN_WORD",
      "position": 1
    },
    ......
  ]
}
      
# 添加扩展词 效果如下
---------------------------------------------------
{
  "tokens": [
    {
      "token": "抖音",
      "start_offset": 0,
      "end_offset": 2,
      "type": "CN_WORD",
      "position": 0
    },
    ......
    {
      "token": "腰子姐",
      "start_offset": 17,
      "end_offset": 20,
      "type": "CN_WORD",
      "position": 8
    }
  ]
}      

猜你喜欢

转载自blog.csdn.net/qq_31871785/article/details/86142572