Elasticsearch 定义多个分词器模板

Elasticsearch 定义多个分词器模板

版本:Elasticsearch 7.2.0

1.定义索引时,使用多个分词器

2.在索引模板中使用多个分词器

3.ik+pinyin

对人工智能感兴趣的同学,可以点击以下链接:

现在人工智能非常火爆,很多朋友都想学,但是一般的教程都是为博硕生准备的,太难看懂了。最近发现了一个非常适合小白入门的教程,不仅通俗易懂而且还很风趣幽默。所以忍不住分享一下给大家。点这里可以跳转到教程。

https://www.cbedai.net/u014646662

1.定义索引时,使用多个分词器

Ik+English分词器

Ik分词器可以对汉语分词(默认还会把英文字母转换为小写);English词干过滤器(stemmer)可以对英文中的时态、复数等词形变化进行还原处理

PUT /test
{
  "settings": {
    "number_of_shards": 3,
    "number_of_replicas": 0,
    "analysis": {
      "analyzer": {
        "ik_en_analyzer": {
          "type": "custom",
          "tokenizer": "ik_max_word",
          "filter": ["en_stemmer"]
        }
      },
      "filter": {
        "en_stemmer": {
          "type": "stemmer",
          "name": "english"
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "id": { "type": "long" },
      "name": {
        "type": "text",
        "analyzer": "ik_en_analyzer"
      },
      "text": { "type": "text" }
    }
  }
}

测试


GET /test/_analyze
{
  "text": "Saying and doing are two different things.",
  "field": "name"
}

{
  "tokens" : [
    {
      "token" : "sai",
      "start_offset" : 0,
      "end_offset" : 6,
      "type" : "ENGLISH",
      "position" : 0
    },
    {
      "token" : "do",
      "start_offset" : 11,
      "end_offset" : 16,
      "type" : "ENGLISH",
      "position" : 1
    },
    {
      "token" : "two",
      "start_offset" : 21,
      "end_offset" : 24,
      "type" : "ENGLISH",
      "position" : 2
    },
    {
      "token" : "differ",
      "start_offset" : 25,
      "end_offset" : 34,
      "type" : "ENGLISH",
      "position" : 3
    },
    {
      "token" : "things.",
      "start_offset" : 35,
      "end_offset" : 42,
      "type" : "LETTER",
      "position" : 4
    },
    {
      "token" : "thing",
      "start_offset" : 35,
      "end_offset" : 41,
      "type" : "ENGLISH",
      "position" : 5
    }
  ]
}

GET /test/_analyze
{
  "text": "Ik分词器可以对汉语分词",
  "field": "name"
}

{
  "tokens" : [
    {
      "token" : "ik",
      "start_offset" : 0,
      "end_offset" : 2,
      "type" : "ENGLISH",
      "position" : 0
    },
    {
      "token" : "分词器",
      "start_offset" : 2,
      "end_offset" : 5,
      "type" : "CN_WORD",
      "position" : 1
    },
    {
      "token" : "分词",
      "start_offset" : 2,
      "end_offset" : 4,
      "type" : "CN_WORD",
      "position" : 2
    },
    {
      "token" : "器",
      "start_offset" : 4,
      "end_offset" : 5,
      "type" : "CN_CHAR",
      "position" : 3
    },
    {
      "token" : "可以",
      "start_offset" : 5,
      "end_offset" : 7,
      "type" : "CN_WORD",
      "position" : 4
    },
    {
      "token" : "以对",
      "start_offset" : 6,
      "end_offset" : 8,
      "type" : "CN_WORD",
      "position" : 5
    },
    {
      "token" : "汉语",
      "start_offset" : 8,
      "end_offset" : 10,
      "type" : "CN_WORD",
      "position" : 6
    },
    {
      "token" : "分词",
      "start_offset" : 10,
      "end_offset" : 12,
      "type" : "CN_WORD",
      "position" : 7
    }
  ]
}

2.在索引模板中使用多个分词器

POST /_template/template_default
{
  "order": 0,
  "version": 0,
  "index_patterns": ["*"],
  "settings": {
    "number_of_shards": 2,
    "number_of_replicas": 1,
    "analysis": {
      "filter": {
        "en_stemmer": {
          "type": "stemmer",
          "name": "english"
        }
      },
      "analyzer": {
        "ik_en_analyzer": {
          "type": "custom",
          "tokenizer": "ik_max_word",
          "filter": ["en_stemmer"]
        }
      }
    }
  },
  "mappings": {
    "date_detection": true,
    "numeric_detection": true,
    "dynamic_templates": [
      {
        "string_fields": {
          "match": "*",
          "match_mapping_type": "string",
          "mapping": {
            "type": "text",
            "norms": false,
            "analyzer": "ik_en_analyzer",
            "fields": {
              "keyword": { "type": "keyword" }
            }
          }
        }
      }
    ]
  }
}

3.ik+pinyin

PUT /test
{
  "settings": {
    "number_of_shards": 3,
    "number_of_replicas": 0,
    "analysis": {
      "analyzer": {
        "ik_pinyin_analyzer": {
          "type": "custom",
          "tokenizer": "ik_max_word",
          "filter": ["my_pinyin"]
        }
      },
      "filter": {
        "my_pinyin": {
          "type": "pinyin",
          "keep_separate_first_letter": false,
          "keep_full_pinyin": true,
          "keep_original": true,
          "limit_first_letter_length": 16,
          "lowercase": true,
          "remove_duplicated_term": true
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "id": { "type": "long" },
      "name": {
        "type": "text",
        "analyzer": "ik_pinyin_analyzer"
      },
      "text": { "type": "text" }
    }
  }
}

pinyin分词器详解:https://github.com/medcl/elasticsearch-analysis-pinyin

发布了139 篇原创文章 · 获赞 273 · 访问量 666万+

猜你喜欢

转载自blog.csdn.net/u014646662/article/details/97272096