前言
本文基于elasticsearch7.3.0版本
说明
edge_ngram和ngram是elasticsearch内置的两种分词组件,二者都同时提供tokenizer和token filter两种形式,本文示例将它们作为token filter使用
实例
步骤
- 自定义两个分析器edge_ngram_analyzer和ngram_analyzer
- 进行分词测试
创建测试索引
PUT analyzer_test
{
"settings": {
"refresh_interval": "1s",
"index": {
"max_ngram_diff": 10
},
"analysis": {
"analyzer": {
"edge_ngram_analyzer": {
"type": "custom",
"char_filter": [],
"tokenizer": "keyword",
"filter": [
"edge_ngram_filter"
]
},
"ngram_analyzer": {
"type": "custom",
"char_filter": [],
"tokenizer": "keyword",
"filter": [
"ngram_filter"
]
}
},
"filter": {
"edge_ngram_filter": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 11
},
"ngram_filter": {
"type": "ngram",
"min_gram": 2,
"max_gram": 5
}
}
}
}
}
测试edge_ngram_analyzer分析器
POST /analyzer_test/_analyze
{
"text": "虹桥机场",
"analyzer": "edge_ngram_analyzer"
}
{
"tokens" : [
{
"token" : "虹",
"start_offset" : 0,
"end_offset" : 4,
"type" : "word",
"position" : 0
},
{
"token" : "虹桥",
"start_offset" : 0,
"end_offset" : 4,
"type" : "word",
"position" : 0
},
{
"token" : "虹桥机",
"start_offset" : 0,
"end_offset" : 4,
"type" : "word",
"position" : 0
},
{
"token" : "虹桥机场",
"start_offset" : 0,
"end_offset" : 4,
"type" : "word",
"position" : 0
}
]
}
测试ngram_analyzer分析器
POST /analyzer_test/_analyze
{
"text": "虹桥机场",
"analyzer": "ngram_analyzer"
}
{
"tokens" : [
{
"token" : "虹桥",
"start_offset" : 0,
"end_offset" : 4,
"type" : "word",
"position" : 0
},
{
"token" : "虹桥机",
"start_offset" : 0,
"end_offset" : 4,
"type" : "word",
"position" : 0
},
{
"token" : "虹桥机场",
"start_offset" : 0,
"end_offset" : 4,
"type" : "word",
"position" : 0
},
{
"token" : "桥机",
"start_offset" : 0,
"end_offset" : 4,
"type" : "word",
"position" : 0
},
{
"token" : "桥机场",
"start_offset" : 0,
"end_offset" : 4,
"type" : "word",
"position" : 0
},
{
"token" : "机场",
"start_offset" : 0,
"end_offset" : 4,
"type" : "word",
"position" : 0
}
]
}
区别
- edge_ngram只从文本的第一个字符开始,生成长度从min_gram到max_gram的前缀词项,适合前缀匹配场景,比如:订单号、手机号、邮政编码的检索
- ngram从文本的每一个字符位置开始,生成长度从min_gram到max_gram的子串词项,词项数量更多、索引体积更大,适合前缀和中缀(任意位置子串)的检索