ElasticSearch advanced syntax

Play with TMDB movie data analysis

Indexing

// Create the movie index: one shard, one replica, and an explicit mapping for the TMDB fields.
PUT /movie
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 1
  },
  "mappings": {
    "properties": {
      "title": { "type": "text", "analyzer": "english" },
      "tagline": { "type": "text", "analyzer": "english" },
      "release_date": {
        "type": "date",
        "format": "8yyyy/MM/dd||yyyy/M/dd||yyyy/MM/d||yyyy/M/d"
      },
      "popularity": { "type": "double" },
      "cast": {
        "type": "object",
        "properties": {
          "character": { "type": "text", "analyzer": "standard" },
          "name": { "type": "text", "analyzer": "standard" }
        }
      },
      "overview": { "type": "text", "analyzer": "english" }
    }
  }
}

data import

  1. Search online for the TMDB movie dataset, download it, and import it into Elasticsearch

Query DSL simple experiment

  1. Match query: the query text is analyzed with the analyzer defined on the field, and the resulting terms are then searched in the index
// Match query for "steve" on the title field.
GET /movie/_search
{
  "query": {
    "match": { "title": "steve" }
  }
}
  1. Term query: the query text is not analyzed; it is looked up in the index directly, so it only hits when the keyword exactly matches a term in the index
// Match query for "steve zissou" on title, for comparison with the term query that follows.
GET /movie/_search
{
  "query": {
    "match": { "title": "steve zissou" }
  }
}

// Term query: "steve zissou" is passed to the title field as one term.
GET /movie/_search
{
  "query": {
    "term": { "title": "steve zissou" }
  }
}
  1. AND and OR between the terms produced when a match query is analyzed
GET /movie/_search
{
  "query": {
    "match": {
      "title": "basketball with cartoom aliens" // OR is used by default
    }
  }
}
GET /movie/_search
{
  "query": {
    "match": {
      "title": {
        "query": "basketball with cartoom aliens",
        "operator": "and" // AND is used here
      }
    }
  }
}
  1. Minimum term match
GET /movie/_search
{
  "query": {
    "match": {
      "title": {
        "query": "basketball with cartoom aliens",
        "operator": "or",
        "minimum_should_match": 2 // at least two of the terms must match
      }
    }
  }
}
  1. Phrase query
GET /movie/_search
{
  "query": {
    "match_phrase": {
      "title": "steve zissou" // match the phrase
    }
  }
}

GET /movie/_search
{
  "query": {
    "match_phrase_prefix": {
      "title": "steve zis" // phrase prefix query
    }
  }
}
  1. Multi-field query
// multi_match runs one query text against several fields.
// The field list must be given under "fields" (plural); "field" is rejected by the query parser.
GET /movie/_search
{
  "query": {
    "multi_match": {
      "query": "basketball with cartoom aliens",
      "fields": ["title", "overview"] // search both of these fields at once
    }
  }
}

Explain the scoring rules again (tf/idf)*tfnorm

  • tf: term frequency, i.e. how many times the term occurs in the document; the more occurrences, the more relevant the document
  • idf: inverse document frequency, counted here as the number of documents that contain the term; the more documents contain it, the less a match on it is worth
  • tfnorm: term frequency normalized by the length of the field; the more often the term occurs and the shorter the field, the more relevant
// Whether the operator is "and" or "or", the scores of the terms that hit under that logic are added together
// Same match query as before, with "explain" switched on in the request body.
GET /movie/_search
{
  "explain": true,
  "query": {
    "match": { "title": "steve" }
  }
}

// multi_match with a per-field boost.
// The parameter is spelled "tie_breaker"; "tie_break" is not a recognized key.
GET /movie/_search
{
  "query": {
    "multi_match": {
      "query": "basketball with cartoom aliens",
      "fields": ["title^10", "overview"], // title carries 10x weight, so title matches dominate
      "tie_breaker": 0.3
    }
  }
}

Continue to inquire

  1. Bool query
  • must: every clause must be true
  • must_not: every clause must be false
  • should: at least one clause is true; the more clauses that are true, the higher the score
// Bool query with two should clauses: the same text matched against title and against overview.
GET /movie/_search
{
  "query": {
    "bool": {
      "should": [
        { "match": { "title": "basketball with cartoom aliens" } },
        { "match": { "overview": "basketball with cartoom aliens" } }
      ]
    }
  }
}
  1. multi_match scores differ depending on the multi_match type
  • Because multi_match has many types
  • The default is best_fields, and the highest score is used as the corresponding score. The best matching mode is equivalent to the dismax mode
// dis_max over two match queries: the same text against title and against overview.
GET /movie/_search
{
  "query": {
    "dis_max": {
      "queries": [
        { "match": { "title": "basketball with cartoom aliens" } },
        { "match": { "overview": "basketball with cartoom aliens" } }
      ]
    }
  }
}
  • Use explain to look at the scoring rule, e.g. ((title:steve title:job) | (overview:steve overview:job))
// Validate the query; explain is requested through the URL parameter.
GET /movie/_validate/query?explain
{
  // "explain": true,   (left disabled in the body)
  "query": {
    "multi_match": {
      "query": "steve job",
      "fields": ["title", "overview"],
      "operator": "or",
      "type": "best_fields"
    }
  }
}
  • Scores are calculated per field over the analyzed terms, and the best-scoring field is then chosen; this suits best-single-field matching
GET /movie/_search
{
  "query": {
    "dis_max": {
      "queries": [
        { "match": { "title": "basketball with cartoom aliens" } },
        { "match": { "overview": "basketball with cartoom aliens" } }
      ],
      "tie_breaker": 0.3 // the other matching clauses are also taken into account, scaled by 0.3
    }
  }
}
  • most_fields: the scores of all matching fields are added together, the same as a bool should query; every field contributes to the score according to its weight
  • cross_fields: scores are calculated term by term across the fields, and the per-term scores make up the total
// Peter must appear in author_first_name or author_last_name
// Smith must appear in author_first_name or author_last_name
// "operator": "and" is what makes every term mandatory; with "or" a single term would be enough.
// The index is addressed without a mapping type in the URL.
GET /forum/_search
{
  "query": {
    "multi_match": {
      "query": "Peter Smith",
      "type": "cross_fields",
      "operator": "and",
      "fields": ["author_first_name", "author_last_name"]
    }
  }
}
  1. query string
  • Makes it convenient to combine terms with AND (+), OR (|) and NOT (-)
// query_string on title: both "steve" and "jobs" are required by the AND operator.
GET /movie/_search
{
  "query": {
    "query_string": {
      "fields": ["title"],
      "query": "steve AND jobs"
    }
  }
}

Filter query

  1. Single condition filter
// Bool query with a single term filter on title.
GET /movie/_search
{
  "query": {
    "bool": {
      "filter": {
        "term": { "title": "steve" }
      }
    }
  }
}
  1. Multi-condition filtering
// Four filter clauses (two term, two range), with hits sorted by popularity in descending order.
GET /movie/_search
{
  "query": {
    "bool": {
      "filter": [
        { "term": { "title": "steve" } },
        { "term": { "cast.name": "gaspard" } },
        { "range": { "release_date": { "lte": "2015/01/01" } } },
        { "range": { "popularity": { "gte": "25" } } }
      ]
    }
  },
  "sort": [
    { "popularity": { "order": "desc" } }
  ]
}
  1. Filter with match score
// Bool query combining must clauses (match queries) with the same four filter clauses.
GET /movie/_search
{
  "query": {
    "bool": {
      "must": [
        { "match": { "title": "Search" } },
        { "match": { "tagline": "Elasticsearch" } }
      ],
      "filter": [
        { "term": { "title": "steve" } },
        { "term": { "cast.name": "gaspard" } },
        { "range": { "release_date": { "lte": "2015/01/01" } } },
        { "range": { "popularity": { "gte": "25" } } }
      ]
    }
  }
}

Essential for a good search engine

  • Recall: if there are n correct results in total and the query returns m of them, recall is m/n
  • Precision: if m of the n documents returned are correct, precision is m/n
  • Ideally both are high, but in general you cannot maximize both at once. You can adjust the ranking so that the correct results appear at the top, which improves the user experience
// function_score: re-score the hits of a multi_match query using the popularity field.
GET /movie/_search
{
  "query": {
    "function_score": {
      // The original query produces the base (old) score
      "query": {
        "multi_match": {
          "query": "steve job",
          "fields": ["title", "overview"],
          "operator": "or",
          "type": "most_fields"
        }
      },
      "functions": [
        {
          "field_value_factor": {
            "field": "popularity", // the field whose value is fed into the function
            "modifier": "log2p",   // add 2 to the field value, then take the logarithm
            "factor": 10           // the field value is pre-multiplied by 10
          }
        }
      ]
    }
  }
}

Guess you like

Origin blog.csdn.net/qq_36221788/article/details/109787157