ElasticSearch(3)

导入tmdb

tmdb是电影数据,他的数据量很大,非常适合用作es实践。直接谷歌kaggle tmdb下载。

首先还是要在es上建立mapping:

PUT /movie
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 1
  },
  "mappings": {
    "properties": {
      "title":{"type": "text","analyzer": "english"},
      "tagline":{"type": "text","analyzer": "english"},
      "release_date":{"type": "date","format": "8yyyy/MM/dd||yyyy/M/dd||yyyy/MM/d||yyyy/M/d"},
      "popularity":{"type": "double"},
      "overview":{"type": "text","analyzer": "english"},
      "cast":{
        "type": "object",
        "properties": {
          "character":{"type":"text","analyzer":"standard"},
          "name":{"type":"text","analyzer":"standard"}
        }
        
      }
    }
  }
}

接下来创建一个程序

        <dependency>
			<groupId>org.springframework.boot</groupId>
			<artifactId>spring-boot-starter</artifactId>
		</dependency>
		<dependency>
			<groupId>org.springframework.boot</groupId>
			<artifactId>spring-boot-starter-web</artifactId>
		</dependency>
		<dependency>
			<groupId>org.elasticsearch</groupId>
			<artifactId>elasticsearch</artifactId>
			<version>7.6.1</version>
		</dependency>
		<dependency>
			<groupId>org.elasticsearch.client</groupId>
			<artifactId>transport</artifactId>
			<version>7.6.1</version>
		</dependency>
		<dependency>
			<groupId>org.springframework.boot</groupId>
			<artifactId>spring-boot-starter-test</artifactId>
			<scope>test</scope>
		</dependency>
		<dependency>
			<groupId>org.elasticsearch.plugin</groupId>
			<artifactId>transport-netty4-client</artifactId>
			<version>7.6.1</version>
		</dependency>
		<dependency>
			<groupId>com.opencsv</groupId>
			<artifactId>opencsv</artifactId>
			<version>4.2</version>
		</dependency>
		<dependency>
			<groupId>com.alibaba</groupId>
			<artifactId>fastjson</artifactId>
			<version>1.2.58</version>
		</dependency>
/**
 * Spring configuration that exposes the Elasticsearch {@link TransportClient}
 * as a bean.
 */
@Configuration
public class ESConfig {
    /**
     * Creates a transport client for the cluster named "dianping-app" and
     * registers the three nodes listening on 127.0.0.1 at transport ports
     * 9300, 9301 and 9302.
     *
     * @return a client with all three transport addresses registered; never {@code null}
     * @throws IllegalStateException if the client cannot be built or an address
     *                               cannot be resolved (the original cause is attached)
     */
    @Bean
    public TransportClient getClient(){
        TransportClient transportClient = null;
        try {
            Settings settings = Settings.builder()
                    .put("cluster.name","dianping-app").build();
            transportClient = new PreBuiltTransportClient(settings);
            for (int port : new int[]{9300, 9301, 9302}) {
                transportClient.addTransportAddress(
                        new TransportAddress(InetAddress.getByName("127.0.0.1"), port));
            }
            return transportClient;
        } catch (Exception e) {
            // Fail fast instead of handing out a null or half-configured client:
            // release whatever was already allocated and surface the root cause.
            if (transportClient != null) {
                transportClient.close();
            }
            throw new IllegalStateException("Failed to create Elasticsearch TransportClient", e);
        }
    }
}
/**
 * HTTP endpoints under {@code /es} for fetching a document from the "movie"
 * index and for bulk-loading the index from a local CSV file.
 */
// NOTE(review): the value given to @Controller is the Spring bean name, not a URL;
// the request path comes from @RequestMapping. Confirm the bean name "/es" is intended.
@Controller("/es")
@RequestMapping("/es")
public class ESController {

    @Autowired
    private TransportClient transportClient;

    /**
     * Looks up a single document in the "movie" index by id.
     *
     * @param id document id (required request parameter)
     * @return HTTP 200 whose body is the document source as returned by
     *         {@code GetResponse#getSource()}
     */
    @RequestMapping("/get")
    @ResponseBody
    public ResponseEntity get(@RequestParam(name="id")Integer id){
        GetResponse getResponse = transportClient.prepareGet("movie",null,id.toString()).get();
        return new ResponseEntity(getResponse.getSource(), HttpStatus.OK);
    }


    /**
     * Reads {@code ./tmdb_5000_movies.csv} and submits one index request per
     * data row to the "movie" index in a single asynchronous bulk call.
     *
     * <p>The first CSV line is treated as the header and skipped. Document ids
     * are the 1-based data row numbers. Rows that cannot be converted are
     * skipped and reported on stderr instead of being dropped silently. The
     * bulk call is asynchronous, so this method returns before indexing has
     * finished; the outcome is reported by the listener.
     *
     * @return HTTP 200 with an empty string body once the bulk request has been submitted
     * @throws IOException if the CSV file cannot be opened or read
     */
    @RequestMapping("/importdata")
    @ResponseBody
    public ResponseEntity importdata() throws IOException {
        // Batch insert: every row becomes one action inside this request.
        BulkRequest bulkRequest = new BulkRequest();

        List<String[]> allRecords;
        // try-with-resources closes the reader and the underlying stream even when readAll() throws.
        try (InputStreamReader in = new InputStreamReader(new FileInputStream("./tmdb_5000_movies.csv"), Charset.forName("UTF-8"));
             CSVReader reader = new CSVReader(in, ',')) {
            allRecords = reader.readAll();
        }

        int lineId = 0;
        int skipped = 0;
        for (String[] records : allRecords) {
            lineId++;
            if(lineId == 1){
                // Header line.
                continue;
            }
            try{
                bulkRequest.add(toIndexRequest(records, lineId - 1));
            }catch(Exception ex){
                // Best effort: keep importing the remaining rows, but make the loss visible.
                skipped++;
                System.err.println("importdata: skipping CSV line " + lineId + ": " + ex);
            }
        }
        if (skipped > 0) {
            System.err.println("importdata: skipped " + skipped + " row(s) that could not be converted");
        }

        transportClient.bulk(bulkRequest, new ActionListener<BulkResponse>() {
            @Override
            public void onResponse(BulkResponse bulkItemResponses) {
                // A bulk call can succeed as a whole while individual items are rejected.
                if (bulkItemResponses.hasFailures()) {
                    System.err.println(bulkItemResponses.buildFailureMessage());
                }
                System.out.println(bulkItemResponses);
            }

            @Override
            public void onFailure(Exception e) {
                System.out.println(e);
            }
        });
        return new ResponseEntity("", HttpStatus.OK);
    }

    /**
     * Converts one CSV row into an index request for the "movie" index.
     *
     * @param records the parsed CSV columns of a single row
     * @param docId   the document id to assign
     * @return one index record ready to be added to a bulk request
     */
    private IndexRequest toIndexRequest(String[] records, int docId) {
        // NOTE(review): column 20 is read as a JSON array of cast entries — confirm against the CSV layout.
        // Only the first cast entry is kept; a row with an empty array throws and is skipped by the caller.
        JSONArray castJsonArray = JSONArray.parseArray(records[20]);
        String character = (String) castJsonArray.getJSONObject(0).get("character");
        String name = (String) castJsonArray.getJSONObject(0).get("name");
        JSONObject cast = new JSONObject();
        cast.put("character",character);
        cast.put("name",name);

        // Rows without a release date get a placeholder date.
        String date = records[11];
        if(date == null || date.equals("")){
            date = "1970/01/01";
        }

        return new IndexRequest("movie", "_doc", String.valueOf(docId)).source(XContentType.JSON,
                "title", records[17],
                "tagline",records[16],
                "release_date",date,
                "popularity",records[8],
                "cast",cast,
                "overview",records[7]);
    }
}

将csv放到项目目录下,运行一下,数据就导进来了。

一些查询

之前说过一些语句,这里再说一些,match之前说过,适用于关键词匹配的,逻辑是or,例如:

GET /movie/_search
{
  "query": {
    "match": {
      "title": "basketball with cartoom aliens"
    }
  }
}

那么换成and查询的话,代码:

GET /movie/_search
{
  "query": {
    "match": {
      "title": {
        "query": "basketball with cartoom aliens",
        "operator":"and"
      }
    }
  }
}

最小词匹配项minimum_should_match,意思就是最少有几个词匹配,默认or是1个词匹配。

GET /movie/_search
{
  "query": {
    "match": {
      "title": {
        "query": "basketball love aliens",
        "operator":"or",
        "minimum_should_match": 2
      }
    }
  }
}

短语查询:match_phrase,这样,两个词就不会被拆开分别匹配了(要求按顺序相邻出现)。它跟term的区别在于,查询文本会先经过分析器处理(例如大小写归一化),而term不会对查询文本做分析。

GET /movie/_search
{
  "query": {
    "match_phrase": {
      "title": "steve zissou"
    }
  }
}

多字段查询:multi_match,这个可以同时查多个字段

GET /movie/_search
{
  "query": {
    "multi_match": {
      "query": "basketball with cartoom aliens",
      "fields": ["title","overview"]
    }
  }
}

我们在查询的时候,每个查询都会有一个score打分,

这个打分我们之前说过一个TF/IDF,再补充一个TFNORM(token frequency normalized,词频归一化)。例如搜索steve jobs,结果:

jobs这个词在第一个结果里占比50%,第二个占比33.3%,所以第一个结果的归一化词频更高,分数也就更高。

那么打分的过程是什么样的呢?

GET /movie/_search
{
  "explain": true, 
  "query": {
    "match": {
      "title": "steve"
    }
  }
}

结果:截取其中一个看下

{
        "_shard" : "[movie][0]",
        "_node" : "WQDVMY19QXOuANmMQ0yWWg",
        "_index" : "movie",
        "_type" : "_doc",
        "_id" : "2340",
        "_score" : 7.4039927,
        "_source" : {
          "title" : "Steve Jobs",
          "tagline" : "Can a great man be a good man?",
          "release_date" : "2015/10/9",
          "popularity" : "53.670525",
          "cast" : {
            "character" : "Burke",
            "name" : "Aaron Eckhart"
          },
          "overview" : "Set backstage at three iconic product launches and ending in 1998 with the unveiling of the iMac, Steve Jobs takes us behind the scenes of the digital revolution to paint an intimate portrait of the brilliant man at its epicenter."
        },
        "_explanation" : {
          "value" : 7.4039927,
          "description" : "weight(title:steve in 2183) [PerFieldSimilarity], result of:",
          "details" : [
            {
              "value" : 7.4039927,
              "description" : "score(freq=1.0), computed as boost * idf * tf from:",
              "details" : [
                {
                  "value" : 2.2,
                    //默认放大系数
                  "description" : "boost",
                  "details" : [ ]
                },
                {
                  "value" : 7.1592917,
                  "description" : "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
                  "details" : [
                    {
                      "value" : 3,
                      "description" : "n, number of documents containing term",
                      "details" : [ ]
                    },
                    {
                      "value" : 4500,
                      "description" : "N, total number of documents with field",
                      "details" : [ ]
                    }
                  ]
                },
                {
                  "value" : 0.47008157,
                  "description" : "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
                  "details" : [
                    {
                      "value" : 1.0,
                      "description" : "freq, occurrences of term within document",
                      "details" : [ ]
                    },
                    {
                      "value" : 1.2,
                      "description" : "k1, term saturation parameter",
                      "details" : [ ]
                    },
                    {
                      "value" : 0.75,
                      "description" : "b, length normalization parameter",
                      "details" : [ ]
                    },
                    {
                      "value" : 2.0,
                      "description" : "dl, length of field",
                      "details" : [ ]
                    },
                    {
                      "value" : 2.1757777,
                      "description" : "avgdl, average length of field",
                      "details" : [ ]
                    }
                  ]
                }
              ]
            }
          ]
        }
      }

结果可以看出,分数是tf分数*idf分数*放大系数(boost)得来的。其中tf的计算公式freq / (freq + k1 * (1 - b + b * dl / avgdl))来自BM25算法:分母中的k1控制词频饱和,b和dl / avgdl则按字段长度做归一化,用来解决词频问题。

对于多字段查询还有个问题,

GET /movie/_search
{
  "query": {
    "multi_match": {
      "query": "basketball with cartoom aliens",
      "fields": ["title","overview"]
    }
  }
}

查询结果:

{
        "_shard" : "[movie][0]",
        "_node" : "WQDVMY19QXOuANmMQ0yWWg",
        "_index" : "movie",
        "_type" : "_doc",
        "_id" : "453",
        "_score" : 8.579647,
        "_source" : {
          "title" : "Space Jam",
          "tagline" : "Get ready to jam.",
          "release_date" : "1996/11/15",
          "popularity" : "36.125715",
          "cast" : {
            "character" : "Cameron Poe",
            "name" : "Nicolas Cage"
          },
          "overview" : "In a desperate attempt to win a basketball match and earn their freedom, the Looney Tunes seek the aid of retired basketball champion, Michael Jordan."
        },
        "_explanation" : {
          "value" : 8.579647,
          "description" : "max of:",
          "details" : [
            {
              "value" : 8.579647,
              "description" : "sum of:",
              "details" : [
                {
                  "value" : 8.579647,
                  "description" : "weight(overview:basketbal in 396) [PerFieldSimilarity], result of:",
                  "details" : [
                    {
                      "value" : 8.579647,
                      "description" : "score(freq=2.0), computed as boost * idf * tf from:",
                      "details" : [
                        {
                          "value" : 2.2,
                          "description" : "boost",
                          "details" : [ ]
                        },
                        {
                          "value" : 5.25461,
                          "description" : "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
                          "details" : [
                            {
                              "value" : 23,
                              "description" : "n, number of documents containing term",
                              "details" : [ ]
                            },
                            {
                              "value" : 4498,
                              "description" : "N, total number of documents with field",
                              "details" : [ ]
                            }
                          ]
                        },
                        {
                          "value" : 0.74217486,
                          "description" : "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
                          "details" : [
                            {
                              "value" : 2.0,
                              "description" : "freq, occurrences of term within document",
                              "details" : [ ]
                            },
                            {
                              "value" : 1.2,
                              "description" : "k1, term saturation parameter",
                              "details" : [ ]
                            },
                            {
                              "value" : 0.75,
                              "description" : "b, length normalization parameter",
                              "details" : [ ]
                            },
                            {
                              "value" : 16.0,
                              "description" : "dl, length of field",
                              "details" : [ ]
                            },
                            {
                              "value" : 36.475765,
                              "description" : "avgdl, average length of field",
                              "details" : [ ]
                            }
                          ]
                        }
                      ]
                    }
                  ]
                }
              ]
            }
          ]
        }
      },
      {
        "_shard" : "[movie][0]",
        "_node" : "WQDVMY19QXOuANmMQ0yWWg",
        "_index" : "movie",
        "_type" : "_doc",
        "_id" : "2550",
        "_score" : 8.280251,
        "_source" : {
          "title" : "Love & Basketball",
          "tagline" : "All's fair in love and basketball.",
          "release_date" : "2000/4/21",
          "popularity" : "2.027393",
          "cast" : {
            "character" : "Laurie Strode",
            "name" : "Jamie Lee Curtis"
          },
          "overview" : "A young African-American couple navigates the tricky paths of romance and athletics in this drama. Quincy McCall (Omar Epps) and Monica Wright (Sanaa Lathan) grew up in the same neighborhood and have known each other since childhood. As they grow into adulthood, they fall in love, but they also share another all-consuming passion: basketball. They've followed the game all their lives and have no small amount of talent on the court. As Quincy and Monica struggle to make their relationship work, they follow separate career paths though high school and college basketball and, they hope, into stardom in big-league professional ball."
        },
        "_explanation" : {
          "value" : 8.280251,
          "description" : "max of:",
          "details" : [
            {
              "value" : 5.812291,
              "description" : "sum of:",
              "details" : [
                {
                  "value" : 5.812291,
                  "description" : "weight(overview:basketbal in 2376) [PerFieldSimilarity], result of:",
                  "details" : [
                    {
                      "value" : 5.812291,
                      "description" : "score(freq=2.0), computed as boost * idf * tf from:",
                      "details" : [
                        {
                          "value" : 2.2,
                          "description" : "boost",
                          "details" : [ ]
                        },
                        {
                          "value" : 5.25461,
                          "description" : "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
                          "details" : [
                            {
                              "value" : 23,
                              "description" : "n, number of documents containing term",
                              "details" : [ ]
                            },
                            {
                              "value" : 4498,
                              "description" : "N, total number of documents with field",
                              "details" : [ ]
                            }
                          ]
                        },
                        {
                          "value" : 0.5027872,
                          "description" : "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
                          "details" : [
                            {
                              "value" : 2.0,
                              "description" : "freq, occurrences of term within document",
                              "details" : [ ]
                            },
                            {
                              "value" : 1.2,
                              "description" : "k1, term saturation parameter",
                              "details" : [ ]
                            },
                            {
                              "value" : 0.75,
                              "description" : "b, length normalization parameter",
                              "details" : [ ]
                            },
                            {
                              "value" : 68.0,
                              "description" : "dl, length of field (approximate)",
                              "details" : [ ]
                            },
                            {
                              "value" : 36.475765,
                              "description" : "avgdl, average length of field",
                              "details" : [ ]
                            }
                          ]
                        }
                      ]
                    }
                  ]
                }
              ]
            },
            {
              "value" : 8.280251,
              "description" : "sum of:",
              "details" : [
                {
                  "value" : 8.280251,
                  "description" : "weight(title:basketbal in 2376) [PerFieldSimilarity], result of:",
                  "details" : [
                    {
                      "value" : 8.280251,
                      "description" : "score(freq=1.0), computed as boost * idf * tf from:",
                      "details" : [
                        {
                          "value" : 2.2,
                          "description" : "boost",
                          "details" : [ ]
                        },
                        {
                          "value" : 8.00659,
                          "description" : "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
                          "details" : [
                            {
                              "value" : 1,
                              "description" : "n, number of documents containing term",
                              "details" : [ ]
                            },
                            {
                              "value" : 4500,
                              "description" : "N, total number of documents with field",
                              "details" : [ ]
                            }
                          ]
                        },
                        {
                          "value" : 0.47008157,
                          "description" : "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
                          "details" : [
                            {
                              "value" : 1.0,
                              "description" : "freq, occurrences of term within document",
                              "details" : [ ]
                            },
                            {
                              "value" : 1.2,
                              "description" : "k1, term saturation parameter",
                              "details" : [ ]
                            },
                            {
                              "value" : 0.75,
                              "description" : "b, length normalization parameter",
                              "details" : [ ]
                            },
                            {
                              "value" : 2.0,
                              "description" : "dl, length of field",
                              "details" : [ ]
                            },
                            {
                              "value" : 2.1757777,
                              "description" : "avgdl, average length of field",
                              "details" : [ ]
                            }
                          ]
                        }
                      ]
                    }
                  ]
                }
              ]
            }
          ]
        }
      }

通过查看评分可以看出,第一个结果的title是Space Jam,第二个是Love & Basketball。通常我们搜索basketball时,希望标题中含有该词的文档排在前面,但是这里却不是这样的。原因在于评分机制:多字段查询的评分会取各字段得分的最大值。从第一个结果可以看出,title中并没有basketball这个词,所以取的就是overview中的分数(8.579647);而第二个结果取的是title的分数(8.280251),仍然低于第一个结果overview的分数,所以排到了第二位。这就是原因。

如何优化这种情况?

GET /movie/_search
{
  "query": {
    "multi_match": {
      "query": "basketball",
      "fields": ["title^10","overview"]
    }
  }
}

title^10的意思是title这个字段的分数的放大系数放大了10倍,从原来的2.2变成了22,自然排到了第一位。

这种样子还是有些局限,有很多时候并不只是根据一个字段判断,因此:

GET /movie/_search
{
  "query": {
    "multi_match": {
      "query": "basketball",
      "fields": ["title^10","overview"],
      "tie_breaker": 0.3
    }
  }
}

 关于tie_breaker,先看下数据:

解释也很清楚:最终得分 = 最高的字段得分 + 其他字段得分之和 × 0.3(即tie_breaker)。

接下来讲一下布尔查询,首先有几个关键词:

must:必须都为true

must_not:必须都为false

should:其中一个为true即可

还要注意的是,这里的打分是为true的越多,得分越高。例如:

GET /movie/_search
{
  "query": {
    "bool": {
      "should": [
        {"match": {"title": "basketball with cartoom aliens"}},
        {"match": {"overview": "basketball with cartoom aliens"}}
      ]
    }
  }
}

关于打分机制,其实有很多种,multi_match有很多不同的type:

best_fields:默认得分方式,取得最高的分数作为对应文档的分数,是“最匹配模式”,也叫dis_max模式

GET /movie/_search
{
  "query": {
    "multi_match": {
      "query": "basketball with cartoom aliens",
      "fields": ["title","overview"],
      "type": "best_fields"
    }
  }
}

也可以写成:

GET /movie/_search
{
  "query": {
    "dis_max": {
      "queries": [
        {"match": {"title": "basketball with cartoom aliens"}},
        {"match": {"overview": "basketball with cartoom aliens"}}
      ]
    }
  }
}

如果想看打分公式:

GET /movie/_validate/query?explain
{
  "query": {
    "multi_match": {
      "query": "basketball with cartoom aliens",
      "fields": ["title","overview"],
      "type": "best_fields"
    }
  }
}

结果:

most_fields:考虑所有的文档字段得分相加,来获得结果

GET /movie/_search
{
  "query": {
    "multi_match": {
      "query": "basketball",
      "fields": ["title","overview"],
      "type": "most_fields"
    }
  }
}

cross_fields:以分词为单位计算栏位的总分

GET /movie/_search
{
  "query": {
    "multi_match": {
      "query": "steve jobs",
      "fields": ["title","overview"],
      "type": "cross_fields"
    }
  }
}

看下结果,这种是以词为单位来匹配的:先算出steve在overview和title中的分数并取最大值,然后和jobs对应的最大值相加,得出最终的分数。

发布了97 篇原创文章 · 获赞 28 · 访问量 1万+

猜你喜欢

转载自blog.csdn.net/haozi_rou/article/details/104771151