Why are the scores and the order of the results different when searching documents registered as nested, compared with searching the same documents registered normally?


(Masanori Ohnishi) #1

normal(not nested)

mapping

curl -X PUT "es:9200/sample1" -H 'Content-Type: application/json' -d'
{
  "mappings": {
    "_doc": {
      "properties": {
        "header" : {
          "type" : "text"
        },
        "body" : {
          "type" : "text"
        }
      }
    }
  }
}
'

register

curl -X PUT "es:9200/sample1/_doc/1?refresh" -H 'Content-Type: application/json' -d'
{
  "header": "最適な問題",
  "body": "これは数学です。"
}
'

curl -X PUT "es:9200/sample1/_doc/2?refresh" -H 'Content-Type: application/json' -d'
{
  "header": "最悪な回答",
  "body": "これは社会です。"
}
'

curl -X PUT "es:9200/sample1/_doc/3?refresh" -H 'Content-Type: application/json' -d'
{
  "header": "最良の問題",
  "body": "これは国語です。"
}
'

curl -X PUT "es:9200/sample1/_doc/4?refresh" -H 'Content-Type: application/json' -d'
{
  "header": "ひどい回答",
  "body": "これは理科です。"
}
'

search query

curl -XGET 'es:9200/sample1/_search?pretty' -H 'Content-Type: application/json' -d'
{
   "query" : {
        "simple_query_string":{
        "query": "最適",
        "fields": ["header","body"]
      }
    }
}'

result

{
  "took" : 2,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 3,
    "max_score" : 0.6931472,
    "hits" : [
      {
        "_index" : "sample1",
        "_type" : "_doc",
        "_id" : "2",
        "_score" : 0.6931472,
        "_source" : {
          "header" : "最悪な回答",
          "body" : "これは社会です。"
        }
      },
      {
        "_index" : "sample1",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 0.5753642,
        "_source" : {
          "header" : "最適な問題",
          "body" : "これは数学です。"
        }
      },
      {
        "_index" : "sample1",
        "_type" : "_doc",
        "_id" : "3",
        "_score" : 0.2876821,
        "_source" : {
          "header" : "最良の問題",
          "body" : "これは国語です。"
        }
      }
    ]
  }
}

nested

mapping

curl -X PUT "es:9200/sample2" -H 'Content-Type: application/json' -d'
{
  "mappings": {
    "_doc": {
      "properties": {
        "title" : {
          "type" : "text"
        },
        "contents": {
          "type": "nested"
        }
      }
    }
  }
}
'

register

curl -X PUT "es:9200/sample2/_doc/1?refresh" -H 'Content-Type: application/json' -d'
{
  "title": "Test title",
  "contents": [
    {
      "header": "最適な問題",
      "body": "これは数学です。"
    },
    {
      "header": "最悪な回答",
      "body": "これは社会です。"
    }
  ]
}
'

curl -X PUT "es:9200/sample2/_doc/2?refresh" -H 'Content-Type: application/json' -d'
{
  "title": "Test title",
  "contents": [
    {
      "header": "最良の問題",
      "body": "これは国語です。"
    },
    {
      "header": "ひどい回答",
      "body": "これは理科です。"
    }
  ]
}
'

search query

curl -XGET "es:9200/sample2/_search?pretty" -H 'Content-Type: application/json' -d'
{
  "_source": false,
  "size": 20,
  "query": {
    "nested": {
      "path": "contents",
      "score_mode": "max",
      "query" : {
          "simple_query_string":{
          "query": "最適",
          "fields": ["contents.header","contents.body"],
          "auto_generate_synonyms_phrase_query": 'true'
        }
      },
      "inner_hits": {} 
    }
  }
}
'

result

{
  "took" : 3,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 2,
    "max_score" : 0.87546873,
    "hits" : [
      {
        "_index" : "sample2",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 0.87546873,
        "inner_hits" : {
          "contents" : {
            "hits" : {
              "total" : 2,
              "max_score" : 0.87546873,
              "hits" : [
                {
                  "_index" : "sample2",
                  "_type" : "_doc",
                  "_id" : "1",
                  "_nested" : {
                    "field" : "contents",
                    "offset" : 0
                  },
                  "_score" : 0.87546873,
                  "_source" : {
                    "header" : "最適な問題",
                    "body" : "これは数学です。"
                  }
                },
                {
                  "_index" : "sample2",
                  "_type" : "_doc",
                  "_id" : "1",
                  "_nested" : {
                    "field" : "contents",
                    "offset" : 1
                  },
                  "_score" : 0.18232156,
                  "_source" : {
                    "header" : "最悪な回答",
                    "body" : "これは社会です。"
                  }
                }
              ]
            }
          }
        }
      },
      {
        "_index" : "sample2",
        "_type" : "_doc",
        "_id" : "2",
        "_score" : 0.6931472,
        "inner_hits" : {
          "contents" : {
            "hits" : {
              "total" : 1,
              "max_score" : 0.6931472,
              "hits" : [
                {
                  "_index" : "sample2",
                  "_type" : "_doc",
                  "_id" : "2",
                  "_nested" : {
                    "field" : "contents",
                    "offset" : 0
                  },
                  "_score" : 0.6931472,
                  "_source" : {
                    "header" : "最良の問題",
                    "body" : "これは国語です。"
                  }
                }
              ]
            }
          }
        }
      }
    ]
  }
}

↑の2つのresultを比べてみると、
同じテキストがヒットしているにもかかわらずスコアが異なり、それに伴って順番も違っています。
なぜ同じスコアや順番にならないのでしょうか?

inner_hitsでの検索と、そうでない検索は、検索ロジックが全く別のものを使っているということでしょうか?


(tsgkdt) #2

こんにちは

こちらで6.5.4の環境で、頂いたIndexの設定、ドキュメントを登録して、
提示されたクエリで確認したところ、以下のようになりました。

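| ID | 内容 | Sample1スコア |
|----|------|---------------|
| 1 | 最適な問題 | 1.5606477 |
| 2 | 最悪な回答 | 0.35667494 |
| 3 | 最良の問題 | 0.35667494 |

| ID | 内容 | Sample2スコア |
|----|------|---------------|
| 1 | 最適な問題, 最悪な回答 | 1.5606477 |
| 2 | 最良の問題 | 0.35667494 |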

Sample1とSample2の間でスコアがずれている、ということは確認できませんでした。

一度、explain: trueをつけて、どのようにスコア値が計算されているかを確認してみてはどうでしょうか。
どのように計算されているかを見れば、なぜスコアリングがずれるのかが分かるのでは? と思います。

POST sample1/_search
{
  "query" : {
        "simple_query_string":{
        "query": "最適",
        "fields": ["header","body"],
        "default_operator": "OR"
      }
    },
    "explain": true
}

例えば、どちらかのインデックスには、他にドキュメントが登録されていたり、
格納されている文字列の長さによっては、avgFieldLengthの値が変わるなどすれば、スコアも変わってくるかと思います。

参考になれば幸いです。


(Masanori Ohnishi) #3

早速のご回答ありがとうございます!
早速教えていただいたことを試してみました!
どうやらdocFreqがずれているようですね...
自分のelasticsearchのversionは6.3.1になります。

例えば、どちらかのインデックスには、他にドキュメントが登録されていたり

ここについては、新しいindexをつくって再度登録しても結果は変わりませんでした。
もしなにか他に思い当たるところありましたら、適宜教えていただけると幸いです。


(Masanori Ohnishi) #4

query

curl -XGET "es:9200/sample2/_search?pretty" -H 'Content-Type: application/json' -d'
{
  "_source": false,
  "size": 20,
  "query": {
    "nested": {
      "path": "contents",
      "score_mode": "max",
      "query" : {
          "simple_query_string":{
          "query": "最適",
          "fields": ["contents.header","contents.body"],
          "auto_generate_synonyms_phrase_query": 'true',
          "default_operator": "OR"
        }
      },
      "inner_hits": {
        "explain": true
      } 
    }
  }
}
'

result

                    "value" : 0.87546873,
                    "description" : "sum of:",
                    "details" : [
                      {
                        "value" : 0.87546873,
                        "description" : "sum of:",
                        "details" : [
                          {
                            "value" : 0.18232156,
                            "description" : "weight(contents.header:最 in 0) [PerFieldSimilarity], result of:",
                            "details" : [
                              {
                                "value" : 0.18232156,
                                "description" : "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                                "details" : [
                                  {
                                    "value" : 0.18232156,
                                    "description" : "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                                    "details" : [
                                      {
                                        "value" : 2.0,
                                        "description" : "docFreq",
                                        "details" : [ ]
                                      },
                                      {
                                        "value" : 2.0,
                                        "description" : "docCount",
                                        "details" : [ ]
                                      }
                                    ]
                                  },
                                  {
                                    "value" : 1.0,
                                    "description" : "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                                    "details" : [
                                      {
                                        "value" : 1.0,
                                        "description" : "termFreq=1.0",
                                        "details" : [ ]
                                      },
                                      {
                                        "value" : 1.2,
                                        "description" : "parameter k1",
                                        "details" : [ ]
                                      },
                                      {
                                        "value" : 0.75,
                                        "description" : "parameter b",
                                        "details" : [ ]
                                      },
                                      {
                                        "value" : 5.0,
                                        "description" : "avgFieldLength",
                                        "details" : [ ]
                                      },
                                      {
                                        "value" : 5.0,
                                        "description" : "fieldLength",
                                        "details" : [ ]
                                      }
・・・
                          {
                            "value" : 0.6931472,
                            "description" : "weight(contents.header:適 in 0) [PerFieldSimilarity], result of:",
                            "details" : [
                              {
                                "value" : 0.6931472,
                                "description" : "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                                "details" : [
                                  {
                                    "value" : 0.6931472,
                                    "description" : "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                                    "details" : [
                                      {
                                        "value" : 1.0,
                                        "description" : "docFreq",
                                        "details" : [ ]
                                      },
                                      {
                                        "value" : 2.0,
                                        "description" : "docCount",
                                        "details" : [ ]
                                      }
                                    ]
                                  },
                                  {
                                    "value" : 1.0,
                                    "description" : "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                                    "details" : [
                                      {
                                        "value" : 1.0,
                                        "description" : "termFreq=1.0",
                                        "details" : [ ]
                                      },
                                      {
                                        "value" : 1.2,
                                        "description" : "parameter k1",
                                        "details" : [ ]
                                      },
                                      {
                                        "value" : 0.75,
                                        "description" : "parameter b",
                                        "details" : [ ]
                                      },
                                      {
                                        "value" : 5.0,
                                        "description" : "avgFieldLength",
                                        "details" : [ ]
                                      },
                                      {
                                        "value" : 5.0,
                                        "description" : "fieldLength",
                                        "details" : [ ]
                                      }
                                    ]

(Masanori Ohnishi) #5

query

curl -XGET 'es:9200/sample1/_search?pretty' -H 'Content-Type: application/json' -d'
{
   "query" : {
        "simple_query_string":{
        "query": "最適",
        "fields": ["header","body"],
        "default_operator": "OR"
      }
    },
    "explain": true
}'

result

          "header" : "最適な問題",
          "body" : "これは数学です。"
        },
        "_explanation" : {
          "value" : 0.5753642,
          "description" : "sum of:",
          "details" : [
            {
              "value" : 0.5753642,
              "description" : "sum of:",
              "details" : [
                {
                  "value" : 0.2876821,
                  "description" : "weight(header:最 in 0) [PerFieldSimilarity], result of:",
                  "details" : [
                    {
                      "value" : 0.2876821,
                      "description" : "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                      "details" : [
                        {
                          "value" : 0.2876821,
                          "description" : "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                          "details" : [
                            {
                              "value" : 1.0,
                              "description" : "docFreq",
                              "details" : [ ]
                            },
                            {
                              "value" : 1.0,
                              "description" : "docCount",
                              "details" : [ ]
                            }
                          ]
                        },
                        {
                          "value" : 1.0,
                          "description" : "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                          "details" : [
                            {
                              "value" : 1.0,
                              "description" : "termFreq=1.0",
                              "details" : [ ]
                            },
                            {
                              "value" : 1.2,
                              "description" : "parameter k1",
                              "details" : [ ]
                            },
                            {
                              "value" : 0.75,
                              "description" : "parameter b",
                              "details" : [ ]
                            },
                            {
                              "value" : 5.0,
                              "description" : "avgFieldLength",
                              "details" : [ ]
                            },
                            {
                              "value" : 5.0,
                              "description" : "fieldLength",
                              "details" : [ ]
                            }
                          ]
                        }
                      ]
                    }
                  ]
                },
                {
                  "value" : 0.2876821,
                  "description" : "weight(header:適 in 0) [PerFieldSimilarity], result of:",
                  "details" : [
                    {
                      "value" : 0.2876821,
                      "description" : "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                      "details" : [
                        {
                          "value" : 0.2876821,
                          "description" : "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                          "details" : [
                            {
                              "value" : 1.0,
                              "description" : "docFreq",
                              "details" : [ ]
                            },
                            {
                              "value" : 1.0,
                              "description" : "docCount",
                              "details" : [ ]
                            }
                          ]
                        },
                        {
                          "value" : 1.0,
                          "description" : "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                          "details" : [
                            {
                              "value" : 1.0,
                              "description" : "termFreq=1.0",
                              "details" : [ ]
                            },
                            {
                              "value" : 1.2,
                              "description" : "parameter k1",
                              "details" : [ ]
                            },
                            {
                              "value" : 0.75,
                              "description" : "parameter b",
                              "details" : [ ]
                            },
                            {
                              "value" : 5.0,
                              "description" : "avgFieldLength",
                              "details" : [ ]
                            },
                            {
                              "value" : 5.0,
                              "description" : "fieldLength",
                              "details" : [ ]
                            }