Hello, I have the following index mapping, along with some example documents. My use case is indexing books, with one document per page. I'm trying to run a query against the content of each page and return the most relevant page from each of the top N most relevant books, but I'm having trouble getting the results returned in the proper order by their _score field:
PUT my-index
{}
PUT my-index/_mapping
{
"properties": {
"page": {
"type": "long"
},
"title": {
"type": "keyword"
},
"content": {
"type": "text"
}
}
}
POST _bulk
{ "index" : { "_index" : "my-index", "_id" : "doc_1.pg_1" } }
{ "page" : 1, "title": "doc_1", "content": "a quick brown fox jumped over the lazy dog ..."}
{ "index" : { "_index" : "my-index", "_id" : "doc_1.pg_2" } }
{ "page" : 2, "title": "doc_1", "content": "... and the fox landed in a briar patch"}
{ "index" : { "_index" : "my-index", "_id" : "doc_2.pg_1" } }
{ "page" : 1, "title": "doc_2", "content": "a slow orange fox leapt over the barky dog ..."}
{ "index" : { "_index" : "my-index", "_id" : "doc_2.pg_2" } }
{ "page" : 2, "title": "doc_2", "content": "... and the dog lept and got the fox"}
So far I've created the following query, which uses a terms aggregation with a nested top_hits aggregation to get the most relevant page of each doc in the corpus:
GET my-index/_search
{
"size":0,
"query" : {
"simple_query_string": {
"query": "the",
"fields": ["content"]
}
},
"aggs": {
"by_title" : {
"terms" : {
"field" : "title"
},
"aggs": {
"get_top_scoring_page": {
"top_hits": {
"sort": {"_score": {"order": "desc"}},
"size": 1,
"highlight": {"fields": {"content": {}}}
}
}
}
}
}
}
And here's what it returns:
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"by_title" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "doc_1",
"doc_count" : 2,
"get_top_scoring_page" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : 0.10795845,
"hits" : [
{
"_index" : "my-index",
"_type" : "_doc",
"_id" : "doc_1.pg_2",
"_score" : 0.10795845,
"_source" : {
"page" : 2,
"title" : "doc_1",
"content" : "... and the fox landed in a briar patch"
},
"highlight" : {
"content" : [
"... and <em>the</em> fox landed in a briar patch"
]
}
}
]
}
}
},
{
"key" : "doc_2",
"doc_count" : 2,
"get_top_scoring_page" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : 0.14730778,
"hits" : [
{
"_index" : "my-index",
"_type" : "_doc",
"_id" : "doc_2.pg_2",
"_score" : 0.14730778,
"_source" : {
"page" : 2,
"title" : "doc_2",
"content" : "... and the dog lept and got the fox"
},
"highlight" : {
"content" : [
"... and <em>the</em> dog lept and got <em>the</em> fox"
]
}
}
]
}
}
}
]
}
}
}
Notice that the higher-scoring document, doc_2 (max_score 0.14730778), is returned after doc_1 (max_score 0.10795845), which is not what I want. I want to sort these buckets by the _score of their top hit in descending order, so that the highest-scoring title is returned first.
Can anyone help me out here? I feel like I'm very close to the answer.