ElasticSearch query using Top_Hits aggregation returns additional information than required


#1

A search_term can have multiple list of synonyms and I want ElasticSearch to group synonyms by search_term and it appears that top_hits aggregation is the right solution. But the only issue I have is that it returns additional information that I don’t really care about

Here is my mapping

   "mappings": {
   "concepts": {
        "properties": {
            "search_term": {
                "type": "keyword"
            },
            "synonyms": {
                "type": "keyword"
            }
        }
    }
}

Sample data

Document 1:

Blockquote

_source
{
   "search_term" : [                
       "own",
       "possess"
   ],             
   "synonyms" : [
       "own",
       "possess"
   ]
}

Document 2:
_source
{
  "search_term" : [
   "belong",
   "own",
   "possess"
   ],             
   "synonyms" : [
    "belong",
     "own",
     "possess"
    ]
}

Document 3:
_source
{
   "search_term" : [
       "own",
       "possess"
       "control"           
   ],             
   "synonyms" : [
       "own",
       "possess"
       "control"                
   ]
}

ElasticSearch Query

{
	"size": 0,
	"query": {
		"bool": {
			"must": [{
				"terms": {
					"search_term": ["possess"]
				}
			}]
		}
	},
	"aggs": {
		"group_by_search_term": {
			"terms": {
				"field": "search_term"
			},
			"aggs": {
				"include_ synonyms": {
					"top_hits": {
						"_source": {
							"includes": ["synonyms"]
						}
					}
				}
			}
		}
	}
}

Response

{
	"took": 7,
	"timed_out": false,
	"_shards": {
		"total": 5,
		"successful": 5,
		"failed": 0
	},
	"hits": {
		"total": 3,
		"max_score": 0,
		"hits": []
	},
	"aggregations": {
		"group_by_search_term": {
			"doc_count_error_upper_bound": 0,
			"sum_other_doc_count": 0,
			"buckets": [{
					"key": "own",
					"doc_count": 3,
					"include_ suggestions ": {
						"hits": {
							"total": 3,
							"max_score": 5.615558,
							"hits": [{
									"_index": "synonyms0_2018.11.19.14.00.00.000",
									"_type": "synonyms",
									"_id": "I8907d1b2cd64496ca4b42cdaf59befa5",
									"_score": 5.615558,
									"_source": {
										"synonyms": [
											"hold",
											"own",
											"possess",
											"occupy"
										]
									}
								},
								{
									"_index": "synonyms0_2018.11.19.14.00.00.000",
									"_type": "synonyms",
									"_id": "I75e77529a04d43d7a7244acb2a8c369d",
									"_score": 5.0925226,
									"_source": {
										"synonyms": [
											"belong",
											"own",
											"possess"
										]
									}
								},
								{
									"_index": "synonyms0_2018.11.19.14.00.00.000",
									"_type": "synonyms",
									"_id": "I3d127ff1b3ea4c8394cb2ea5c73e733a",
									"_score": 5.0925226,
									"_source": {
										"synonyms": [
											"own",
											"possess",
											"control",
											"acquire",
											"have"
										]
									}
								}
							]
						}
					}
				},
				{
					"key": "possess",
					"doc_count": 3,
					"include_ suggestions ": {
						"hits": {
							"total": 3,
							"max_score": 5.615558,
							"hits": [{
									"_index": "synonyms0_2018.11.19.14.00.00.000",
									"_type": "synonyms",
									"_id": "I8907d1b2cd64496ca4b42cdaf59befa5",
									"_score": 5.615558,
									"_source": {
										"synonyms": [
											"hold",
											"own",
											"possess",
											"occupy"
										]
									}
								},
								{
									"_index": "synonyms0_2018.11.19.14.00.00.000",
									"_type": "synonyms",
									"_id": "I75e77529a04d43d7a7244acb2a8c369d",
									"_score": 5.0925226,
									"_source": {
										"synonyms": [
											"belong",
											"own",
											"possess"
										]
									}
								},
								{
									"_index": "synonyms0_2018.11.19.14.00.00.000",
									"_type": "synonyms",
									"_id": "I3d127ff1b3ea4c8394cb2ea5c73e733a",
									"_score": 5.0925226,
									"_source": {
										"synonyms": [
											"own",
											"possess",
											"control",
											"acquire",
											"have"
										]
									}
								}
							]
						}
					}
				},
				{
					"key": "belong",
					"doc_count": 1,
					"include_ suggestions ": {
						"hits": {
							"total": 1,
							"max_score": 5.0925226,
							"hits": [{
								"_index": "synonyms0_2018.11.19.14.00.00.000",
								"_type": "synonyms",
								"_id": "I75e77529a04d43d7a7244acb2a8c369d",
								"_score": 5.0925226,
								"_source": {
									"synonyms": [
										"belong",
										"own",
										"possess"
									]
								}
							}]
						}
					}
				},
				{
					"key": "control",
					"doc_count": 1,
					"include_ suggestions ": {
						"hits": {
							"total": 1,
							"max_score": 5.0925226,
							"hits": [{
								"_index": "synonyms0_2018.11.19.14.00.00.000",
								"_type": "synonyms",
								"_id": "I3d127ff1b3ea4c8394cb2ea5c73e733a",
								"_score": 5.0925226,
								"_source": {
									"synonyms": [
										"own",
										"possess",
										"control",
										"acquire",
										"have"
									]
								}
							}]
						}
					}
				}

			]
		}
	}
}

I was expecting only “possess” to show up under buckets array but the response seems to include all search_term. I was wondering if it is possible to limit the buckets array to return only the search_term being included in the ElasticSearch query.?

Your help is much appreciated!


(Abdon Pijpelink) #2

Instead of providing the search terms in the query, I would use a filters aggregation to create a separate bucket for each of your search terms. Your request would become:

GET my_index/_search
{
  "size": 0,
  "aggs": {
    "group_by_search_term": {
      "filters": {
        "filters": {
          "possess": {
            "terms": {
              "search_term": [
                "possess"
              ]
            }
          }
        }
      },
      "aggs": {
        "include_ synonyms": {
          "top_hits": {
            "_source": {
              "includes": [
                "synonyms"
              ]
            }
          }
        }
      }
    }
  }
}

If you want to retrieve synonyms for multiple search terms at once, create a filter for each search term. For example, to retrieve the synonyms for both "possess" and "belong":

GET my_index/_search
{
  "size": 0,
  "aggs": {
    "group_by_search_term": {
      "filters": {
        "filters": {
          "possess": {
            "terms": {
              "search_term": [
                "possess"
              ]
            }
          },
          "belong": {
            "terms": {
              "search_term": [
                "belong"
              ]
            }
          }
        }
      },
      "aggs": {
        "include_ synonyms": {
          "top_hits": {
            "_source": {
              "includes": [
                "synonyms"
              ]
            }
          }
        }
      }
    }
  }
}

#3

Thanks a lot Abdon! This works like a charm!


#4

A slight change in the requirement, I want to retrieve synonyms for search_terms and jusridictions.

In SQL, it would be something like this

Select synonyms from terms where search_terms in (‘possess’, ‘belong’) AND jurisdictions in (‘MN’, ‘MI’) group by search_terms

Sample data

Document 1 
{
	"search_term": [
		"own",
		"possess"
	],
	"jurisdictions": [
		"MN",
		"CA"
	],
	"synonyms": [
		"own",
		"possess"
	]
}
 
 Document 2 :
{
	"search_term": [
		"belong",
		"own",
		"possess"
	],
	"jurisdictions": [
		"MN",
		"MI"
	],
	"synonyms": [
		"belong",
		"own",
		"possess"
	]
}

Document 3 :
{
	"search_term": [
		"own",
		"possess",
		"control"
	],
	"jurisdictions": [
		"IN",
		"WI"
	],
	"synonyms": [
		"own",
		"possess",
		"control"
	]
}

Search_terms = [‘belong’, ‘possess’] and jurisdictions = [‘MN’, ‘MI’] , I expect Document 1 and Document 2 in the response.

If I provide search terms and jurisdictions in the query ( as shown below ) and then use top_hits aggregation it works but I get additional information

{
	"query": {
		"bool": {
			"must": [{
					"terms": {
						"search_term": ["possess", "belong"]
					}
				},
				{
					"terms": {
						"jurisdictions": ["MN", "MI"]
					}
				}
			]
		}
	}
}

I am not sure how to modify the fllters aggregation to accomodate jurisdictions. Could you please help?


(Abdon Pijpelink) #5

Something like this should work:

GET my_index/_search
{
  "size": 0,
  "aggs": {
    "group_by_search_term": {
      "filters": {
        "filters": {
          "possess": {
            "bool": {
              "must": [
                {
                  "terms": {
                    "search_term": [
                      "possess"
                    ]
                  }
                },
                {
                  "terms": {
                    "jurisdictions": [
                      "MN",
                      "MI"
                    ]
                  }
                }
              ]
            }
          },
          "belong": {
            "bool": {
              "must": [
                {
                  "terms": {
                    "search_term": [
                      "belong"
                    ]
                  }
                },
                {
                  "terms": {
                    "jurisdictions": [
                      "MN",
                      "MI"
                    ]
                  }
                }
              ]
            }
          }
        }
      },
      "aggs": {
        "include_ synonyms": {
          "top_hits": {
            "_source": {
              "includes": [
                "synonyms"
              ]
            }
          }
        }
      }
    }
  }
}