Filter search hits of parent documents based on value of child aggregation


#1

Building a hotel search engine with parent/child documents indexed together - units and availability. How do I filter hits based on a max dollar amount for a given stay.


Build mapping:

PUT /units/

{

	"mappings": {
		"_doc": {
			"properties": {
				"join": {
					"type": "join",
					"relations": {
						"unit": "availability"
					}
				},
				"id": {
					"type": "integer"
				},
				"date": {
					"type": "date",
					"format": "yyyy-MM-dd"
				},
				"available": {
					"type": "boolean"
				},
				"rate": {
					"type": "integer"
				}
			}
		}
	}
}

Add unit:

PUT /units/_doc/1

{
  "join": {
    "name": "unit"
  },
  "unit_id": 1
}

Add some availability:

PUT /units/_doc/2?routing=1

{
  "join": {
    "name": "availability",
    "parent": 1
  },
  "date": "2018-11-01",
  "available": true,
  "rate": 150
}
PUT /units/_doc/3?routing=1

{
  "join": {
    "name": "availability",
    "parent": 1
  },
  "date": "2018-11-02",
  "available": true,
  "rate": 125
}
PUT /units/_doc/4?routing=1

{
  "join": {
    "name": "availability",
    "parent": 1
  },
  "date": "2018-11-03",
  "available": true,
  "rate": 210
}

Result in the following:

GET /units/_doc/_search
{
	"took": 2,
	"timed_out": false,
	"_shards": {
		"total": 5,
		"successful": 5,
		"skipped": 0,
		"failed": 0
	},
	"hits": {
		"total": 4,
		"max_score": 1.0,
		"hits": [
			{
				"_index": "units",
				"_type": "_doc",
				"_id": "1",
				"_score": 1.0,
				"_source": {
					"join": {
						"name": "unit"
					},
					"unit_id": 1
				}
			},
			{
				"_index": "units",
				"_type": "_doc",
				"_id": "2",
				"_score": 1.0,
				"_routing": "1",
				"_source": {
					"join": {
						"name": "availability",
						"parent": 1
					},
					"date": "2018-11-01",
					"available": true,
					"rate": 150
				}
			},
			{
				"_index": "units",
				"_type": "_doc",
				"_id": "3",
				"_score": 1.0,
				"_routing": "1",
				"_source": {
					"join": {
						"name": "availability",
						"parent": 1
					},
					"date": "2018-11-02",
					"available": true,
					"rate": 125
				}
			},
			{
				"_index": "units",
				"_type": "_doc",
				"_id": "4",
				"_score": 1.0,
				"_routing": "1",
				"_source": {
					"join": {
						"name": "availability",
						"parent": 1
					},
					"date": "2018-11-03",
					"available": true,
					"rate": 210
				}
			}
		]
	}
}

Here's where I'm stuck...

I am getting units which are available from 2018-11-01 to 2018-11-03. The aggregation is correctly calculating the total rate for the stay at $485.

GET /units/_doc/_search
{
	"query": {
		"bool": {
			"must": [
				{
					"has_child": {
						"type": "availability",
						"query": {
							"bool": {
								"must": [
									{"term": {"date": "2018-11-01"}},
									{"term": {"available": true}}
								]
							}
						}
					}
				},
				{
					"has_child": {
						"type": "availability",
						"query": {
							"bool": {
								"must": [
									{"term": {"date": "2018-11-02"}},
									{"term": {"available": true}}
								]
							}
						}
					}
				},
				{
					"has_child": {
						"type": "availability",
						"query": {
							"bool": {
								"must": [
									{"term": {"date": "2018-11-03"}},
									{"term": {"available": true}}
								]
							}
						}
					}
				}
			]
		}
	},
	"aggs": {
		"to_unit_ids": {
			"terms": {
				"field": "unit_id"
			},
			"aggs": {
				"to_rates": {
					"children": {
						"type" : "availability" 
					},
					"aggs": {
						"to_filtered_rates": {
							"filter": {
								"range": {
									"date": {
										"gte": "2018-11-01",
										"lte": "2018-11-03"
									}
								}
							},
							"aggs": {
								"to_total_rate": {
									"sum": {
										"field": "rate"
									}
								}
							}
						}
					}
				}
			}
		}
	}
}

{
	"took": 5,
	"timed_out": false,
	"_shards": {
		"total": 5,
		"successful": 5,
		"skipped": 0,
		"failed": 0
	},
	"hits": {
		"total": 1,
		"max_score": 3.0,
			{
				"_index": "units",
				"_type": "_doc",
				"_id": "1",
				"_score": 3.0,
				"_source": {
					"join": {
						"name": "unit"
					},
					"unit_id": 1
				}
			}
		]
	},
	"aggregations": {
		"to_unit_ids": {
			"meta": {},
			"doc_count_error_upper_bound": 0,
			"sum_other_doc_count": 0,
			"buckets": [
				{
					"key": 1,
					"doc_count": 1,
					"to_rates": {
						"meta": {},
						"doc_count": 3,
						"to_filtered_rates": {
							"meta": {},
							"doc_count": 3,
							"to_total_rate": {
								"value": 485.0
							}
						}
					}
				}
			]
		}
	}
}

Okay great... now what if I would like to filter my results for total rental costs that are below $300?

I use the same search as above, but add the extra bucket_selector to the aggs to filter the bucket results:

{
	"aggs": {
		
		...

		"to_filtered_buckets": {
			"bucket_selector": {
				"buckets_path": {
					"totalRentalRate": "to_rates>to_filtered_rates>to_total_rate"
				},
				"script": "params.totalRentalRate <= 300"
			}
		}
	}
}

{
	"took": 5,
	"timed_out": false,
	"_shards": {
		"total": 5,
		"successful": 5,
		"skipped": 0,
		"failed": 0
	},
	"hits": {
		"total": 1,
		"max_score": 3.0,
		"hits": [
			{
				"_index": "units",
				"_type": "_doc",
				"_id": "1",
				"_score": 3.0,
				"_source": {
					"join": {
						"name": "unit"
					},
					"unit_id": 1
				}
			}
		]
	},
	"aggregations": {
		"to_unit_ids": {
			"meta": {},
			"doc_count_error_upper_bound": 0,
			"sum_other_doc_count": 0,
			"buckets": []
		}
	}
}

Better but not quite...

The aggregation bucket has been filtered out but the original hits are still there.


So from here I can filter post process but this could be challenging with millions of hits.

Is there a way to filter the hits based on the results of the child aggregation?

Thanks all!


(system) #2

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.