Hi,
I am a begginner in Elastic Search and I don't know how to write my aggregation correctly and even if what I want is possible.
But between nested, inner_hits, top_hits, reverse_nested, bucket_sort, composite, etc... there is probably a solution somewhere :).
What I want and what I can achieve:
- I have 1 to 1.5M documents containing 1 to 4000 nested documents (but total nested doc < 2M).
-
- I need to filter by several fields on NESTED DOCUMENTS (not on main) for all the following aggregations
-
- I need the total number of MAIN DOCUMENTS once filtered by NESTED DOCUMENTS field(s)
-
- I need global stats (sum, avg) on some NESTED DOCUMENT FIELDS
-
- I need global median on some NESTED DOCUMENT FIELDS
-
- I need to return top_hits of SUM (not count) from NESTED DOCUMENT FIELDS aggregated by a NESTED DOCUMENT FIELD
This is where I'm having trouble:
-
- I need to return top_hits of SUM (not count) from NESTED DOCUMENT FIELDS aggregated by parent document (don't know how)
I have created an example with the same type of mapping of my documents (the mapping is not fixed and can be changed if necessary for a solution) :
Some query for testing:
PUT /category
{
"settings" : {
"number_of_shards" : 2,
"number_of_replicas" : 1
},
"mappings" : {
"properties" : {
"id" : { "type" : "keyword" },
"name" : { "type" : "text" },
"available" : { "type" : "boolean" },
"products" : {
"type": "nested",
"properties": {
"id" : { "type" : "keyword" },
"name" : { "type" : "text" },
"price" : { "type" : "double" },
"quantity" : { "type" : "integer" },
"weight" : { "type" : "double" },
"color" : { "type" : "keyword" },
"brand" : {
"type": "object",
"properties": {
"id" : { "type" : "keyword" },
"name" : { "type" : "text" }
}
},
"madeIn" : {
"type": "object",
"properties": {
"id" : { "type" : "keyword" },
"name" : { "type" : "text" }
}
}
}
}
}
}
}
PUT /category/_doc/1
{
"id":"1",
"name":"car",
"available": true,
"products": [
{
"id":"1",
"name":"ferrari",
"price": 10000,
"quantity":2,
"weight": 1255000,
"color": "red",
"brand": {
"id": 1,
"name": "ferrari"
},
"madeIn": {
"id": 1,
"name":"Italy"
}
},
{
"id":"2",
"name":"clio V",
"price": 2000,
"quantity":20,
"weight": 1250000,
"color": "blue",
"brand": {
"id": 2,
"name": "renault"
},
"madeIn": {
"id": 2,
"name":"France"
}
},
{
"id":"3",
"name":"alpine a110",
"price": 8000,
"quantity":5,
"weight": 1102000,
"color": "blue",
"brand": {
"id": 3,
"name": "alpine"
},
"madeIn": {
"id": 2,
"name":"France"
}
},
{
"id":"4",
"name":"megane 4",
"price": 6000,
"quantity":35,
"weight": 1318000,
"color": "red",
"brand": {
"id": 2,
"name": "renault"
},
"madeIn": {
"id": 2,
"name":"France"
}
},
{
"id":"5",
"name":"opel corsa",
"price": 4000,
"quantity":50,
"weight": 1165000,
"color": "red",
"brand": {
"id": 4,
"name": "opel"
},
"madeIn": {
"id": 3,
"name":"Germany"
}
},
{
"id":"6",
"name":"zoe",
"price": 5000,
"quantity":100,
"weight": 1502000,
"color": "red",
"brand": {
"id": 2,
"name": "renault"
},
"madeIn": {
"id": 2,
"name":"France"
}
}
]
}
PUT /category/_doc/2
{
"id":"2",
"name":"lego",
"available": true,
"products": [
{
"id":"7",
"name":"plate 2x4 B",
"price": 10,
"quantity":20000,
"weight": 5,
"color": "blue",
"brand": {
"id": 5,
"name": "lego"
},
"madeIn": {
"id": 4,
"name":"Denmark"
}
},
{
"id":"8",
"name":"plate 12x4 R",
"price": 100,
"quantity":2000,
"weight": 30,
"color": "red",
"brand": {
"id": 5,
"name": "lego"
},
"madeIn": {
"id": 4,
"name":"Denmark"
}
},
{
"id":"9",
"name":"seat 2x2x2 R",
"price": 4,
"quantity":10000,
"weight": 7,
"color": "red",
"brand": {
"id": 5,
"name": "lego"
},
"madeIn": {
"id": 4,
"name":"Denmark"
}
},
{
"id":"10",
"name":"flat tile 1x2 Bk",
"price": 2,
"quantity":5000,
"weight": 2,
"color": "black",
"brand": {
"id": 5,
"name": "lego"
},
"madeIn": {
"id": 4,
"name":"Denmark"
}
},
{
"id":"11",
"name":"minifig yoda",
"price": 1000,
"quantity":2,
"weight": 20,
"color": "green",
"brand": {
"id": 5,
"name": "lego"
},
"madeIn": {
"id": 4,
"name":"Denmark"
}
}
]
}
PUT /category/_doc/3
{
"id":"3",
"name":"food",
"available": true,
"products": [
{
"id":"12",
"name":"tomato",
"price": 1,
"quantity":200000,
"weight": 200,
"color": "red",
"brand": {
"id": 5,
"name": "barilla"
},
"madeIn": {
"id": 1,
"name":"Italy"
}
},
{
"id":"13",
"name":"very rare spicy",
"price": 12000,
"quantity":20,
"weight": 100,
"color": "red",
"brand": {
"id": 6,
"name": "unknown"
},
"madeIn": {
"id": 5,
"name":"Chili"
}
},
{
"id":"14",
"name":"spinach",
"price": 10,
"quantity":555555,
"weight": 50,
"color": "green",
"brand": {
"id": 6,
"name": "unknown"
},
"madeIn": {
"id": 2,
"name":"France"
}
}
]
}
POST category/_search
{
"size":0,
"query": {
"match_all": {}
},
"aggs": {
"productsAggs": {
"nested": {
"path": "products"
},
"aggs": {
"productsFiltered": {
//1. First apply the filter(s) if needed
"filter": {
"term": {
"products.color": "red"
}
},
"aggs": {
//2. total number of found categories after filtering by color (or anything else)
"totalFoundCategories" : {
"reverse_nested": {}
},
//3. sum and avg price
"priceStats": {
"stats": {
"field": "products.price"
}
},
//3. sum and avg quantity
"quantityStats": {
"stats": {
"field": "products.quantity"
}
},
//3. sum and avg weight
"weightStats": {
"stats": {
"field": "products.weight"
}
},
//4. median price
"priceMedian": {
"percentiles": {
"field": "products.price",
"percents": [
50
]
}
},
//4. median quantity
"quantityMedian": {
"percentiles": {
"field": "products.quantity",
"percents": [
50
]
}
},
//4. median weight
"weightMedian": {
"percentiles": {
"field": "products.weight",
"percents": [
50
]
}
},
//5. TOP XX prices aggregated by brand
"byBrand": {
"terms": {
"field": "products.brand.id",
"size": 100
},
"aggs": {
"sumByBrand": {
"sum": {
"field": "products.price"
}
},
"price_brand_bucket_sort": {
"bucket_sort": {
"sort": [
{
"sumByBrand": {"order": "desc"}
}
],
"size":100
}
},
"name" : {
"top_hits": {
"size": 1,
"_source": ["products.brand.name"]
}
}
}
},
//5. TOP XX price aggregated by madeIn
"byMadeIn": {
"terms": {
"field": "products.madeIn.id"
},
"aggs": {
"sumByMadeIn": {
"sum": {
"field": "products.price"
}
},
"price_madeIn_bucket_sort": {
"bucket_sort": {
"sort": [
{
"sumByMadeIn": {"order": "desc"}
}
],
"size":100
}
},
"name" : {
"top_hits": {
"size": 1,
"_source": ["products.madeIn.name"]
}
}
}
}
//6. The agg missing : top category hits sorted by sum(price) of their red products
//We can't use terms to group by category because we can have 1-2M category items.
}
}
}
}
}
}
NOTE :
- In my example there are only few categories and products, in my environment, the main total number of documents is around 1.5M and sub documents around 2M.
- I am testing this query on Elastic 8.15.2