Hello,
I would like to have your opinion on these aggregations. Currently, these can take up to 5 mins to run on my ES cluster (7.17.1) for ~ 2 years of data (~ 1 billion of documents / 100GB data spread over 2 data nodes).
So I would like to know if there are things to optimize on this side and only on this side at first
GET test/_search
{
"aggregations": {
"story_id_filters": {
"terms": {
"field": "story_id",
"size": 65536
}
},
"byCategory:no_category": {
"aggregations": {
"author": {
"aggregations": {
"total_cost": {
"sum": {
"field": "cost.usd"
}
}
},
"terms": {
"field": "author",
"size": 65536
}
},
"total_cost": {
"sum": {
"field": "cost.usd"
}
},
"usage_period_start": {
"aggregations": {
"total_cost": {
"sum": {
"field": "cost.usd"
}
}
},
"date_histogram": {
"calendar_interval": "day",
"field": "usage_period_start"
}
}
},
"filter": {
"bool": {
"must_not": {
"nested": {
"path": "tags",
"query": {
"regexp": {
"tags.key": {
"value": ".*cat.*"
}
}
}
}
}
}
}
},
"byCategory:category": {
"aggregations": {
"tag_keys": {
"aggregations": {
"tag_values": {
"aggregations": {
"reverse_aggr_cost": {
"aggregations": {
"author": {
"aggregations": {
"total_cost": {
"sum": {
"field": "cost.usd"
}
}
},
"terms": {
"field": "author",
"size": 65536
}
},
"total_cost": {
"sum": {
"field": "cost.usd"
}
},
"usage_period_start": {
"aggregations": {
"total_cost": {
"sum": {
"field": "cost.usd"
}
}
},
"date_histogram": {
"calendar_interval": "day",
"field": "usage_period_start"
}
}
},
"reverse_nested": {}
}
},
"terms": {
"field": "tags.value",
"size": 65536
}
}
},
"filter": {
"regexp": {
"tags.key": {
"value": ".*cat.*"
}
}
}
}
},
"nested": {
"path": "tags"
}
},
"byAuthor:author": {
"aggregations": {
"total_cost": {
"sum": {
"field": "cost.usd"
}
},
"usage_period_start": {
"aggregations": {
"total_cost": {
"sum": {
"field": "cost.usd"
}
}
},
"date_histogram": {
"calendar_interval": "day",
"field": "usage_period_start"
}
}
},
"terms": {
"field": "author",
"size": 65536
}
},
"category_filters": {
"aggregations": {
"tag_keys": {
"aggregations": {
"tag_values": {
"terms": {
"field": "tags.value",
"size": 65536
}
}
},
"filter": {
"regexp": {
"tags.key": {
"value": ".*cat.*"
}
}
}
}
},
"nested": {
"path": "tags"
}
},
"total_cost": {
"sum": {
"field": "cost.usd"
}
},
"usage_story_id_filters": {
"terms": {
"field": "usage_story_id",
"size": 65536
}
}
},
"query": {
"bool": {
"filter": [
{
"range": {
"usage_period_start": {
"from": "2020-10-01T00:00:00Z",
"to": "2022-10-05T00:00:00Z",
"include_lower": true,
"include_upper": false
}
}
},
{
"terms": {
"author": [
"toto"
]
}
},
{
"bool": {
"must_not": {
"bool": {
"filter": [
{
"terms": {
"author": [
"toto"
]
}
},
{
"terms": {
"record_type": [
"Tax",
"StoryTotal"
]
}
}
]
}
}
}
},
{
"bool": {
"must_not": {
"bool": {
"filter": [
{
"terms": {
"author": [
"tutu"
]
}
},
{
"terms": {
"record_type": [
"tax"
]
}
}
]
}
}
}
}
]
}
},
"size": 0
}
I think going through the multi-search API. But is there a real added value?
Note: I voluntarily changed the name of some fields/values
Thanks a lot for your help