Get last document string value in each bucket

I have a reports index and I need to count the number of reports per-user that have a specific string (keyword) field value in their last document (sorted by time) in some time range.

I tried bucketing by user -> top (1) hits -> get max value -> filter by max value -> count buckets.
It's supposed to be something like this, but this only works if the field is numeric (the "max" aggregation doesn't work on string values).
Basically I need to extract the single string value from the top hit in order to filter the buckets.
How can this be achieved?

    POST /reports/_search
    {
         "size": 0,
         "query": {
             "bool": {
                 "must": {
                     "range": {
                         "timestamp": {
                             "gte": "2020-12-01T21:00:00.000Z",
                             "lte": "2020-12-31T22:41:35.092Z",
                             "format": "strict_date_optional_time"
                         }
                     }
                 }
             }
         },
         "aggs": {
             "distinct_users": {
                 "terms": {
                     "field": "userId",
                     "size": 10000
                 },
                 "aggs": {
                     "top_report": {   // get last document by date
                         "top_hits": {
                             "size": 1,
                             "sort": [
                                 {
                                     "timestamp": "desc"
                                 }
                             ],
                             "_source": "some_report_string_field"
                         }
                     },
                    "max_value": {  // get field value of last document
                        "max": {
                            "field": "some_report_string_field"
                        }
                    },
                    "value_selector": {  // select only users with a specific field value in their last document
                        "bucket_selector": {
                         "buckets_path": {
                             "max": "max_value.value"
                         },
                         "script": "params.max == 'some_value'"
                     }
                 }
             }
         },
         "users_count": { // count the filtered users
             "stats_bucket": {
                 "buckets_path": "distinct_users._count"
             }
         }
     }
}

Sounds like a “last known state” type of problem.
This is normally best solved by building an entity-centric index from your log index. This can be done using the transform api and requires some scripting to record the last known state for each entity.

1 Like

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.