Difference between two aggregations

Hi all

I am looking for some help with a watcher that I am trying to run. Basically I am trying to run a search query that will return a value if there is a difference between two aggregations.

Below is the script I am running in dev tools to compare the cardinality of number of documents in 15 min intervals to the cumulative cardinality of number of documents over a 1 hour period. I only want the script to output for differences between the two values and at the moment it is outputting regardless of whether there is a difference of not;

</>
{
"script_fields": {
"bucket_script": {
"script": {
"lang": "painless",
"source": """

int i = params._source.aggregations.book_name.buckets[0].pull_tag.buckets[0].per_15m.buckets[0].total_docs.value;
int j = params._source.aggregations.book_name.buckets[0].pull_tag.buckets[0].per_15m.buckets[0].card.value;

if ( i != j )
{
return params._source.aggregations.host_name.buckets[0].key
}

"""
}
}
}
}
</>

Maybe you need bucket selector aggregation?

Hi Tomo, thanks for your suggestion. I tried the bucket selector aggregation but the issue I am running into now is that I'm trying to filter out all sub buckets that are empty and list the buckets that aren't empty but the "parent bucket" always returns a value (see below). Do you know if it is possible to filter out the "parent bucket" if the sub buckets are empty?
</>
{
"query": {
"bool": {
"filter": [
{
"match_phrase": {
"event.dataset": "system.filesystem"
}
},
{
"range": {
"@timestamp": {
"gte": "now-24h",
"lte": "now",
"format": "strict_date_optional_time||epoch_millis"
}
}
}
],
"should": ,
"must_not":
}
},
"aggs": {
"host_name": {
"terms": {
"field": "host.name",
"size": 100
},
"aggs": {
"per_15m": {
"date_histogram": {
"field": "@timestamp",
"fixed_interval": "15m"
},
"aggs": {
"mount_point": {
"cardinality": {
"field": "system.filesystem.mount_point"
}
},
"card": {
"cumulative_cardinality": {
"buckets_path": "mount_point"
}
},
"mount_point_bucket_filter": {
"bucket_selector": {
"buckets_path": {
"Mounts": "mount_point",
"Cards": "card"
},
"script": "params.Mounts != params.Cards"
}
}
}
}
}
}
}
}

</>
Response;
</>
"aggregations" : {
"host_name" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Win10",
"doc_count" : 1008,
"per_15m" : {
"buckets" :
}
}
]
}
}
</>

I'm not sure, but how about using bucket selector aggregation also on parent aggregation.

Please use ``` instead of </> (or possibly click the </> button) and proper indent to help reading. Thanks.

Hi Tomo

I will make sure to do that going forward. I have tried a bucket selector on the parent aggregation but because it's not allowed as the terms aggregation is already on it.

Could you share sample data and sample script which I can copy and paste to Dev Tools to reproduce the problem? I would give it a try.

Hi Tomo

I have modified the sample kibana data to show you want I want to do

GET kibana_sample_data_logs/_search
{
  "query": {
    "bool": {
      "filter": [
        {
          "match_phrase": {
            "event.dataset": "sample_web_logs"
          }
        },
        {
          "range": {
            "@timestamp": {
              "gte": "2021-09-12T03:00:00.000Z",
              "lte": "2021-09-12T04:00:00.000Z",
              "format": "strict_date_optional_time||epoch_millis"
            }
          }
        }
      ],
      "should": [],
      "must_not": []
    }
  },
  "aggs": {
    "geo_dest": {
      "terms": {
        "field": "geo.dest",
        "size": 2
      },
      "aggs": {
        "per_15m": {
          "date_histogram": {
            "field": "@timestamp",
            "fixed_interval": "15m"
          },
          "aggs": {
            "referer_count": {
              "cardinality": {
                "field": "referer"
              }
            },
            "card": {
              "cumulative_cardinality": {
                "buckets_path": "referer_count"
              }
            },
            "referer_count_bucket_filter": {
              "bucket_selector": {
                "buckets_path": {
                  "Referers": "referer_count",
                  "Cards": "card"
                },
                "script": "params.Referers != params.Cards"
              }
            }
          }
        }
      }
    }
  }
}

So in the above I would like to omit geo.dest "CN" from the results because the cumulative cardinality and the cardinality of the referer for CN is equal. But you will see in the result, it still returns the bucket as it's parent bucket contains a value of CN.

Thanks for coming back to me on this and your help to date!

GET kibana_sample_data_logs/_search?filter_path=aggregations
{
  "size":0,
  "query": {
    "bool": {
      "filter": [
        {
          "match_phrase": {
            "event.dataset": "sample_web_logs"
          }
        },
        {
          "range": {
            "@timestamp": {
              "gte": "2022-01-30T03:00:00.000Z",
              "lte": "2022-01-30T04:00:00.000Z",
              "format": "strict_date_optional_time||epoch_millis"
            }
          }
        }
      ],
      "should": [],
      "must_not": []
    }
  },
  "aggs": {
    "geo_dest": {
      "terms": {
        "field": "geo.dest",
        "size": 2
      },
      "aggs": {
        "per_15m": {
          "date_histogram": {
            "field": "@timestamp",
            "fixed_interval": "15m"
          },
          "aggs": {
            "referer_count": {
              "cardinality": {
                "field": "referer"
              }
            },
            "card": {
              "cumulative_cardinality": {
                "buckets_path": "referer_count"
              }
            },
            "referer_count_bucket_filter": {
              "bucket_selector": {
                "buckets_path": {
                  "Referers": "referer_count",
                  "Cards": "card"
                },
                "script": "false"
              }
            }
          }
        },
        "bucket_existing_bucket_filter":{
          "bucket_selector": {
            "buckets_path": {
              "count":"per_15m._bucket_count"
            },
            "script": "params.count>0"
          }
        }
      }
    }
  }
}
{
  "aggregations" : {
    "geo_dest" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 6,
      "buckets" : [ ]
    }
  }
}

if you set referer_count_bucket_filter to true:

GET kibana_sample_data_logs/_search?filter_path=aggregations
{
  "size":0,
  "query": {
    "bool": {
      "filter": [
        {
          "match_phrase": {
            "event.dataset": "sample_web_logs"
          }
        },
        {
          "range": {
            "@timestamp": {
              "gte": "2022-01-30T03:00:00.000Z",
              "lte": "2022-01-30T04:00:00.000Z",
              "format": "strict_date_optional_time||epoch_millis"
            }
          }
        }
      ],
      "should": [],
      "must_not": []
    }
  },
  "aggs": {
    "geo_dest": {
      "terms": {
        "field": "geo.dest",
        "size": 2
      },
      "aggs": {
        "per_15m": {
          "date_histogram": {
            "field": "@timestamp",
            "fixed_interval": "15m"
          },
          "aggs": {
            "referer_count": {
              "cardinality": {
                "field": "referer"
              }
            },
            "card": {
              "cumulative_cardinality": {
                "buckets_path": "referer_count"
              }
            },
            "referer_count_bucket_filter": {
              "bucket_selector": {
                "buckets_path": {
                  "Referers": "referer_count",
                  "Cards": "card"
                },
                "script": "true"
              }
            }
          }
        },
        "bucket_existing_bucket_filter":{
          "bucket_selector": {
            "buckets_path": {
              "count":"per_15m._bucket_count"
            },
            "script": "params.count>0"
          }
        }
      }
    }
  }
}
{
  "aggregations" : {
    "geo_dest" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 6,
      "buckets" : [
        {
          "key" : "IN",
          "doc_count" : 4,
          "per_15m" : {
            "buckets" : [
              {
                "key_as_string" : "2022-01-30T03:15:00.000Z",
                "key" : 1643512500000,
                "doc_count" : 2,
                "referer_count" : {
                  "value" : 1
                },
                "card" : {
                  "value" : 1
                }
              },
              {
                "key_as_string" : "2022-01-30T03:30:00.000Z",
                "key" : 1643513400000,
                "doc_count" : 2,
                "referer_count" : {
                  "value" : 1
                },
                "card" : {
                  "value" : 2
                }
              }
            ]
          }
        },
        {
          "key" : "CN",
          "doc_count" : 2,
          "per_15m" : {
            "buckets" : [
              {
                "key_as_string" : "2022-01-30T03:45:00.000Z",
                "key" : 1643514300000,
                "doc_count" : 2,
                "referer_count" : {
                  "value" : 1
                },
                "card" : {
                  "value" : 1
                }
              }
            ]
          }
        }
      ]
    }
  }
}

With this bucket_existing_bucket_filter, you can select geo_dest buckets only containing any buckets.

Thank you Tomo_M, that has fixed my issue!!! I appreciate the assistance very very much!!

1 Like

Hi all

I have a new query following on from the above. When the aggregations meet my conditions specified, I want to pull out each referer and use it in my watcher action. I have added terms to pull out the referer in the aggregation in the below and I can pull each referer individually with the following 'referer': e.per_15m.buckets[0].referer_names.buckets.0.key within the actions of the watcher. But how do I pull ALL referers without having to repeat the above i.e. 'referer1': e.per_15m.buckets[0].referer_names.buckets.0.key, 'referer2': e.per_15m.buckets[0].referer_names.buckets.1.key???

I saw the attached link and have tried that but it doesn't seem to be working for me.

{
  "trigger": {
    "schedule": {
      "hourly": {
        "minute": [
          0
        ]
      }
    }
  },
  "input": {
    "search": {
      "request": {
        "search_type": "query_then_fetch",
        "indices": [
          "kibana_sample_data_logs"
        ],
        "rest_total_hits_as_int": true,
        "body": {
          "size": 0,
          "query": {
            "bool": {
              "filter": [
                {
                  "match_phrase": {
                    "event.dataset": "sample_web_logs"
                  }
                },
                {
                  "range": {
                    "@timestamp": {
                      "gte": "2021-09-12T05:00:00.000Z",
                      "lte": "2021-09-12T09:00:00.000Z",
                      "format": "strict_date_optional_time||epoch_millis"
                    }
                  }
                }
              ],
              "should": [],
              "must_not": []
            }
          },
          "aggs": {
            "geo_dest": {
              "terms": {
                "field": "geo.dest",
                "size": 100
              },
              "aggs": {
                "per_15m": {
                  "date_histogram": {
                    "field": "@timestamp",
                    "fixed_interval": "15m"
                  },
                  "aggs": {
                    "referer_count": {
                      "cardinality": {
                        "field": "referer"
                      }
                    },
                    "card": {
                      "cumulative_cardinality": {
                        "buckets_path": "referer_count"
                      }
                    },
                    "referer_count_bucket_filter": {
                      "bucket_selector": {
                        "buckets_path": {
                          "Referers": "referer_count",
                          "Cards": "card"
                        },
                        "script": "params.Referers > 1"
                      }
                    },
                    "referer_names": {
                      "terms": {
                        "field": "referer",
                        "size": 100
                      }
                    }
                  }
                },
                "bucket_existing_bucket_filter": {
                  "bucket_selector": {
                    "buckets_path": {
                      "count": "per_15m._bucket_count"
                    },
                    "script": "params.count>0"
                  }
                }
              }
            }
          }
        }
      }
    }
  },
  "condition": {
    "script": {
      "source": "return ctx.payload.aggregations.geo_dest.buckets.stream().count() > 0",
      "lang": "painless"
    }
  },
  "actions": {
    "index_payload": {
      "transform": {
        "script": {
          "source": """
            // Define metadata to bring into alert
            def triggered_time = ctx.trigger.triggered_time; 
            def severity =  ctx.metadata.severity; 
            def drilldown =  ctx.metadata.drilldown; 
            def alert_name = ctx.metadata.name;
            
          // Document Structure To Output
            return ['_doc':ctx.payload.aggregations.geo_dest.buckets.stream().map(e -> { return['@timestamp':triggered_time,'UUID': java.util.UUID.randomUUID().toString(), 'geo.dest':e.key,'alert.name':alert_name,'alert.severity':severity,'referer': e.per_15m.buckets[0].referer_names.buckets.0.key,'alert.drilldown': drilldown]}).collect(Collectors.toList())];
           """,
          "lang": "painless"
        }
      },
      "index": {
        "index": "alerts"
      }
    }
  },
  "metadata": {
    "severity": "S1",
    "drilldown": "add later"
  }
}

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.