Hi all,
We use ES v7.9.0 in Elastic Cloud.
Each cluster forwards monitoring data to the dedicated monitoring deployment.
We've added a few watches to alert on cluster health issues, e.g. cluster status not green or heap pressure high. However, they never fire, even when the logs and the underlying metric values show that they should. I might be missing something obvious here. Could someone point me in the right direction?
Used this as an example for Watcher:
Here's my cluster health watch:
{
  "trigger": {
    "schedule": {
      "interval": "60s"
    }
  },
  "input": {
    "search": {
      "request": {
        "search_type": "query_then_fetch",
        "indices": [
          ".monitoring-es-*"
        ],
        "rest_total_hits_as_int": true,
        "body": {
          "query": {
            "bool": {
              "filter": {
                "bool": {
                  "must": [
                    {
                      "range": {
                        "timestamp": {
                          "gte": "now-{{ctx.metadata.not_green_secs}}s"
                        }
                      }
                    },
                    {
                      "term": {
                        "type": "cluster_stats"
                      }
                    }
                  ]
                }
              }
            }
          },
          "aggs": {
            "clusters": {
              "terms": {
                "field": "cluster_uuid",
                "size": 100
              },
              "aggs": {
                "cluster_state": {
                  "filters": {
                    "filters": {
                      "yellow": {
                        "term": {
                          "cluster_state.status": "yellow"
                        }
                      },
                      "red": {
                        "term": {
                          "cluster_state.status": "red"
                        }
                      }
                    }
                  }
                },
                "latest_state": {
                  "top_hits": {
                    "size": 1,
                    "sort": [
                      {
                        "timestamp": {
                          "order": "desc"
                        }
                      }
                    ]
                  }
                }
              }
            }
          },
          "size": 0
        }
      }
    }
  },
  "condition": {
    "script": {
      "source": "def is_not_green=ctx.payload.aggregations.clusters.buckets.stream().anyMatch(t -> (t.latest_state.hits.hits[0]._source.cluster_state.status== 'yellow' || t.latest_state.hits.hits[0]._source.cluster_state.status == 'red')); if (is_not_green) { def required_periods = (ctx.metadata.not_green_secs-ctx.metadata.monitoring_update_interval)/ctx.metadata.monitoring_update_interval; return ctx.payload.aggregations.clusters.buckets.stream().anyMatch(t -> ((t.cluster_state.buckets.red.doc_count + t.cluster_state.buckets.yellow.doc_count) >= required_periods ));} return false;",
      "lang": "painless"
    }
  },
  "actions": {
    "log": {
      "logging": {
        "level": "info",
        "text": "Clusters that have NOT been green for more than {{ctx.metadata.not_green_secs}}s: {{#ctx.payload._value}}{{cluster_id}}-{{cluster_state}}{{/ctx.payload._value}}:"
      }
    },
    "opsgenie": {
      "webhook": {
        "scheme": "https",
        "host": "api.opsgenie.com",
        "port": 443,
        "method": "post",
        "path": "/v1/json/eswatcher",
        "params": {
          "apiKey": "XXXXX"
        },
        "headers": {
          "Content-Type": "application/json"
        },
        "body": "{{#toJson}}ctx{{/toJson}}"
      }
    }
  },
  "metadata": {
    "monitoring_update_interval": 10,
    "not_green_secs": 60
  },
  "transform": {
    "script": {
      "source": "def required_periods = (ctx.metadata.not_green_secs-ctx.metadata.monitoring_update_interval)/ctx.metadata.monitoring_update_interval; return ctx.payload.aggregations.clusters.buckets.stream().filter(t -> (t.latest_state.hits.hits[0]._source.cluster_state.status == 'yellow' || t.latest_state.hits.hits[0]._source.cluster_state.status == 'red')).filter(t -> (t.cluster_state.buckets.red.doc_count + t.cluster_state.buckets.yellow.doc_count) >= required_periods).map(t -> ['cluster_id':t.key,'cluster_state':t.latest_state.hits.hits[0]._source.cluster_state.status]).collect(Collectors.toList());",
      "lang": "painless"
    }
  }
}