Watch for child elements like router interfaces

Hi,
We are monitoring hundreds of network devices (routers and switches), and each device has a varying number of interfaces (ports). I am trying to set up a watch that shows which port on which network device is hitting the utilization threshold.
I am quite new to Elasticsearch and my knowledge is very limited, so some of my configuration might not make sense :slight_smile:
I know how to make a watch work for CPU, where the value is per device. But I can't make a watch work when I want the value per device and per child element of that device, such as a port.

My goal is that, after the transform, the actions produce output similar to this: testswitch.mydomain.net - GigabitEthernet0/1 - 632613414.4, when the threshold is, for example, 500.
I am running Elasticsearch version 6.x.
This is what my config looks like:

{
  "trigger": {
    "schedule": {
      "interval": "5m"
    }
  },
  "input": {
    "search": {
      "request": {
        "search_type": "query_then_fetch",
        "indices": [
          "<telegraf*{now/d}>"
        ],
        "types": [],
        "body": {
          "size": 0,
          "query": {
            "bool": {
              "filter": {
                "range": {
                  "@timestamp": {
                    "gte": "{{ctx.trigger.scheduled_time}}||-{{ctx.metadata.DBF_interval}}",
                    "lte": "{{ctx.trigger.scheduled_time}}",
                    "format": "strict_date_optional_time||epoch_millis"
                  }
                }
              },
              "must": {
                "query_string": {
                  "query": "_exists_:interface.ifHCInOctets",
                  "analyze_wildcard": true,
                  "default_field": "*"
                }
              }
            }
          },
          "aggs": {
            "NetworkDEV": {
              "terms": {
                "field": "tag.agent_host",
                "size": 3,
                "order": {
                  "_term": "desc"
                },
                "min_doc_count": 1
              },
              "aggs": {
                "INT_Name": {
                  "terms": {
                    "field": "tag.ifDescr",
                    "size": 3,
                    "order": {
                      "_term": "desc"
                    },
                    "min_doc_count": 1
                  },
                  "aggs": {
                    "metricAgg_AVG": {
                      "avg": {
                        "field": "interface.ifHCInOctets"
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  },
    "condition" : {
        "always" : {}
    },
  "actions": {
    "my-logging-action": {
      "logging": {
        "level": "warn",
        "text": "These devices hit threshhold: {{#ctx.payload.DBF_hit_devices}}{{key}}:{{NetworkDEV.value}}{{key}}:{{INT_Name.value}} ;-; {{/ctx.payload.DBF_hit_devices}} "
      }
    }
  },
"metadata": {
  "DBF_threshold": 500,
  "DBF_interval": "5m"
},
"transform": {
  "script": {
    "source": "return ['DBF_hit_devices' : ctx.payload.aggregations.NetworkDEV.buckets.stream().filter(s -> s.metricAgg_AVG.value >ctx.metadata.DBF_threshold).collect(Collectors.toList())]",
    "lang": "painless"
  }
}
}
  "result": {
    "execution_time": "2018-03-22T14:33:44.719Z",
    "execution_duration": 27,
    "input": {
      "type": "search",
      "status": "success",
      "payload": {
        "_shards": {
          "total": 20,
          "failed": 0,
          "successful": 20,
          "skipped": 0
        },
        "hits": {
          "hits": [],
          "total": 114824,
          "max_score": 0
        },
        "took": 22,
        "timed_out": false,
        "aggregations": {
          "NetworkDEV": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 114754,
            "buckets": [
              {
                "doc_count": 10,
                "INT_Name": {
                  "doc_count_error_upper_bound": 0,
                  "sum_other_doc_count": 0,
                  "buckets": [
                    {
                      "doc_count": 5,
                      "key": "lo",
                      "metricAgg_AVG": {
                        "value": 5466675.2
                      }
                    },
                    {
                      "doc_count": 5,
                      "key": "eth0",
                      "metricAgg_AVG": {
                        "value": 53697502412.8
                      }
                    }
                  ]
                },
                "key": "testserver.mydomain.net"
              },
              {
                "doc_count": 45,
                "INT_Name": {
                  "doc_count_error_upper_bound": 0,
                  "sum_other_doc_count": 30,
                  "buckets": [
                    {
                      "doc_count": 5,
                      "key": "Tunnel4",
                      "metricAgg_AVG": {
                        "value": 0
                      }
                    },
                    {
                      "doc_count": 5,
                      "key": "Tunnel3",
                      "metricAgg_AVG": {
                        "value": 0
                      }
                    },
                    {
                      "doc_count": 5,
                      "key": "Tunnel2",
                      "metricAgg_AVG": {
                        "value": 0
                      }
                    }
                  ]
                },
                "key": "testrouter.mydoman.net"
              },
              {
                "doc_count": 15,
                "INT_Name": {
                  "doc_count_error_upper_bound": 0,
                  "sum_other_doc_count": 0,
                  "buckets": [
                    {
                      "doc_count": 5,
                      "key": "Loopback0",
                      "metricAgg_AVG": {
                        "value": 0
                      }
                    },
                    {
                      "doc_count": 5,
                      "key": "GigabitEthernet0/1",
                      "metricAgg_AVG": {
                        "value": 632613414.4
                      }
                    },
                    {
                      "doc_count": 5,
                      "key": "GigabitEthernet0/0",
                      "metricAgg_AVG": {
                        "value": 408257222.4
                      }
                    }
                  ]
                },
                "key": "testswitch.mydomain.net"
              }
            ]
          }
        }
      },
      "search": {
        "request": {
          "search_type": "query_then_fetch",
          "indices": [
            "<telegraf*{now/d}>"
          ],
          "types": [],
          "body": {
            "size": 0,
            "query": {
              "bool": {
                "filter": {
                  "range": {
                    "@timestamp": {
                      "gte": "2018-03-22T14:33:44.719Z||-5m",
                      "lte": "2018-03-22T14:33:44.719Z",
                      "format": "strict_date_optional_time||epoch_millis"
                    }
                  }
                },
                "must": {
                  "query_string": {
                    "query": "_exists_:interface.ifHCInOctets",
                    "analyze_wildcard": true,
                    "default_field": "*"
                  }
                }
              }
            },
            "aggs": {
              "NetworkDEV": {
                "terms": {
                  "field": "tag.agent_host",
                  "size": 3,
                  "order": {
                    "_term": "desc"
                  },
                  "min_doc_count": 1
                },
                "aggs": {
                  "INT_Name": {
                    "terms": {
                      "field": "tag.ifDescr",
                      "size": 3,
                      "order": {
                        "_term": "desc"
                      },
                      "min_doc_count": 1
                    },
                    "aggs": {
                      "metricAgg_AVG": {
                        "avg": {
                          "field": "interface.ifHCInOctets"
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    },

Please take the time to properly format your messages. This is pretty much unreadable. You can use Markdown here.

The search does not return a single result, and thus your transform fails. You may want to check in your condition for a base number of hits before proceeding with the transform.

Also you may want to refine your query first, so it returns what you ask for (or you were lucky and no interfaces were overloaded).

--Alex

Continue from my previous post:   

 "condition": {
      "type": "always",
      "status": "success",
      "met": true
    },
    "transform": {
      "type": "script",
      "status": "failure",
      "reason": "runtime error",
      "error": {
        "root_cause": [
          {
            "type": "script_exception",
            "reason": "runtime error",
            "script_stack": [
              "s -> s.metricAgg_AVG.value >ctx.metadata.DBF_threshold).collect(Collectors.toList())]",
              "                    ^---- HERE"
            ],
            "script": "return ['DBF_hit_devices' : ctx.payload.aggregations.NetworkDEV.buckets.stream().filter(s -> s.metricAgg_AVG.value >ctx.metadata.DBF_threshold).collect(Collectors.toList())]",
            "lang": "painless"
          }
        ],
        "type": "script_exception",
        "reason": "runtime error",
        "script_stack": [
          "s -> s.metricAgg_AVG.value >ctx.metadata.DBF_threshold).collect(Collectors.toList())]",
          "                    ^---- HERE"
        ],
        "script": "return ['DBF_hit_devices' : ctx.payload.aggregations.NetworkDEV.buckets.stream().filter(s -> s.metricAgg_AVG.value >ctx.metadata.DBF_threshold).collect(Collectors.toList())]",
        "lang": "painless",
        "caused_by": {
          "type": "null_pointer_exception",
          "reason": null
        }
      }
    },
    "actions": []
      },
      "messages": [
    "failed to execute watch transform"
      ]
    }

Hi Alex,
apologies for the messy output. I have edited it; I hope it is more readable now.

But my search gives results, they are limited now to 3 devices and 3 ports:

   "aggregations": {
      "NetworkDEV": {
        "doc_count_error_upper_bound": 0,
        "sum_other_doc_count": 114754,
        "buckets": [
          {
            "doc_count": 10,
            "INT_Name": {
              "doc_count_error_upper_bound": 0,
              "sum_other_doc_count": 0,
              "buckets": [
                {
                  "doc_count": 5,
                  "key": "lo",
                  "metricAgg_AVG": {
                    "value": 5466675.2
                  }
                },
                {
                  "doc_count": 5,
                  "key": "eth0",
                  "metricAgg_AVG": {
                    "value": 53697502412.8
                  }
                }
              ]
            },
            "key": "testserver.mydomain.net"
          },
          {
            "doc_count": 45,
            "INT_Name": {
              "doc_count_error_upper_bound": 0,
              "sum_other_doc_count": 30,
              "buckets": [
                {
                  "doc_count": 5,
                  "key": "Tunnel4",
                  "metricAgg_AVG": {
                    "value": 0
                  }
                },
                {
                  "doc_count": 5,
                  "key": "Tunnel3",
                  "metricAgg_AVG": {
                    "value": 0
                  }
                },
                {
                  "doc_count": 5,
                  "key": "Tunnel2",
                  "metricAgg_AVG": {
                    "value": 0
                  }
                }
              ]
            },
            "key": "testrouter.mydoman.net"
          },
          {
            "doc_count": 15,
            "INT_Name": {
              "doc_count_error_upper_bound": 0,
              "sum_other_doc_count": 0,
              "buckets": [
                {
                  "doc_count": 5,
                  "key": "Loopback0",
                  "metricAgg_AVG": {
                    "value": 0
                  }
                },
                {
                  "doc_count": 5,
                  "key": "GigabitEthernet0/1",
                  "metricAgg_AVG": {
                    "value": 632613414.4
                  }
                },
                {
                  "doc_count": 5,
                  "key": "GigabitEthernet0/0",
                  "metricAgg_AVG": {
                    "value": 408257222.4
                  }
                }
              ]
            },
            "key": "testswitch.mydomain.net"
          }
        ]
      }

My condition is:

"condition" : {
    "always" : {}

Because all results are returned either way, and I still need a transform to output only the objects that hit the threshold.
Regarding the query, it returns exactly what I want; I use basically the same query in Grafana and it works fine.

See this snippet of yours.

ctx.payload.aggregations.NetworkDEV.buckets.stream().filter(s -> s.metricAgg_AVG.value > ctx.metadata.DBF_threshold)

This does not correlate with the JSON, because the metricAgg is inside of each nested bucket in the next buckets array (hope this makes sense).

Like ctx.payload.aggregations.NetworkDEV.buckets.INT_Name.buckets[IDX HERE].metricAgg_AVG.value

Yes, that makes sense. Actually, I have tried that before, but the Painless script does not seem to recognize INT_Name as a "sub-bucket":

"transform": {
  "type": "script",
  "status": "failure",
  "reason": "runtime error",
  "error": {
    "root_cause": [
      {
        "type": "script_exception",
        "reason": "runtime error",
        "script_stack": [
          "return ['DBF_hit_devices' : ctx.payload.aggregations.NetworkDEV.buckets.INT_Name.buckets.stream().filter(",
          "                                                                       ^---- HERE"
        ],
        "script": "return ['DBF_hit_devices' : ctx.payload.aggregations.NetworkDEV.buckets.INT_Name.buckets.stream().filter(s -> s.metricAgg_AVG.value >ctx.metadata.DBF_threshold).collect(Collectors.toList())]",
        "lang": "painless"
      }
    ],
    "type": "script_exception",
    "reason": "runtime error",
    "script_stack": [
      "return ['DBF_hit_devices' : ctx.payload.aggregations.NetworkDEV.buckets.INT_Name.buckets.stream().filter(",
      "                                                                       ^---- HERE"
    ],
    "script": "return ['DBF_hit_devices' : ctx.payload.aggregations.NetworkDEV.buckets.INT_Name.buckets.stream().filter(s -> s.metricAgg_AVG.value >ctx.metadata.DBF_threshold).collect(Collectors.toList())]",
    "lang": "painless",
    "caused_by": {
      "type": "illegal_argument_exception",
      "reason": "Illegal list shortcut value [INT_Name]."
    }
  }
},

It does not, because buckets is a list and thus does not have a child element named INT_Name, but each element within buckets does... so you need to loop through these buckets as well.

I am not sure I understand you correctly. Do you mean I should not treat NetworkDEV as buckets, and should somehow access INT_Name in a different way?
Please forgive my ignorance, but I am very new to your product, and there is very little documentation; the examples only cover very simple cases.

You need to do two filter operations: one over ctx.payload.aggregations.NetworkDEV.buckets and one over the INT_Name.buckets inside each of those buckets.

I have tried several approaches, but no luck. It's difficult to make a script work when you are not a programmer and are trying to reuse examples :slight_smile:
I was able to print out the device names and the whole payload, but I cannot find a way to parse it correctly so that I can connect device names, interface names and metric values.
You are suggesting writing a loop, but again, I do not know how to do that without an example or documentation. I have no knowledge of Java or Groovy, so Painless scripting is pretty painful :slight_smile:

I can see in other threads that I am not the only one having a hard time making a transform or condition work with child elements using scripting. And no one seems to have managed to make it work.

I think it would be a good idea to have a documented example of watch configuration for child elements, where action provides master and child names(terms/keys) and value of child metric which hit the threshold.

Hey,

you can check out our examples repo, which contains a fair share of watches parsing multiple levels of buckets doing exactly this.

It is not easy to derive the exact requirement for what should happen with those multiple levels of buckets when the same terms are in different buckets - should they be summed up, should the max be used, etc.? This makes it super hard to come up with the one and only example covering everyone's use case.

--Alex

Hi Alexander, I'm working with Aurimas on the same case. Do I understand correctly that I can do two filter operations like this?

"transform": {
    "script": {
      "source": "return ['DBF_hit_devices' : ctx.payload.aggregations.NetworkDEV.buckets.stream().filter(s -> s.INT_Name.buckets.stream()).filter(s -> s.metricAgg_AVG.value > ctx.metadata.DBF_threshold).collect(Collectors.toList())]",
      "lang": "painless"
    }
  }

With that one I get:

   "transform": {
  "type": "script",
  "status": "failure",
  "reason": "runtime error",
  "error": {
    "root_cause": [
      {
        "type": "script_exception",
        "reason": "runtime error",
        "script_stack": [
          "java.util.stream.ReferencePipeline$2$1.accept(ReferencePipeline.java:174)",
          "java.util.ArrayList$ArrayListSpliterator.forEachRemaining(ArrayList.java:1382)",
          "java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:481)",
          "java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:471)",
          "java.util.stream.ReduceOps$ReduceOp.evaluateSequential(ReduceOps.java:708)",
          "java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234)",
          "java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:499)",
          "s -> s.metricAgg_AVG.value > ctx.metadata.DBF_threshold).collect(Collectors.toList())]",
          "                                                                           ^---- HERE"
        ],
        "script": "return ['DBF_hit_devices' : ctx.payload.aggregations.NetworkDEV.buckets.stream().filter(s -> s.INT_Name.buckets.stream()).filter(s -> s.metricAgg_AVG.value > ctx.metadata.DBF_threshold).collect(Collectors.toList())]",
        "lang": "painless"
      }
    ],
    "type": "script_exception",
    "reason": "runtime error",
    "script_stack": [
      "java.util.stream.ReferencePipeline$2$1.accept(ReferencePipeline.java:174)",
      "java.util.ArrayList$ArrayListSpliterator.forEachRemaining(ArrayList.java:1382)",
      "java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:481)",
      "java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:471)",
      "java.util.stream.ReduceOps$ReduceOp.evaluateSequential(ReduceOps.java:708)",
      "java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234)",
      "java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:499)",
      "s -> s.metricAgg_AVG.value > ctx.metadata.DBF_threshold).collect(Collectors.toList())]",
      "                                                                           ^---- HERE"
    ],
    "script": "return ['DBF_hit_devices' : ctx.payload.aggregations.NetworkDEV.buckets.stream().filter(s -> s.INT_Name.buckets.stream()).filter(s -> s.metricAgg_AVG.value > ctx.metadata.DBF_threshold).collect(Collectors.toList())]",
    "lang": "painless",
    "caused_by": {
      "type": "class_cast_exception",
      "reason": "java.util.stream.ReferencePipeline$Head cannot be cast to java.lang.Number"
    }
  }
},
"actions": []
  },
  "messages": [
"failed to execute watch transform"
  ]
}

This is a transform that creates the required list. If you have a hard time handling streams, you could also just fall back to use for loops and helper structures like the list below.

Note that a transform is executed after a condition, so you still need to have a valid condition.

    "transform": {
      "script": "def entries = []; ctx.payload.aggregations.NetworkDEV.buckets.forEach(b -> { b.INT_Name.buckets.forEach(b2 -> { entries.add(b.key + '/' + b2.key + ': ' + b2.metricAgg_AVG.value) } ) } ) ; return entries;"
    },

Thanks, this helped a lot! Now I can see how this looping works. Thank you for your time.

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.