Watcher - Error! Execution failing intermittently

Hey, I've set up some watchers to query our application logs for error events and they seem to be working fine when testing and running them in Dev Tools. They run successfully for periods of time when running as watchers, and then at certain times of the day (seemingly random times) I start to see the State: "Error!" Comment: "Execution failing".

Any ideas why this is happening? Here is what the failed watcher looks like:

{
  "watch_id": "GS1_BU-warnings",
  "node": "uNQlvW8jTQS54Nr2GNM_hw",
  "state": "execution_not_needed",
  "user": "redacted",
  "status": {
    "state": {
      "active": true,
      "timestamp": "2019-09-25T14:42:15.524Z"
    },
    "last_checked": "2019-09-27T13:40:42.506Z",
    "last_met_condition": "2019-09-27T12:50:42.614Z",
    "actions": {
      "snow_webhook": {
        "ack": {
          "timestamp": "2019-09-27T11:55:42.347Z",
          "state": "awaits_successful_execution"
        },
        "last_execution": {
          "timestamp": "2019-09-27T12:50:42.614Z",
          "successful": false,
          "reason": ""
        },
        "last_successful_execution": {
          "timestamp": "2019-09-27T11:50:42.464Z",
          "successful": true
        }
      }
    },
    "execution_state": "execution_not_needed",
    "version": -1
  },
  "trigger_event": {
    "type": "schedule",
    "triggered_time": "2019-09-27T13:40:42.506Z",
    "schedule": {
      "scheduled_time": "2019-09-27T13:40:42.154Z"
    }
  },
  "input": {
    "search": {
      "request": {
        "search_type": "query_then_fetch",
        "indices": [
          "test-qa-b2b_gs1"
        ],
        "rest_total_hits_as_int": true,
        "body": {
          "query": {
            "bool": {
              "must": [
                {
                  "wildcard": {
                    "json.logging_error_code.keyword": "GS1_BU*"
                  }
                },
                {
                  "match": {
                    "json.loglevel": "WARNING"
                  }
                }
              ],
              "filter": {
                "range": {
                  "@timestamp": {
                    "from": "{{ctx.trigger.scheduled_time}}||-5m",
                    "to": "{{ctx.trigger.triggered_time}}"
                  }
                }
              }
            }
          }
        }
      }
    }
  },
  "condition": {
    "compare": {
      "ctx.payload.hits.total": {
        "gt": 0
      }
    }
  },
  "metadata": {
    "name": "GS1_BU-warnings",
    "xpack": {
      "type": "json"
    }
  },
  "result": {
    "execution_time": "2019-09-27T13:40:42.506Z",
    "execution_duration": 7,
    "input": {
      "type": "search",
      "status": "success",
      "payload": {
        "_shards": {
          "total": 3,
          "failed": 0,
          "successful": 3,
          "skipped": 0
        },
        "hits": {
          "hits": [],
          "total": 0,
          "max_score": null
        },
        "took": 7,
        "timed_out": false
      },
      "search": {
        "request": {
          "search_type": "query_then_fetch",
          "indices": [
            "test-qa-b2b_gs1"
          ],
          "rest_total_hits_as_int": true,
          "body": {
            "query": {
              "bool": {
                "must": [
                  {
                    "wildcard": {
                      "json.logging_error_code.keyword": "GS1_BU*"
                    }
                  },
                  {
                    "match": {
                      "json.loglevel": "WARNING"
                    }
                  }
                ],
                "filter": {
                  "range": {
                    "@timestamp": {
                      "from": "2019-09-27T13:40:42.154Z||-5m",
                      "to": "2019-09-27T13:40:42.506Z"
                    }
                  }
                }
              }
            }
          }
        }
      }
    },
    "condition": {
      "type": "compare",
      "status": "success",
      "met": false,
      "compare": {
        "resolved_values": {
          "ctx.payload.hits.total": 0
        }
      }
    },
    "actions": []
  },
  "messages": []
}

The output is exactly the same for a successful run, except this snippet:

failed attempt:

    "last_execution": {
      "timestamp": "2019-09-27T12:50:42.614Z",
      "successful": false,
      "reason": ""
    },

successful:

    "last_execution": {
      "timestamp": "2019-09-26T12:55:42.636Z",
      "successful": true
    },

Edit: wanted to add that the webhook action is working as expected and I have validated this on the receiving end.

Hey, I was able to find the solution to this. In the Elasticsearch logs, I found the following error:

{
   "type":"server",
   "timestamp":"2019-09-27T09:50:14,423+0000",
   "level":"DEBUG",
   "component":"o.e.a.b.TransportShardBulkAction",
   "cluster.name":"es-cluster",
   "node.name":"es-data-0",
   "cluster.uuid":"wgJD7vl4RPmbLuQ3oNL2HQ",
   "node.id":"oi3fQbNiSZmaoe5sBrsA-g",
   "message":"[test-qa-b2b_gs1][2] failed to execute bulk item (index) index {[test-qa-b2b_gs1][_doc][mewgcm0BUH6gG7PZyfN6], source[n/a, actual length: [4.6kb], max length: 2kb]}",
   "stacktrace":[
      "java.lang.IllegalArgumentException: Limit of total fields [1000] in index [test-qa-b2b_gs1] has been exceeded",
      "at org.elasticsearch.index.mapper.MapperService.checkTotalFieldsLimit(MapperService.java:602) ~[elasticsearch-7.0.1.jar:7.0.1]",
      "at org.elasticsearch.index.mapper.MapperService.internalMerge(MapperService.java:506) ~[elasticsearch-7.0.1.jar:7.0.1]",
      "at org.elasticsearch.index.mapper.MapperService.internalMerge(MapperService.java:398) ~[elasticsearch-7.0.1.jar:7.0.1]",
      "at org.elasticsearch.index.mapper.MapperService.merge(MapperService.java:331) ~[elasticsearch-7.0.1.jar:7.0.1]",
      "at org.elasticsearch.cluster.metadata.MetaDataMappingService$PutMappingExecutor.applyRequest(MetaDataMappingService.java:315) ~[elasticsearch-7.0.1.jar:7.0.1]",
      "at org.elasticsearch.cluster.metadata.MetaDataMappingService$PutMappingExecutor.execute(MetaDataMappingService.java:238) ~[elasticsearch-7.0.1.jar:7.0.1]",
      "at org.elasticsearch.cluster.service.MasterService.executeTasks(MasterService.java:687) ~[elasticsearch-7.0.1.jar:7.0.1]",
      "at org.elasticsearch.cluster.service.MasterService.calculateTaskOutputs(MasterService.java:310) ~[elasticsearch-7.0.1.jar:7.0.1]",
      "at org.elasticsearch.cluster.service.MasterService.runTasks(MasterService.java:210) ~[elasticsearch-7.0.1.jar:7.0.1]",
      "at org.elasticsearch.cluster.service.MasterService$Batcher.run(MasterService.java:142) ~[elasticsearch-7.0.1.jar:7.0.1]",
      "at org.elasticsearch.cluster.service.TaskBatcher.runIfNotProcessed(TaskBatcher.java:150) ~[elasticsearch-7.0.1.jar:7.0.1]",
      "at org.elasticsearch.cluster.service.TaskBatcher$BatchedTask.run(TaskBatcher.java:188) ~[elasticsearch-7.0.1.jar:7.0.1]",
      "at org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingRunnable.run(ThreadContext.java:681) ~[elasticsearch-7.0.1.jar:7.0.1]",
      "at org.elasticsearch.common.util.concurrent.PrioritizedEsThreadPoolExecutor$TieBreakingPrioritizedRunnable.runAndClean(PrioritizedEsThreadPoolExecutor.java:252) ~[elasticsearch-7.0.1.jar:7.0.1]",
      "at org.elasticsearch.common.util.concurrent.PrioritizedEsThreadPoolExecutor$TieBreakingPrioritizedRunnable.run(PrioritizedEsThreadPoolExecutor.java:215) ~[elasticsearch-7.0.1.jar:7.0.1]",
      "at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) [?:?]",
      "at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) [?:?]",
      "at java.lang.Thread.run(Thread.java:835) [?:?]"
   ]
}

This was resolved by increasing the total fields limit for this index:

PUT test-qa-b2b_gs1/_settings
{
   "index.mapping.total_fields.limit": 2000
}

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.