No anomalies detected after the first zero-detection event

We have created an anomaly detection job using Elastic machine learning. In this particular case, one of our devices that performs detections failed because of an electrical issue and stopped producing detections. Previously, for cases like these, we used to get an anomaly (and an anomaly score), but we no longer do. The anomaly detection job just stops detecting anything further (screenshot attached for your reference). It has been happening for the past few days now, on multiple of our devices.

It would be highly appreciated if someone could help me understand this. Thanks!

What is the detector configuration? This is relevant because some functions (like mean, max, etc.) treat no data as null, while other functions (like count) treat no data as 0. If you have the former, then a period of no data will simply be ignored.
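For example, here is a minimal sketch of the two kinds of detector inside an analysis_config (the metric field name some_metric is only a placeholder, not a field from your job):

"analysis_config": {
  "bucket_span": "15m",
  "detectors": [
    { "function": "count", "partition_field_name": "hostname.keyword" },
    { "function": "mean", "field_name": "some_metric", "partition_field_name": "hostname.keyword" }
  ]
}

With the count detector, an empty bucket is modelled as a count of 0, whereas the mean detector simply skips buckets that contain no data.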

Thanks for the reply,

It is count. Before 07.04, the same job used to handle cases like these perfectly, but all of a sudden it started treating them as no data.

As a follow-up to this problem:

I just noticed that something happened to our system on 07.04 at 9 am: for all of our devices, no detections were made until 11:30 am. After 11:30 the devices seemed to work fine, but no anomalies or model snapshots have been produced since then (up to now). Screenshot attached for a better understanding; I hope this information is of some help.
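One way to confirm whether documents actually stopped arriving during that window is a date_histogram over the source index. A sketch (the index pattern and timestamp field are taken from the job configuration shared further down; the dates assume 07.04 means 2022-04-07, and min_doc_count: 0 keeps empty buckets visible in the response):

GET device_logs*/_search
{
  "size": 0,
  "query": {
    "range": {
      "@timestamp": { "gte": "2022-04-07T08:00:00", "lte": "2022-04-07T13:00:00" }
    }
  },
  "aggs": {
    "docs_per_bucket": {
      "date_histogram": {
        "field": "@timestamp",
        "fixed_interval": "15m",
        "min_doc_count": 0
      }
    }
  }
}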

Thanks for the clarification that this is count (and I assume it is not high_count). I think it is also notable that you have 9 system annotations on the data before the moment of interest. Do you mind sharing the contents of those annotations?

One is the one shown in the screenshot (a "missing data" annotation); the others are stored model snapshots.

Certainly a "missing data" annotation is not a good one to get. Do you know the cause of this disruption?

In the meantime, I'm also interested in the version you are using and the JSON of your job configuration.
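Both can be retrieved directly from the cluster, for example in Dev Tools syntax (the job ID used here is the one that appears in the configuration shared in the next reply):

# Elasticsearch version
GET /

# Job configuration and runtime counts/stats
GET _ml/anomaly_detectors/anomaly_detection_device_logs
GET _ml/anomaly_detectors/anomaly_detection_device_logs/_stats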

Unfortunately, we have not been able to find the cause of the disruption yet.

The version is 7.10.1 and the JSON is below:

{
  "job_id": "anomaly_detection_device_logs",
  "job_type": "anomaly_detector",
  "job_version": "7.10.1",
  "create_time": 1623749963638,
  "finished_time": 1649683638683,
  "model_snapshot_id": "1649683574",
  "custom_settings": {
    "custom_urls": [ ]
  },
  "description": "",
  "analysis_config": {
    "bucket_span": "15m",
    "detectors": [
      {
        "detector_description": "count partitionfield=\"hostname.keyword\"",
        "function": "count",
        "partition_field_name": "hostname.keyword",
        "detector_index": 0
      }
    ],
    "influencers": [
      "hostname.keyword",
      "detection.direction.keyword"
    ]
  },
  "analysis_limits": {
    "model_memory_limit": "2700mb",
    "categorization_examples_limit": 4
  },
  "data_description": {
    "time_field": "@timestamp",
    "time_format": "epoch_ms"
  },
  "model_plot_config": {
    "enabled": false,
    "annotations_enabled": false
  },
  "model_snapshot_retention_days": 10,
  "daily_model_snapshot_retention_after_days": 1,
  "results_index_name": "custom-anomaly_detection_device_logs",
  "allow_lazy_open": false,
  "data_counts": {
    "job_id": "anomaly_detection_device_logs",
    "processed_record_count": 2230825210,
    "processed_field_count": 2471359154,
    "input_bytes": 136337806228,
    "input_field_count": 2471359154,
    "invalid_date_count": 0,
    "missing_field_count": 1990291266,
    "out_of_order_timestamp_count": 0,
    "empty_bucket_count": 357,
    "sparse_bucket_count": 31,
    "bucket_count": 35658,
    "earliest_record_timestamp": 1617395143828,
    "latest_record_timestamp": 1649310299952,
    "last_data_time": 1649316305155,
    "latest_empty_bucket_timestamp": 1645839900000,
    "latest_sparse_bucket_timestamp": 1649214900000,
    "input_record_count": 2230825210,
    "log_time": 1649316305155,
    "latest_bucket_timestamp": 1649310300000
  },
  "model_size_stats": {
    "job_id": "anomaly_detection_device_logs",
    "result_type": "model_size_stats",
    "model_bytes": 107381150,
    "peak_model_bytes": 107620056,
    "model_bytes_exceeded": 0,
    "model_bytes_memory_limit": 2831155200,
    "total_by_field_count": 2168,
    "total_over_field_count": 0,
    "total_partition_field_count": 2974,
    "bucket_allocation_failures_count": 0,
    "memory_status": "ok",
    "assignment_memory_basis": "current_model_bytes",
    "categorized_doc_count": 0,
    "total_category_count": 0,
    "frequent_category_count": 0,
    "rare_category_count": 0,
    "dead_category_count": 0,
    "failed_category_count": 0,
    "categorization_status": "ok",
    "log_time": 1649683574759,
    "timestamp": 1649309400000
  },
  "forecasts_stats": {
    "total": 0,
    "forecasted_jobs": 0
  },
  "state": "closed",
  "timing_stats": {
    "job_id": "anomaly_detection_device_logs",
    "bucket_count": 80926,
    "total_bucket_processing_time_ms": 28810331.000000123,
    "minimum_bucket_processing_time_ms": 16,
    "maximum_bucket_processing_time_ms": 18608,
    "average_bucket_processing_time_ms": 356.00834095346517,
    "exponential_average_bucket_processing_time_ms": 595.6793996977913,
    "exponential_average_bucket_processing_time_per_hour_ms": 7389.606161268795
  },
  "datafeed_config": {
    "datafeed_id": "datafeed-anomaly_detection_device_logs",
    "job_id": "anomaly_detection_device_logs",
    "query_delay": "100m",
    "chunking_config": {
      "mode": "auto"
    },
    "indices_options": {
      "expand_wildcards": [
        "open"
      ],
      "ignore_unavailable": false,
      "allow_no_indices": true,
      "ignore_throttled": true
    },
    "query": {
      "bool": {}
    },
    "indices": [
      "device_logs*"
    ],
    "scroll_size": 1000,
    "delayed_data_check_config": {
      "enabled": true
    },
    "state": "stopped",
    "timing_stats": {
      "job_id": "anomaly_detection_device_logs",
      "search_count": 2488368,
      "bucket_count": 35647,
      "total_search_time_ms": 82224104,
      "average_search_time_per_bucket_ms": 2306.620585182484,
      "exponential_average_search_time_per_hour_ms": 16309.475428299429
    }
  }
}

It seems like there's nothing obviously wrong with your job configuration. A more in-depth investigation would likely be needed here. I'd advise you to open a support ticket so we can attempt to replicate the issue with some extracted data from your index.

We found this error in the job messages: "Datafeed is encountering errors extracting data: Partial shards failure". We still could not figure out what happened there (we have been having frequent shard failures on Elasticsearch lately). Nevertheless, I tried restarting the ML job and now it seems to be working fine.
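For anyone who hits the same symptom: a restart in this situation amounts to stopping and restarting the datafeed and, if needed, closing and reopening the job. A sketch in Dev Tools syntax using the IDs from the configuration above (whether this was done through the API or the Kibana Machine Learning UI isn't stated here):

# Stop the datafeed and close the job
POST _ml/datafeeds/datafeed-anomaly_detection_device_logs/_stop
POST _ml/anomaly_detectors/anomaly_detection_device_logs/_close

# Reopen the job and start the datafeed again
POST _ml/anomaly_detectors/anomaly_detection_device_logs/_open
POST _ml/datafeeds/datafeed-anomaly_detection_device_logs/_start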
