Datafeed has missed 152 documents due to ingest latency

Running ELK 7.16.3 version Platinum Subscription.

I am getting the error message "Datafeed has missed 152 documents due to ingest latency, latest bucket with missing data is [2022-10-10T17:00:00.000Z]. Consider increasing query_delay." I tried to increase the query_delay value, but my job stops after some hours.

Here is the job config

{
  "job_id": "new",
  "job_type": "anomaly_detector",
  "job_version": "7.16.3",
  "create_time": 1665421273515,
  "model_snapshot_id": "1665421283",
  "groups": [
    "bib"
  ],
  "description": "",
  "analysis_config": {
    "bucket_span": "15m",
    "summary_count_field_name": "_doc_count",
    "detectors": [
      {
        "detector_description": "max(test)",
        "function": "max",
        "field_name": "test",
        "detector_index": 0
      }
    ],
    "influencers": [
      "verb.keyword"
    ]
  },
  "analysis_limits": {
    "model_memory_limit": "11mb",
    "categorization_examples_limit": 4
  },
  "data_description": {
    "time_field": "@timestamp",
    "time_format": "epoch_ms"
  },
  "model_plot_config": {
    "enabled": true,
    "annotations_enabled": true
  },
  "model_snapshot_retention_days": 10,
  "daily_model_snapshot_retention_after_days": 1,
  "results_index_name": "custom-new",
  "allow_lazy_open": false,
  "data_counts": {
    "job_id": "new",
    "processed_record_count": 14115,
    "processed_field_count": 28230,
    "input_bytes": 1012935,
    "input_field_count": 28230,
    "invalid_date_count": 0,
    "missing_field_count": 14115,
    "out_of_order_timestamp_count": 0,
    "empty_bucket_count": 0,
    "sparse_bucket_count": 0,
    "bucket_count": 741,
    "earliest_record_timestamp": 1664755206489,
    "latest_record_timestamp": 1665422106741,
    "last_data_time": 1665422612397,
    "input_record_count": 14115,
    "log_time": 1665422612397,
    "latest_bucket_timestamp": 1665422100000
  },
  "model_size_stats": {
    "job_id": "new",
    "result_type": "model_size_stats",
    "model_bytes": 28782,
    "peak_model_bytes": 31524,
    "model_bytes_exceeded": 0,
    "model_bytes_memory_limit": 11534336,
    "total_by_field_count": 1,
    "total_over_field_count": 0,
    "total_partition_field_count": 2,
    "bucket_allocation_failures_count": 0,
    "memory_status": "ok",
    "assignment_memory_basis": "current_model_bytes",
    "categorized_doc_count": 0,
    "total_category_count": 0,
    "frequent_category_count": 0,
    "rare_category_count": 0,
    "dead_category_count": 0,
    "failed_category_count": 0,
    "categorization_status": "ok",
    "log_time": 1665421281514,
    "timestamp": 1664772300000
  },
  "forecasts_stats": {
    "total": 0,
    "forecasted_jobs": 0
  },
  "state": "opened",
  "node": {
    "id": "p03Tck9FSNeCpT5HpYml8w",
    "name": "node5",
    "ephemeral_id": "SqZjuNWOTfSVyEGXuLcyUA",
    "transport_address": "xx.xx.xx.xx:9300",
    "attributes": {
      "ml.machine_memory": "168606924800",
      "ml.max_open_jobs": "512",
      "xpack.installed": "true",
      "ml.max_jvm_size": "33285996544",
      "transform.node": "false"
    }
  },
  "assignment_explanation": "",
  "open_time": "41265s",
  "timing_stats": {
    "job_id": "new",
    "bucket_count": 744,
    "total_bucket_processing_time_ms": 54.00000000000002,
    "minimum_bucket_processing_time_ms": 0,
    "maximum_bucket_processing_time_ms": 2,
    "average_bucket_processing_time_ms": 0.07258064516129036,
    "exponential_average_bucket_processing_time_ms": 0.06157431014508287,
    "exponential_average_bucket_processing_time_per_hour_ms": 0.7218152400879877
  },
  "datafeed_config": {
    "datafeed_id": "datafeed-new",
    "job_id": "new",
    "query_delay": "62272ms",
    "chunking_config": {
      "mode": "auto"
    },
    "indices_options": {
      "expand_wildcards": [
        "open"
      ],
      "ignore_unavailable": false,
      "allow_no_indices": true,
      "ignore_throttled": true
    },
    "query": {
      "match_phrase": {
        "log.file.path.keyword": "/test.log"
      }
    },
    "indices": [
      "profile-*"
    ],
    "scroll_size": 1000,
    "delayed_data_check_config": {
      "enabled": true
    },
    "runtime_mappings": {
      "test": {
        "type": "long",
        "script": {
          "source": "emit(Integer.parseInt(doc['value.keyword'].value))"
        }
      }
    },
    "state": "stopped",
    "timing_stats": {
      "job_id": "new",
      "search_count": 25,
      "bucket_count": 741,
      "total_search_time_ms": 736,
      "average_search_time_per_bucket_ms": 0.9932523616734144,
      "exponential_average_search_time_per_hour_ms": 33.03721539834794
    }
  }
}

The job is opened but the datafeed is stopped. (Search for "state" in what you posted to see that.)

The question is, why did that happen? There might be a clue in the "Job messages" tab in your screenshot.

If there is no explanation then you could try restarting the datafeed. But it might stop again if there is some major problem. The job messages might reveal what that is, or if not check for exceptions in the logs on node5 where the job is running.

Hey @droberts195, thank you for replying to me.
I've restarted the datafeed, and in the meantime I was monitoring node5. Here is the log:

[2022-10-11T17:44:47,007][INFO ][o.e.x.m.d.DatafeedJob    ] [node5] [new] Datafeed started (from: 2022-10-10T17:15:06.742Z to: real-time) with frequency [450000ms]
[2022-10-11T17:44:47,067][INFO ][o.e.x.m.p.l.CppLogMessageHandler] [node5] [new] [autodetect/355335] [CAnomalyJob.cc@1602] Pruning obsolete models
[2022-10-11T17:44:47,068][INFO ][o.e.x.m.p.l.CppLogMessageHandler] [node5] [new] [autodetect/355335] [CAnomalyJob.cc@1137] Background persist starting data copy
[2022-10-11T17:44:47,068][INFO ][o.e.x.m.p.l.CppLogMessageHandler] [node5] [new] [autodetect/355335] [CPersistenceManager.cc@271] Background persist starting background thread
[2022-10-11T17:44:47,592][INFO ][o.e.x.m.d.DatafeedJob    ] [node5] [new] Lookback has finished
[2022-10-11T17:44:47,593][INFO ][o.e.x.m.p.l.CppLogMessageHandler] [node5] [new] [autodetect/355335] [CAnomalyJob.cc@1602] Pruning obsolete models
[2022-10-11T17:44:47,593][INFO ][o.e.x.m.p.l.CppLogMessageHandler] [node5] [new] [autodetect/355335] [CAnomalyJob.cc@1137] Background persist starting data copy
[2022-10-11T17:44:47,593][INFO ][o.e.x.m.p.l.CppLogMessageHandler] [node5] [new] [autodetect/355335] [CPersistenceManager.cc@271] Background persist starting background thread
[2022-10-11T17:49:25,799][INFO ][o.e.x.s.a.AuthorizationService] [node5] Took [62ms] to resolve [1894] indices for action [indices:data/read/search] and user [elastic]
[2022-10-11T17:51:23,592][INFO ][o.e.x.s.a.AuthorizationService] [node5] Took [73ms] to resolve [1894] indices for action [indices:data/read/search] and user [elastic]
[2022-10-11T17:52:51,381][INFO ][o.e.x.s.a.AuthorizationService] [node5] Took [94ms] to resolve [957] indices for action [indices:data/read/search] and user [jsdda]

But I didn't find anything relevant in this log.

In the Job messages tab I saw this:

2022-10-11 17:53:32
node5
Datafeed has missed 152 documents due to ingest latency, latest bucket with missing data is [2022-10-11T12:00:00.000Z]. Consider increasing query_delay

The job state is still opened but the datafeed is stopped, and I don't know how to figure this out. Can you please help me find out the reason why the datafeed stops automatically?

There are two separate issues.

For the delayed data, are you 5.5 hours ahead of UTC, so 2022-10-11T12:00:00.000Z is 2022-10-11T17:30:00.000 in your local time? If so then we're detecting delayed data relatively soon after the end of the bucket that it's reported in. You haven't increased query_delay by very much - it's still only a little over 62 seconds. Are you sure that all data that's indexed into your Elasticsearch will be searchable within 62 seconds of the timestamp in the documents? Is it possible that 152 documents with timestamp 2022-10-11T17:44:59.000 actually weren't searchable until after 2022-10-11T17:46:03.000?

For the issue of the datafeed stopping with no job message or log message on the node where it is running, please can you also check the logs on the master node to see if there is anything interesting around the time it stopped. If not then I think we will need to get more detailed information from your cluster. Do you have a support contract? If so, please open a support case so that we can ask for the necessary information in private.

Okay, I will raise the case. Thank you @droberts195 for your help.