Hello,
Thanks for your answer.
- Yes, no issues are detected in the cluster: before or when the issue occurs (when ILM pending tasks are blocked by the update_tsdb_data_stream_end_times tasks), the health status of the cluster remains green.
- Here is an extract of the response to a GET request on the Elasticsearch cluster URL "_tasks", taken while the pending task "update_tsdb_data_stream_end_times" is present:
{
"tasks" : {
"nP9kF78cTJyyRtgtVbmaTA:28184136" : {
"node" : "nP9kF78cTJyyRtgtVbmaTA",
"id" : 28184136,
"type" : "transport",
"action" : "cluster:monitor/tasks/lists",
"description" : "",
"start_time_in_millis" : 1746058741383,
"running_time_in_nanos" : 195985,
"cancellable" : true,
"cancelled" : false,
"headers" : { },
"children" : [
{
"node" : "nP9kF78cTJyyRtgtVbmaTA",
"id" : 28184137,
"type" : "transport",
"action" : "cluster:monitor/tasks/lists[n]",
"description" : "",
"start_time_in_millis" : 1746058741384,
"running_time_in_nanos" : 59306,
"cancellable" : true,
"cancelled" : false,
"parent_task_id" : "nP9kF78cTJyyRtgtVbmaTA:28184136",
"headers" : { }
},
{
"node" : "mv8BvCSqSeqbYKPBBRIzUA",
"id" : 31154761,
"type" : "transport",
"action" : "cluster:monitor/tasks/lists[n]",
"description" : "",
"start_time_in_millis" : 1746058741384,
"running_time_in_nanos" : 56617,
"cancellable" : true,
"cancelled" : false,
"parent_task_id" : "nP9kF78cTJyyRtgtVbmaTA:28184136",
"headers" : { }
},
{
"node" : "fQ5flyTAQjusRvNX_z2e1Q",
"id" : 30060773,
"type" : "transport",
"action" : "cluster:monitor/tasks/lists[n]",
"description" : "",
"start_time_in_millis" : 1746058741385,
"running_time_in_nanos" : 33331,
"cancellable" : true,
"cancelled" : false,
"parent_task_id" : "nP9kF78cTJyyRtgtVbmaTA:28184136",
"headers" : { }
}
]
},
"nP9kF78cTJyyRtgtVbmaTA:28184131" : {
"node" : "nP9kF78cTJyyRtgtVbmaTA",
"id" : 28184131,
"type" : "transport",
"action" : "indices:data/write/bulk",
"description" : "requests[125], indices[my_index_syslog]",
"start_time_in_millis" : 1746058741381,
"running_time_in_nanos" : 2692490,
"cancellable" : false,
"headers" : {
"X-elastic-product-origin" : "logstash-output-elasticsearch"
},
"children" : [
{
"node" : "nP9kF78cTJyyRtgtVbmaTA",
"id" : 28184134,
"type" : "transport",
"action" : "indices:data/write/bulk[s]",
"status" : {
"phase" : "waiting_on_primary"
},
"description" : "requests[125], index[my_index_syslog-000100][0]",
"start_time_in_millis" : 1746058741383,
"running_time_in_nanos" : 621558,
"cancellable" : false,
"parent_task_id" : "nP9kF78cTJyyRtgtVbmaTA:28184131",
"headers" : {
"X-elastic-product-origin" : "logstash-output-elasticsearch"
},
"children" : [
{
"node" : "nP9kF78cTJyyRtgtVbmaTA",
"id" : 28184135,
"type" : "transport",
"action" : "indices:data/write/bulk[s][p]",
"status" : {
"phase" : "primary"
},
"description" : "requests[125], index[my_index_syslog-000100][0]",
"start_time_in_millis" : 1746058741383,
"running_time_in_nanos" : 582414,
"cancellable" : false,
"parent_task_id" : "nP9kF78cTJyyRtgtVbmaTA:28184134",
"headers" : {
"X-elastic-product-origin" : "logstash-output-elasticsearch"
}
}
]
}
]
},
"fQ5flyTAQjusRvNX_z2e1Q:46" : {
"node" : "fQ5flyTAQjusRvNX_z2e1Q",
"id" : 46,
"type" : "persistent",
"action" : "health-node[c]",
"status" : {
"state" : "STARTED"
},
"description" : "id=health-node",
"start_time_in_millis" : 1745933117413,
"running_time_in_nanos" : 125623971764880,
"cancellable" : true,
"cancelled" : false,
"parent_task_id" : "cluster:8",
"headers" : { }
},
"fQ5flyTAQjusRvNX_z2e1Q:47" : {
"node" : "fQ5flyTAQjusRvNX_z2e1Q",
"id" : 47,
"type" : "persistent",
"action" : "geoip-downloader[c]",
"status" : {
"successful_downloads" : 0,
"failed_downloads" : 0,
"total_download_time" : 0,
"databases_count" : 0,
"skipped_updates" : 0,
"expired_databases" : 0
},
"description" : "id=geoip-downloader",
"start_time_in_millis" : 1745933117414,
"running_time_in_nanos" : 125623971070945,
"cancellable" : true,
"cancelled" : false,
"parent_task_id" : "cluster:9",
"headers" : { }
},
"mv8BvCSqSeqbYKPBBRIzUA:31154755" : {
"node" : "mv8BvCSqSeqbYKPBBRIzUA",
"id" : 31154755,
"type" : "transport",
"action" : "indices:data/write/bulk",
"description" : "requests[125], indices[my_index_syslog]",
"start_time_in_millis" : 1746058741372,
"running_time_in_nanos" : 12130414,
"cancellable" : false,
"headers" : {
"X-elastic-product-origin" : "logstash-output-elasticsearch"
},
"children" : [
{
"node" : "mv8BvCSqSeqbYKPBBRIzUA",
"id" : 31154756,
"type" : "transport",
"action" : "indices:data/write/bulk[s]",
"status" : {
"phase" : "rerouted"
},
"description" : "requests[125], index[my_index_syslog-000100][0]",
"start_time_in_millis" : 1746058741372,
"running_time_in_nanos" : 12002698,
"cancellable" : false,
"parent_task_id" : "mv8BvCSqSeqbYKPBBRIzUA:31154755",
"headers" : {
"X-elastic-product-origin" : "logstash-output-elasticsearch"
},
"children" : [
{
"node" : "nP9kF78cTJyyRtgtVbmaTA",
"id" : 28184127,
"type" : "transport",
"action" : "indices:data/write/bulk[s]",
"status" : {
"phase" : "waiting_on_primary"
},
"description" : "requests[125], index[my_index_syslog-000100][0]",
"start_time_in_millis" : 1746058741373,
"running_time_in_nanos" : 10952525,
"cancellable" : false,
"parent_task_id" : "mv8BvCSqSeqbYKPBBRIzUA:31154756",
"headers" : {
"X-elastic-product-origin" : "logstash-output-elasticsearch"
},
"children" : [
{
"node" : "nP9kF78cTJyyRtgtVbmaTA",
"id" : 28184128,
"type" : "transport",
"action" : "indices:data/write/bulk[s][p]",
"status" : {
"phase" : "primary"
},
"description" : "requests[125], index[my_index_syslog-000100][0]",
"start_time_in_millis" : 1746058741373,
"running_time_in_nanos" : 10872276,
"cancellable" : false,
"parent_task_id" : "nP9kF78cTJyyRtgtVbmaTA:28184127",
"headers" : {
"X-elastic-product-origin" : "logstash-output-elasticsearch"
}
}
]
}
]
}
]
},
"mv8BvCSqSeqbYKPBBRIzUA:31154759" : {
"node" : "mv8BvCSqSeqbYKPBBRIzUA",
"id" : 31154759,
"type" : "transport",
"action" : "indices:data/write/bulk",
"description" : "requests[125], indices[my_index_syslog]",
"start_time_in_millis" : 1746058741373,
"running_time_in_nanos" : 10824884,
"cancellable" : false,
"headers" : {
"X-elastic-product-origin" : "logstash-output-elasticsearch"
},
"children" : [
{
"node" : "mv8BvCSqSeqbYKPBBRIzUA",
"id" : 31154760,
"type" : "transport",
"action" : "indices:data/write/bulk[s]",
"status" : {
"phase" : "rerouted"
},
"description" : "requests[125], index[my_index_syslog-000100][0]",
"start_time_in_millis" : 1746058741373,
"running_time_in_nanos" : 10685634,
"cancellable" : false,
"parent_task_id" : "mv8BvCSqSeqbYKPBBRIzUA:31154759",
"headers" : {
"X-elastic-product-origin" : "logstash-output-elasticsearch"
},
"children" : [
{
"node" : "nP9kF78cTJyyRtgtVbmaTA",
"id" : 28184129,
"type" : "transport",
"action" : "indices:data/write/bulk[s]",
"status" : {
"phase" : "waiting_on_primary"
},
"description" : "requests[125], index[my_index_syslog-000100][0]",
"start_time_in_millis" : 1746058741374,
"running_time_in_nanos" : 9547174,
"cancellable" : false,
"parent_task_id" : "mv8BvCSqSeqbYKPBBRIzUA:31154760",
"headers" : {
"X-elastic-product-origin" : "logstash-output-elasticsearch"
},
"children" : [
{
"node" : "nP9kF78cTJyyRtgtVbmaTA",
"id" : 28184130,
"type" : "transport",
"action" : "indices:data/write/bulk[s][p]",
"status" : {
"phase" : "primary"
},
"description" : "requests[125], index[my_index_syslog-000100][0]",
"start_time_in_millis" : 1746058741374,
"running_time_in_nanos" : 9394368,
"cancellable" : false,
"parent_task_id" : "nP9kF78cTJyyRtgtVbmaTA:28184129",
"headers" : {
"X-elastic-product-origin" : "logstash-output-elasticsearch"
}
}
]
}
]
}
]
},
"fQ5flyTAQjusRvNX_z2e1Q:30060767" : {
"node" : "fQ5flyTAQjusRvNX_z2e1Q",
"id" : 30060767,
"type" : "transport",
"action" : "indices:data/write/bulk",
"description" : "requests[125], indices[my_index_syslog]",
"start_time_in_millis" : 1746058741380,
"running_time_in_nanos" : 4870133,
"cancellable" : false,
"headers" : {
"X-elastic-product-origin" : "logstash-output-elasticsearch"
},
"children" : [
{
"node" : "fQ5flyTAQjusRvNX_z2e1Q",
"id" : 30060768,
"type" : "transport",
"action" : "indices:data/write/bulk[s]",
"status" : {
"phase" : "rerouted"
},
"description" : "requests[125], index[my_index_syslog-000100][0]",
"start_time_in_millis" : 1746058741380,
"running_time_in_nanos" : 4728549,
"cancellable" : false,
"parent_task_id" : "fQ5flyTAQjusRvNX_z2e1Q:30060767",
"headers" : {
"X-elastic-product-origin" : "logstash-output-elasticsearch"
},
"children" : [
{
"node" : "nP9kF78cTJyyRtgtVbmaTA",
"id" : 28184132,
"type" : "transport",
"action" : "indices:data/write/bulk[s]",
"status" : {
"phase" : "waiting_on_primary"
},
"description" : "requests[125], index[my_index_syslog-000100][0]",
"start_time_in_millis" : 1746058741383,
"running_time_in_nanos" : 906145,
"cancellable" : false,
"parent_task_id" : "fQ5flyTAQjusRvNX_z2e1Q:30060768",
"headers" : {
"X-elastic-product-origin" : "logstash-output-elasticsearch"
},
"children" : [
{
"node" : "nP9kF78cTJyyRtgtVbmaTA",
"id" : 28184133,
"type" : "transport",
"action" : "indices:data/write/bulk[s][p]",
"status" : {
"phase" : "primary"
},
"description" : "requests[125], index[my_index_syslog-000100][0]",
"start_time_in_millis" : 1746058741383,
"running_time_in_nanos" : 797269,
"cancellable" : false,
"parent_task_id" : "nP9kF78cTJyyRtgtVbmaTA:28184132",
"headers" : {
"X-elastic-product-origin" : "logstash-output-elasticsearch"
}
}
]
}
]
}
]
},
"nP9kF78cTJyyRtgtVbmaTA:28184124" : {
"node" : "nP9kF78cTJyyRtgtVbmaTA",
"id" : 28184124,
"type" : "transport",
"action" : "indices:data/write/bulk",
"description" : "requests[125], indices[my_index_syslog]",
"start_time_in_millis" : 1746058741370,
"running_time_in_nanos" : 13491482,
"cancellable" : false,
"headers" : {
"X-elastic-product-origin" : "logstash-output-elasticsearch"
},
"children" : [
{
"node" : "nP9kF78cTJyyRtgtVbmaTA",
"id" : 28184125,
"type" : "transport",
"action" : "indices:data/write/bulk[s]",
"status" : {
"phase" : "waiting_on_primary"
},
"description" : "requests[125], index[my_index_syslog-000100][0]",
"start_time_in_millis" : 1746058741370,
"running_time_in_nanos" : 13291034,
"cancellable" : false,
"parent_task_id" : "nP9kF78cTJyyRtgtVbmaTA:28184124",
"headers" : {
"X-elastic-product-origin" : "logstash-output-elasticsearch"
},
"children" : [
{
"node" : "fQ5flyTAQjusRvNX_z2e1Q",
"id" : 30060771,
"type" : "transport",
"action" : "indices:data/write/bulk[s][r]",
"status" : {
"phase" : "replica"
},
"description" : "requests[125], index[my_index_syslog-000100][0]",
"start_time_in_millis" : 1746058741384,
"running_time_in_nanos" : 623658,
"cancellable" : false,
"parent_task_id" : "nP9kF78cTJyyRtgtVbmaTA:28184125",
"headers" : {
"X-elastic-product-origin" : "logstash-output-elasticsearch"
}
},
{
"node" : "nP9kF78cTJyyRtgtVbmaTA",
"id" : 28184126,
"type" : "transport",
"action" : "indices:data/write/bulk[s][p]",
"status" : {
"phase" : "primary"
},
"description" : "requests[125], index[my_index_syslog-000100][0]",
"start_time_in_millis" : 1746058741370,
"running_time_in_nanos" : 13206032,
"cancellable" : false,
"parent_task_id" : "nP9kF78cTJyyRtgtVbmaTA:28184125",
"headers" : {
"X-elastic-product-origin" : "logstash-output-elasticsearch"
}
}
]
}
]
}
}
}
- Nothing unusual has been detected in the Elasticsearch logs; at regular intervals we can see log entries indicating that the ILM policy is ready to proceed with the rollover because its conditions have been reached.
- Here is an extract of the response to a GET request on the Elasticsearch cluster URL "my_index_syslog-000100/_ilm/explain", taken while the pending task "update_tsdb_data_stream_end_times" is present:
{
"indices" : {
"my_index_syslog-000100" : {
"index" : "my_index_syslog-000100",
"managed" : true,
"policy" : "raw_syslog",
"index_creation_date_millis" : 1746057376892,
"time_since_index_creation" : "22.73m",
"lifecycle_date_millis" : 1746057376892,
"age" : "22.73m",
"phase" : "hot",
"phase_time_millis" : 1746057377030,
"action" : "rollover",
"action_time_millis" : 1746057377230,
"step" : "check-rollover-ready",
"step_time_millis" : 1746057377230,
"phase_execution" : {
"policy" : "raw_syslog",
"phase_definition" : {
"min_age" : "0ms",
"actions" : {
"rollover" : {
"max_age" : "20m",
"max_primary_shard_docs" : 200000000,
"min_docs" : 1,
"max_size" : "10gb"
}
}
},
"version" : 1,
"modified_date_in_millis" : 1745929943516
}
}
}
}
So we can see here that the age (22.73m) is higher than the max_age parameter (20m). I've also seen that when the issue occurs, the age reported by the _ilm/explain URL keeps growing indefinitely until either disk saturation occurs or I restart the Elasticsearch nodes.
Thank you for your help.