Looking for some direction with this unique issue in our production cluster. Hoping that @HenningAndersen @dadoonet @Christian_Dahlqvist and/or @Armin_Braun might be able to poke in here and offer feedback.
Details/behavior:
-
This only happens to the 3 most recent .monitoring-es-* indices.
-
This only happens on the production cluster.
-
We do not experience the same behavior in the test cluster. The test cluster is completely green with the exact same configuration, java version, etc (to the best of our knowledge). The only obvious difference is that the test cluster shards are much smaller (400mb vs 4gb in production).
-
If we try
POST /_cluster/reroute?retry_failed=true
it results in the same 3 indices remaining yellow.
Java
# java -version
openjdk version "1.8.0_171"
OpenJDK Runtime Environment (build 1.8.0_171-b10)
OpenJDK 64-Bit Server VM (build 25.171-b10, mixed mode)
GET _cluster/stats?human&pretty
{
"_nodes" : {
"total" : 2,
"successful" : 2,
"failed" : 0
},
"cluster_name" : "ELK",
"cluster_uuid" : "xxx",
"timestamp" : 1581437726376,
"status" : "yellow",
"indices" : {
"count" : 1993,
"shards" : {
"total" : 6675,
"primaries" : 5745,
"replication" : 0.1618798955613577,
"index" : {
"shards" : {
"min" : 1,
"max" : 10,
"avg" : 3.3492222779729053
},
"primaries" : {
"min" : 1,
"max" : 5,
"avg" : 2.882589061716006
},
"replication" : {
"min" : 0.0,
"max" : 1.0,
"avg" : 0.10135474159558455
}
}
},
"docs" : {
"count" : 1872979590,
"deleted" : 526087
},
"store" : {
"size" : "900.5gb",
"size_in_bytes" : 966911625537
},
"fielddata" : {
"memory_size" : "6.7gb",
"memory_size_in_bytes" : 7240630464,
"evictions" : 0
},
"query_cache" : {
"memory_size" : "72.6mb",
"memory_size_in_bytes" : 76186271,
"total_count" : 2581869,
"hit_count" : 336991,
"miss_count" : 2244878,
"cache_size" : 5123,
"cache_count" : 5476,
"evictions" : 353
},
"completion" : {
"size" : "0b",
"size_in_bytes" : 0
},
"segments" : {
"count" : 9796,
"memory" : "1.6gb",
"memory_in_bytes" : 1752365455,
"terms_memory" : "869.3mb",
"terms_memory_in_bytes" : 911551073,
"stored_fields_memory" : "720.3mb",
"stored_fields_memory_in_bytes" : 755351840,
"term_vectors_memory" : "0b",
"term_vectors_memory_in_bytes" : 0,
"norms_memory" : "11mb",
"norms_memory_in_bytes" : 11637888,
"points_memory" : "59.9mb",
"points_memory_in_bytes" : 62903582,
"doc_values_memory" : "10.4mb",
"doc_values_memory_in_bytes" : 10921072,
"index_writer_memory" : "19.8mb",
"index_writer_memory_in_bytes" : 20774632,
"version_map_memory" : "0b",
"version_map_memory_in_bytes" : 0,
"fixed_bit_set" : "11.7mb",
"fixed_bit_set_memory_in_bytes" : 12320792,
"max_unsafe_auto_id_timestamp" : 1581429474359,
"file_sizes" : { }
}
},
"nodes" : {
"count" : {
"total" : 2,
"data" : 2,
"coordinating_only" : 0,
"master" : 1,
"ingest" : 1
},
"versions" : [
"6.8.1"
],
"os" : {
"available_processors" : 12,
"allocated_processors" : 12,
"names" : [
{
"name" : "Linux",
"count" : 2
}
],
"pretty_names" : [
{
"pretty_name" : "CentOS Linux 7 (Core)",
"count" : 2
}
],
"mem" : {
"total" : "44.7gb",
"total_in_bytes" : 48084279296,
"free" : "558.7mb",
"free_in_bytes" : 585912320,
"used" : "44.2gb",
"used_in_bytes" : 47498366976,
"free_percent" : 1,
"used_percent" : 99
}
},
"process" : {
"cpu" : {
"percent" : 50
},
"open_file_descriptors" : {
"min" : 9714,
"max" : 40306,
"avg" : 25010
}
},
"jvm" : {
"max_uptime" : "53.5d",
"max_uptime_in_millis" : 4625220711,
"versions" : [
{
"version" : "1.8.0_191",
"vm_name" : "OpenJDK 64-Bit Server VM",
"vm_version" : "25.191-b12",
"vm_vendor" : "Oracle Corporation",
"count" : 1
},
{
"version" : "1.8.0_171",
"vm_name" : "OpenJDK 64-Bit Server VM",
"vm_version" : "25.171-b10",
"vm_vendor" : "Oracle Corporation",
"count" : 1
}
],
"mem" : {
"heap_used" : "14.2gb",
"heap_used_in_bytes" : 15307236744,
"heap_max" : "22.9gb",
"heap_max_in_bytes" : 24591466496
},
"threads" : 444
},
"fs" : {
"total" : "7.3tb",
"total_in_bytes" : 8122104676352,
"free" : "5.8tb",
"free_in_bytes" : 6417517572096,
"available" : "5.4tb",
"available_in_bytes" : 6032500862976
},
...
}
GET _cluster/allocation/explain?pretty
{
"index" : ".monitoring-es-6-2020.02.09",
"shard" : 0,
"primary" : false,
"current_state" : "unassigned",
"unassigned_info" : {
"reason" : "ALLOCATION_FAILED",
"at" : "2020-02-10T20:53:42.002Z",
"failed_allocation_attempts" : 5,
"details" : "failed shard on node [cXzKDgu0TPqjEF-w9qJURw]: failed recovery, failure RecoveryFailedException[[.monitoring-es-6-2020.02.09][0]: Recovery failed from {ELK-01.hostname.com}{WNppBaICQTKHpO2DAedbpQ}{lKY-IuxnT3auVM50oh_IeA}{ELK-01.hostname.com}{192.168.98.102:9300}{ml.machine_memory=25103704064, ml.max_open_jobs=20, xpack.installed=true, box_type=hot, ml.enabled=true} into {ELK-02.hostname.com}{cXzKDgu0TPqjEF-w9qJURw}{nwT4vhVMSNKC_xthmClT_A}{ELK-02.hostname.com}{192.168.98.103:9300}{ml.machine_memory=22980575232, xpack.installed=true, box_type=warm, ml.max_open_jobs=20, ml.enabled=true}]; nested: RemoteTransportException[[ELK-01.hostname.com][192.168.98.102:9300][internal:index/shard/recovery/start_recovery]]; nested: RecoveryEngineException[Phase[1] phase1 failed]; nested: RecoverFilesRecoveryException[Failed to transfer [144] files with total size of [4.2gb]]; nested: RemoteTransportException[[ELK-02.hostname.com][192.168.98.103:9300][internal:index/shard/recovery/file_chunk]]; nested: CircuitBreakingException[[parent] Data too large, data for [<transport_request>] would be [8231262357/7.6gb], which is larger than the limit of [8231203635/7.6gb], usages [request=0/0b, fielddata=6623107425/6.1gb, in_flight_requests=1022428/998.4kb, accounting=1607132504/1.4gb]]; ",
"last_allocation_status" : "no_attempt"
},
"can_allocate" : "no",
"allocate_explanation" : "cannot allocate because allocation is not permitted to any of the nodes",
"node_allocation_decisions" : [
{
"node_id" : "WNppBaICQTKHpO2DAedbpQ",
"node_name" : "ELK-01.hostname.com",
"transport_address" : "192.168.98.102:9300",
"node_attributes" : {
"ml.machine_memory" : "25103704064",
"xpack.installed" : "true",
"box_type" : "hot",
"ml.max_open_jobs" : "20",
"ml.enabled" : "true"
},
"node_decision" : "no",
"deciders" : [
{
"decider" : "max_retry",
"decision" : "NO",
"explanation" : "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2020-02-10T20:53:42.002Z], failed_attempts[5], delayed=false, details[failed shard on node [cXzKDgu0TPqjEF-w9qJURw]: failed recovery, failure RecoveryFailedException[[.monitoring-es-6-2020.02.09][0]: Recovery failed from {ELK-01.hostname.com}{WNppBaICQTKHpO2DAedbpQ}{lKY-IuxnT3auVM50oh_IeA}{ELK-01.hostname.com}{192.168.98.102:9300}{ml.machine_memory=25103704064, ml.max_open_jobs=20, xpack.installed=true, box_type=hot, ml.enabled=true} into {ELK-02.hostname.com}{cXzKDgu0TPqjEF-w9qJURw}{nwT4vhVMSNKC_xthmClT_A}{ELK-02.hostname.com}{192.168.98.103:9300}{ml.machine_memory=22980575232, xpack.installed=true, box_type=warm, ml.max_open_jobs=20, ml.enabled=true}]; nested: RemoteTransportException[[ELK-01.hostname.com][192.168.98.102:9300][internal:index/shard/recovery/start_recovery]]; nested: RecoveryEngineException[Phase[1] phase1 failed]; nested: RecoverFilesRecoveryException[Failed to transfer [144] files with total size of [4.2gb]]; nested: RemoteTransportException[[ELK-02.hostname.com][192.168.98.103:9300][internal:index/shard/recovery/file_chunk]]; nested: CircuitBreakingException[[parent] Data too large, data for [<transport_request>] would be [8231262357/7.6gb], which is larger than the limit of [8231203635/7.6gb], usages [request=0/0b, fielddata=6623107425/6.1gb, in_flight_requests=1022428/998.4kb, accounting=1607132504/1.4gb]]; ], allocation_status[no_attempt]]]"
},
{
"decider" : "same_shard",
"decision" : "NO",
"explanation" : "the shard cannot be allocated to the same node on which a copy of the shard already exists [[.monitoring-es-6-2020.02.09][0], node[WNppBaICQTKHpO2DAedbpQ], [P], s[STARTED], a[id=DpdjTOlISEqRX0DUk7ECnw]]"
}
]
},
{
"node_id" : "cXzKDgu0TPqjEF-w9qJURw",
"node_name" : "ELK-02.hostname.com",
"transport_address" : "192.168.98.103:9300",
"node_attributes" : {
"ml.machine_memory" : "22980575232",
"ml.max_open_jobs" : "20",
"xpack.installed" : "true",
"box_type" : "warm",
"ml.enabled" : "true"
},
"node_decision" : "no",
"deciders" : [
{
"decider" : "max_retry",
"decision" : "NO",
"explanation" : "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2020-02-10T20:53:42.002Z], failed_attempts[5], delayed=false, details[failed shard on node [cXzKDgu0TPqjEF-w9qJURw]: failed recovery, failure RecoveryFailedException[[.monitoring-es-6-2020.02.09][0]: Recovery failed from {ELK-01.hostname.com}{WNppBaICQTKHpO2DAedbpQ}{lKY-IuxnT3auVM50oh_IeA}{ELK-01.hostname.com}{192.168.98.102:9300}{ml.machine_memory=25103704064, ml.max_open_jobs=20, xpack.installed=true, box_type=hot, ml.enabled=true} into {ELK-02.hostname.com}{cXzKDgu0TPqjEF-w9qJURw}{nwT4vhVMSNKC_xthmClT_A}{ELK-02.hostname.com}{192.168.98.103:9300}{ml.machine_memory=22980575232, xpack.installed=true, box_type=warm, ml.max_open_jobs=20, ml.enabled=true}]; nested: RemoteTransportException[[ELK-01.hostname.com][192.168.98.102:9300][internal:index/shard/recovery/start_recovery]]; nested: RecoveryEngineException[Phase[1] phase1 failed]; nested: RecoverFilesRecoveryException[Failed to transfer [144] files with total size of [4.2gb]]; nested: RemoteTransportException[[ELK-02.hostname.com][192.168.98.103:9300][internal:index/shard/recovery/file_chunk]]; nested: CircuitBreakingException[[parent] Data too large, data for [<transport_request>] would be [8231262357/7.6gb], which is larger than the limit of [8231203635/7.6gb], usages [request=0/0b, fielddata=6623107425/6.1gb, in_flight_requests=1022428/998.4kb, accounting=1607132504/1.4gb]]; ], allocation_status[no_attempt]]]"
}
]
}
]
}
I've seen a couple of different recommendations for other situations where this error arises, such as changing the bulk request size and editing the jvm.options file to update the G1 GC settings. Since we are running Java 8 (which defaults to the CMS collector, not G1), I don't believe the G1 GC settings apply.