Hi all,
After a node failure in a 3-node cluster, one of our indexes went into red state.
My solution to recover at least the primary shard was to run:
POST /_cluster/reroute
{
  "commands" : [
    {
      "allocate_stale_primary" : {
        "index" : "einv_prd_audit_20210112092343",
        "shard" : 0,
        "node" : "Prd-elasticsearch-Node3",
        "accept_data_loss" : "true"
      }
    }
  ]
}
After this, the primary shard recovered, but now the replicas won't recover.
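To see why, I checked the cluster allocation explain API for the replica copy. The request below is roughly what I used (reconstructed, so treat it as a sketch rather than the exact call):

GET /_cluster/allocation/explain
{
  "index" : "einv_prd_audit_20210112092343",
  "shard" : 0,
  "primary" : false
}

It returns: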
{
"index" : "einv_prd_audit_20210112092343",
"shard" : 0,
"primary" : false,
"current_state" : "unassigned",
"unassigned_info" : {
"reason" : "ALLOCATION_FAILED",
"at" : "2022-02-17T09:48:20.495Z",
"failed_allocation_attempts" : 5,
"details" : "failed shard on node [JSsSc1bJSCy9oFEtavpRgw]: failed recovery, failure RecoveryFailedException[[einv_prd_audit_20210112092343][0]: Recovery failed from {Prd-elasticsearch-Node3}{lw59frLMRMOc0zwwiX143A}{gtDK0yZjRiOsBX1odsjwvg}{192.168.25.208}{192.168.25.208:9300}{cdhilmrstw}{ml.machine_memory=12389928960, ml.max_open_jobs=20, xpack.installed=true, transform.node=true} into {Prd-elasticsearch-Node1}{JSsSc1bJSCy9oFEtavpRgw}{13xDG492Sna74NHX_Nrhrw}{192.168.25.206}{192.168.25.206:9300}{cdhilmrstw}{ml.machine_memory=12389928960, xpack.installed=true, transform.node=true, ml.max_open_jobs=20} (no activity after [30m])]; nested: ElasticsearchTimeoutException[no activity after [30m]]; ",
"last_allocation_status" : "no_attempt"
},
"can_allocate" : "no",
"allocate_explanation" : "cannot allocate because allocation is not permitted to any of the nodes",
"node_allocation_decisions" : [
{
"node_id" : "JSsSc1bJSCy9oFEtavpRgw",
"node_name" : "Prd-elasticsearch-Node1",
"transport_address" : "192.168.25.206:9300",
"node_attributes" : {
"ml.machine_memory" : "12389928960",
"ml.max_open_jobs" : "20",
"xpack.installed" : "true",
"transform.node" : "true"
},
"node_decision" : "no",
"deciders" : [
{
"decider" : "max_retry",
"decision" : "NO",
"explanation" : "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2022-02-17T09:48:20.495Z], failed_attempts[5], failed_nodes[[JSsSc1bJSCy9oFEtavpRgw, zANNQokbR26Lq71LG-Dyww]], delayed=false, details[failed shard on node [JSsSc1bJSCy9oFEtavpRgw]: failed recovery, failure RecoveryFailedException[[einv_prd_audit_20210112092343][0]: Recovery failed from {Prd-elasticsearch-Node3}{lw59frLMRMOc0zwwiX143A}{gtDK0yZjRiOsBX1odsjwvg}{192.168.25.208}{192.168.25.208:9300}{cdhilmrstw}{ml.machine_memory=12389928960, ml.max_open_jobs=20, xpack.installed=true, transform.node=true} into {Prd-elasticsearch-Node1}{JSsSc1bJSCy9oFEtavpRgw}{13xDG492Sna74NHX_Nrhrw}{192.168.25.206}{192.168.25.206:9300}{cdhilmrstw}{ml.machine_memory=12389928960, xpack.installed=true, transform.node=true, ml.max_open_jobs=20} (no activity after [30m])]; nested: ElasticsearchTimeoutException[no activity after [30m]]; ], allocation_status[no_attempt]]]"
}
]
},
{
"node_id" : "lw59frLMRMOc0zwwiX143A",
"node_name" : "Prd-elasticsearch-Node3",
"transport_address" : "192.168.25.208:9300",
"node_attributes" : {
"ml.machine_memory" : "12389928960",
"ml.max_open_jobs" : "20",
"xpack.installed" : "true",
"transform.node" : "true"
},
"node_decision" : "no",
"deciders" : [
{
"decider" : "max_retry",
"decision" : "NO",
"explanation" : "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2022-02-17T09:48:20.495Z], failed_attempts[5], failed_nodes[[JSsSc1bJSCy9oFEtavpRgw, zANNQokbR26Lq71LG-Dyww]], delayed=false, details[failed shard on node [JSsSc1bJSCy9oFEtavpRgw]: failed recovery, failure RecoveryFailedException[[einv_prd_audit_20210112092343][0]: Recovery failed from {Prd-elasticsearch-Node3}{lw59frLMRMOc0zwwiX143A}{gtDK0yZjRiOsBX1odsjwvg}{192.168.25.208}{192.168.25.208:9300}{cdhilmrstw}{ml.machine_memory=12389928960, ml.max_open_jobs=20, xpack.installed=true, transform.node=true} into {Prd-elasticsearch-Node1}{JSsSc1bJSCy9oFEtavpRgw}{13xDG492Sna74NHX_Nrhrw}{192.168.25.206}{192.168.25.206:9300}{cdhilmrstw}{ml.machine_memory=12389928960, xpack.installed=true, transform.node=true, ml.max_open_jobs=20} (no activity after [30m])]; nested: ElasticsearchTimeoutException[no activity after [30m]]; ], allocation_status[no_attempt]]]"
},
{
"decider" : "same_shard",
"decision" : "NO",
"explanation" : "a copy of this shard is already allocated to this node [[einv_prd_audit_20210112092343][0], node[lw59frLMRMOc0zwwiX143A], [P], s[STARTED], a[id=aIjCK_Y8QSSVZYdL22o23Q]]"
}
]
},
{
"node_id" : "zANNQokbR26Lq71LG-Dyww",
"node_name" : "Prd-elasticsearch-Node2",
"transport_address" : "192.168.25.207:9300",
"node_attributes" : {
"ml.machine_memory" : "12389928960",
"ml.max_open_jobs" : "20",
"xpack.installed" : "true",
"transform.node" : "true"
},
"node_decision" : "no",
"deciders" : [
{
"decider" : "max_retry",
"decision" : "NO",
"explanation" : "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2022-02-17T09:48:20.495Z], failed_attempts[5], failed_nodes[[JSsSc1bJSCy9oFEtavpRgw, zANNQokbR26Lq71LG-Dyww]], delayed=false, details[failed shard on node [JSsSc1bJSCy9oFEtavpRgw]: failed recovery, failure RecoveryFailedException[[einv_prd_audit_20210112092343][0]: Recovery failed from {Prd-elasticsearch-Node3}{lw59frLMRMOc0zwwiX143A}{gtDK0yZjRiOsBX1odsjwvg}{192.168.25.208}{192.168.25.208:9300}{cdhilmrstw}{ml.machine_memory=12389928960, ml.max_open_jobs=20, xpack.installed=true, transform.node=true} into {Prd-elasticsearch-Node1}{JSsSc1bJSCy9oFEtavpRgw}{13xDG492Sna74NHX_Nrhrw}{192.168.25.206}{192.168.25.206:9300}{cdhilmrstw}{ml.machine_memory=12389928960, xpack.installed=true, transform.node=true, ml.max_open_jobs=20} (no activity after [30m])]; nested: ElasticsearchTimeoutException[no activity after [30m]]; ], allocation_status[no_attempt]]]"
}
]
}
]
}
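The max_retry decider says to manually call /_cluster/reroute?retry_failed=true, so I assume the next step would be something like the request below, but given the translog errors in the logs (shown further down) I'm not sure it will succeed:

POST /_cluster/reroute?retry_failed=true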
In my Elasticsearch logs I got these entries:
[2022-02-17T13:50:24,874][WARN ][o.e.i.c.IndicesClusterStateService] [Prd-elasticsearch-Node1] [einv_prd_audit_20210112092343][0] marking and sending shard failed due to [failed recovery]
org.elasticsearch.indices.recovery.RecoveryFailedException: [einv_prd_audit_20210112092343][0]: Recovery failed from {Prd-elasticsearch-Node3}{lw59frLMRMOc0zwwiX143A}{gtDK0yZjRiOsBX1odsjwvg}{192.168.25.208}{192.168.25.208:9300}{cdhilmrstw}{ml.machine_memory=12389928960, ml.max_open_jobs=20, xpack.installed=true, transform.node=true} into {Prd-elasticsearch-Node1}{JSsSc1bJSCy9oFEtavpRgw}{13xDG492Sna74NHX_Nrhrw}{192.168.25.206}{192.168.25.206:9300}{cdhilmrstw}{ml.machine_memory=12389928960, xpack.installed=true, transform.node=true, ml.max_open_jobs=20} (no activity after [30m])
at org.elasticsearch.indices.recovery.RecoveriesCollection$RecoveryMonitor.doRun(RecoveriesCollection.java:282) [elasticsearch-7.10.0.jar:7.10.0]
at org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:737) [elasticsearch-7.10.0.jar:7.10.0]
at org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) [elasticsearch-7.10.0.jar:7.10.0]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1130) [?:?]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:630) [?:?]
at java.lang.Thread.run(Thread.java:832) [?:?]
Caused by: org.elasticsearch.ElasticsearchTimeoutException: no activity after [30m]
... 6 more
[2022-02-17T13:50:24,875][WARN ][o.e.i.c.IndicesClusterStateService] [Prd-elasticsearch-Node1] [einv_prd_audit_20210112092343][0] marking and sending shard failed due to [shard failure, reason [failed to recover from translog]]
org.elasticsearch.index.engine.EngineException: failed to recover from translog
at org.elasticsearch.index.engine.InternalEngine.recoverFromTranslogInternal(InternalEngine.java:501) ~[elasticsearch-7.10.0.jar:7.10.0]
at org.elasticsearch.index.engine.InternalEngine.recoverFromTranslog(InternalEngine.java:474) ~[elasticsearch-7.10.0.jar:7.10.0]
at org.elasticsearch.index.engine.InternalEngine.recoverFromTranslog(InternalEngine.java:125) ~[elasticsearch-7.10.0.jar:7.10.0]
at org.elasticsearch.index.shard.IndexShard.recoverLocallyUpToGlobalCheckpoint(IndexShard.java:1473) ~[elasticsearch-7.10.0.jar:7.10.0]
at org.elasticsearch.indices.recovery.PeerRecoveryTargetService.doRecovery(PeerRecoveryTargetService.java:194) ~[elasticsearch-7.10.0.jar:7.10.0]
at org.elasticsearch.indices.recovery.PeerRecoveryTargetService.access$1000(PeerRecoveryTargetService.java:84) ~[elasticsearch-7.10.0.jar:7.10.0]
at org.elasticsearch.indices.recovery.PeerRecoveryTargetService$RecoveryRunner.doRun(PeerRecoveryTargetService.java:539) ~[elasticsearch-7.10.0.jar:7.10.0]
at org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:737) ~[elasticsearch-7.10.0.jar:7.10.0]
at org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) ~[elasticsearch-7.10.0.jar:7.10.0]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1130) [?:?]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:630) [?:?]
at java.lang.Thread.run(Thread.java:832) [?:?]
Caused by: org.elasticsearch.index.shard.IllegalIndexShardStateException: CurrentState[CLOSED] operation only allowed when recovering, origin [LOCAL_TRANSLOG_RECOVERY]
at org.elasticsearch.index.shard.IndexShard.ensureWriteAllowed(IndexShard.java:1764) ~[elasticsearch-7.10.0.jar:7.10.0]
at org.elasticsearch.index.shard.IndexShard.applyIndexOperation(IndexShard.java:769) ~[elasticsearch-7.10.0.jar:7.10.0]
at org.elasticsearch.index.shard.IndexShard.applyTranslogOperation(IndexShard.java:1537) ~[elasticsearch-7.10.0.jar:7.10.0]
at org.elasticsearch.index.shard.IndexShard.runTranslogRecovery(IndexShard.java:1568) ~[elasticsearch-7.10.0.jar:7.10.0]
at org.elasticsearch.index.shard.IndexShard.lambda$recoverLocallyUpToGlobalCheckpoint$7(IndexShard.java:1467) ~[elasticsearch-7.10.0.jar:7.10.0]
at org.elasticsearch.index.engine.InternalEngine.recoverFromTranslogInternal(InternalEngine.java:499) ~[elasticsearch-7.10.0.jar:7.10.0]
... 11 more
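One workaround I was considering, but have not tried yet, is to drop the replica and let Elasticsearch rebuild it from the recovered primary, roughly like this (I'm not sure whether this is safe in my situation):

PUT /einv_prd_audit_20210112092343/_settings
{
  "index.number_of_replicas" : 0
}

and then, once the shard is gone, set it back:

PUT /einv_prd_audit_20210112092343/_settings
{
  "index.number_of_replicas" : 1
}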
Can someone help me recover the unassigned replicas?