Hello, I'm looking for direction on this error.
Last week, after cleaning and re-instantiating a cluster, the following error appeared:
RemoteTransportException[[es-data-hot-z2-1-x0hxwanma57yajfoe61amoek8][10.0.34.36:9300][internal:index/shard/recovery/start_recovery]]; nested: RemoteTransportException[[File corruption occurred on recovery but checksums are ok]]
I have no clue where to begin.
The ELK stack is instantiated in a Docker Swarm environment.
The IP addresses mentioned belong to the Docker overlay network, in case that helps.
Useful details below:
GET _cluster/allocation/explain?pretty
{
"index" : ".ds-pfs_cachehttp-000004",
"shard" : 1,
"primary" : false,
"current_state" : "unassigned",
"unassigned_info" : {
"reason" : "ALLOCATION_FAILED",
"at" : "2021-09-06T11:30:50.408Z",
"failed_allocation_attempts" : 5,
"details" : "failed shard on node [zG1R0rbIQl6X8Bt3A9tczw]: failed recovery, failure RecoveryFailedException[[.ds-pfs_cachehttp-000004][1]: Recovery failed from {es-data-hot-z2-1-x0hxwanma57yajfoe61amoek8}{OmvEo07cTTegiaRlL36KVg}{p6ODtzb5QP-xQu9vR5FWYA}{10.0.34.36}{10.0.34.36:9300}{cdhilrstw}{ml.machine_memory=17179869184, rack=zone2, ml.max_open_jobs=20, xpack.installed=true, transform.node=true} into {es-data-hot-z1-2-2e7hi8dql06atwgh5pu05bqmp}{zG1R0rbIQl6X8Bt3A9tczw}{X4nnGcNsSOqw2tmJAS3Qvw}{10.0.34.45}{10.0.34.45:9300}{cdhilrstw}{ml.machine_memory=17179869184, rack=zone1, xpack.installed=true, transform.node=true, ml.max_open_jobs=20}]; nested: RemoteTransportException[[es-data-hot-z2-1-x0hxwanma57yajfoe61amoek8][10.0.34.36:9300][internal:index/shard/recovery/start_recovery]]; nested: RemoteTransportException[[File corruption occurred on recovery but checksums are ok]]; ",
"last_allocation_status" : "no_attempt"
},
"can_allocate" : "no",
"allocate_explanation" : "cannot allocate because allocation is not permitted to any of the nodes",
"node_allocation_decisions" : [
{
"node_id" : "55yKGeAvTQOeAeXkwKsrrw",
"node_name" : "es-data-hot-z1-1-p9cf9jinorunhzipsu0jp4hev",
"transport_address" : "10.0.34.34:9300",
"node_attributes" : {
"ml.machine_memory" : "17179869184",
"rack" : "zone1",
"ml.max_open_jobs" : "20",
"xpack.installed" : "true",
"transform.node" : "true"
},
"node_decision" : "no",
"deciders" : [
{
"decider" : "max_retry",
"decision" : "NO",
"explanation" : "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2021-09-06T11:30:50.408Z], failed_attempts[5], failed_nodes[[6cFblf1ISzC7Bc46G9V-0w, zG1R0rbIQl6X8Bt3A9tczw, 55yKGeAvTQOeAeXkwKsrrw]], delayed=false, details[failed shard on node [zG1R0rbIQl6X8Bt3A9tczw]: failed recovery, failure RecoveryFailedException[[.ds-pfs_cachehttp-000004][1]: Recovery failed from {es-data-hot-z2-1-x0hxwanma57yajfoe61amoek8}{OmvEo07cTTegiaRlL36KVg}{p6ODtzb5QP-xQu9vR5FWYA}{10.0.34.36}{10.0.34.36:9300}{cdhilrstw}{ml.machine_memory=17179869184, rack=zone2, ml.max_open_jobs=20, xpack.installed=true, transform.node=true} into {es-data-hot-z1-2-2e7hi8dql06atwgh5pu05bqmp}{zG1R0rbIQl6X8Bt3A9tczw}{X4nnGcNsSOqw2tmJAS3Qvw}{10.0.34.45}{10.0.34.45:9300}{cdhilrstw}{ml.machine_memory=17179869184, rack=zone1, xpack.installed=true, transform.node=true, ml.max_open_jobs=20}]; nested: RemoteTransportException[[es-data-hot-z2-1-x0hxwanma57yajfoe61amoek8][10.0.34.36:9300][internal:index/shard/recovery/start_recovery]]; nested: RemoteTransportException[[File corruption occurred on recovery but checksums are ok]]; ], allocation_status[no_attempt]]]"
}
]
},
{
"node_id" : "6cFblf1ISzC7Bc46G9V-0w",
"node_name" : "es-data-hot-z1-2-p9cf9jinorunhzipsu0jp4hev",
"transport_address" : "10.0.34.47:9300",
"node_attributes" : {
"ml.machine_memory" : "17179869184",
"rack" : "zone1",
"ml.max_open_jobs" : "20",
"xpack.installed" : "true",
"transform.node" : "true"
},
"node_decision" : "no",
"deciders" : [
{
"decider" : "max_retry",
"decision" : "NO",
"explanation" : "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2021-09-06T11:30:50.408Z], failed_attempts[5], failed_nodes[[6cFblf1ISzC7Bc46G9V-0w, zG1R0rbIQl6X8Bt3A9tczw, 55yKGeAvTQOeAeXkwKsrrw]], delayed=false, details[failed shard on node [zG1R0rbIQl6X8Bt3A9tczw]: failed recovery, failure RecoveryFailedException[[.ds-pfs_cachehttp-000004][1]: Recovery failed from {es-data-hot-z2-1-x0hxwanma57yajfoe61amoek8}{OmvEo07cTTegiaRlL36KVg}{p6ODtzb5QP-xQu9vR5FWYA}{10.0.34.36}{10.0.34.36:9300}{cdhilrstw}{ml.machine_memory=17179869184, rack=zone2, ml.max_open_jobs=20, xpack.installed=true, transform.node=true} into {es-data-hot-z1-2-2e7hi8dql06atwgh5pu05bqmp}{zG1R0rbIQl6X8Bt3A9tczw}{X4nnGcNsSOqw2tmJAS3Qvw}{10.0.34.45}{10.0.34.45:9300}{cdhilrstw}{ml.machine_memory=17179869184, rack=zone1, xpack.installed=true, transform.node=true, ml.max_open_jobs=20}]; nested: RemoteTransportException[[es-data-hot-z2-1-x0hxwanma57yajfoe61amoek8][10.0.34.36:9300][internal:index/shard/recovery/start_recovery]]; nested: RemoteTransportException[[File corruption occurred on recovery but checksums are ok]]; ], allocation_status[no_attempt]]]"
}
]
},
{
"node_id" : "8CCfzfIlQfmGWK7VdYpWGw",
"node_name" : "es-data-hot-z2-1-8c8ds36js47jcu6a9p5juz90a",
"transport_address" : "10.0.34.38:9300",
"node_attributes" : {
"ml.machine_memory" : "17179869184",
"rack" : "zone2",
"ml.max_open_jobs" : "20",
"xpack.installed" : "true",
"transform.node" : "true"
},
"node_decision" : "no",
"deciders" : [
{
"decider" : "max_retry",
"decision" : "NO",
"explanation" : "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2021-09-06T11:30:50.408Z], failed_attempts[5], failed_nodes[[6cFblf1ISzC7Bc46G9V-0w, zG1R0rbIQl6X8Bt3A9tczw, 55yKGeAvTQOeAeXkwKsrrw]], delayed=false, details[failed shard on node [zG1R0rbIQl6X8Bt3A9tczw]: failed recovery, failure RecoveryFailedException[[.ds-pfs_cachehttp-000004][1]: Recovery failed from {es-data-hot-z2-1-x0hxwanma57yajfoe61amoek8}{OmvEo07cTTegiaRlL36KVg}{p6ODtzb5QP-xQu9vR5FWYA}{10.0.34.36}{10.0.34.36:9300}{cdhilrstw}{ml.machine_memory=17179869184, rack=zone2, ml.max_open_jobs=20, xpack.installed=true, transform.node=true} into {es-data-hot-z1-2-2e7hi8dql06atwgh5pu05bqmp}{zG1R0rbIQl6X8Bt3A9tczw}{X4nnGcNsSOqw2tmJAS3Qvw}{10.0.34.45}{10.0.34.45:9300}{cdhilrstw}{ml.machine_memory=17179869184, rack=zone1, xpack.installed=true, transform.node=true, ml.max_open_jobs=20}]; nested: RemoteTransportException[[es-data-hot-z2-1-x0hxwanma57yajfoe61amoek8][10.0.34.36:9300][internal:index/shard/recovery/start_recovery]]; nested: RemoteTransportException[[File corruption occurred on recovery but checksums are ok]]; ], allocation_status[no_attempt]]]"
},
{
"decider" : "awareness",
"decision" : "NO",
"explanation" : "there are too many copies of the shard allocated to nodes with attribute [rack], there are [2] total configured shard copies for this shard id and [3] total attribute values, expected the allocated shard count per attribute [2] to be less than or equal to the upper bound of the required number of shards per attribute [1]"
}
]
},
{
"node_id" : "IhnjpdG-Rbq_n0caOqOlCQ",
"node_name" : "es-data-hot-z1-1-2e7hi8dql06atwgh5pu05bqmp",
"transport_address" : "10.0.34.32:9300",
"node_attributes" : {
"ml.machine_memory" : "17179869184",
"rack" : "zone1",
"ml.max_open_jobs" : "20",
"xpack.installed" : "true",
"transform.node" : "true"
},
"node_decision" : "no",
"deciders" : [
{
"decider" : "max_retry",
"decision" : "NO",
"explanation" : "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2021-09-06T11:30:50.408Z], failed_attempts[5], failed_nodes[[6cFblf1ISzC7Bc46G9V-0w, zG1R0rbIQl6X8Bt3A9tczw, 55yKGeAvTQOeAeXkwKsrrw]], delayed=false, details[failed shard on node [zG1R0rbIQl6X8Bt3A9tczw]: failed recovery, failure RecoveryFailedException[[.ds-pfs_cachehttp-000004][1]: Recovery failed from {es-data-hot-z2-1-x0hxwanma57yajfoe61amoek8}{OmvEo07cTTegiaRlL36KVg}{p6ODtzb5QP-xQu9vR5FWYA}{10.0.34.36}{10.0.34.36:9300}{cdhilrstw}{ml.machine_memory=17179869184, rack=zone2, ml.max_open_jobs=20, xpack.installed=true, transform.node=true} into {es-data-hot-z1-2-2e7hi8dql06atwgh5pu05bqmp}{zG1R0rbIQl6X8Bt3A9tczw}{X4nnGcNsSOqw2tmJAS3Qvw}{10.0.34.45}{10.0.34.45:9300}{cdhilrstw}{ml.machine_memory=17179869184, rack=zone1, xpack.installed=true, transform.node=true, ml.max_open_jobs=20}]; nested: RemoteTransportException[[es-data-hot-z2-1-x0hxwanma57yajfoe61amoek8][10.0.34.36:9300][internal:index/shard/recovery/start_recovery]]; nested: RemoteTransportException[[File corruption occurred on recovery but checksums are ok]]; ], allocation_status[no_attempt]]]"
}
]
},
{
"node_id" : "M_HG7azrRASwJo5xVRk-gw",
"node_name" : "es-data-hot-z1-1-qz8oolh9idypl5vod0raah86k",
"transport_address" : "10.0.34.33:9300",
"node_attributes" : {
"ml.machine_memory" : "17179869184",
"rack" : "zone1",
"ml.max_open_jobs" : "20",
"xpack.installed" : "true",
"transform.node" : "true"
},
"node_decision" : "no",
"deciders" : [
{
"decider" : "max_retry",
"decision" : "NO",
"explanation" : "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2021-09-06T11:30:50.408Z], failed_attempts[5], failed_nodes[[6cFblf1ISzC7Bc46G9V-0w, zG1R0rbIQl6X8Bt3A9tczw, 55yKGeAvTQOeAeXkwKsrrw]], delayed=false, details[failed shard on node [zG1R0rbIQl6X8Bt3A9tczw]: failed recovery, failure RecoveryFailedException[[.ds-pfs_cachehttp-000004][1]: Recovery failed from {es-data-hot-z2-1-x0hxwanma57yajfoe61amoek8}{OmvEo07cTTegiaRlL36KVg}{p6ODtzb5QP-xQu9vR5FWYA}{10.0.34.36}{10.0.34.36:9300}{cdhilrstw}{ml.machine_memory=17179869184, rack=zone2, ml.max_open_jobs=20, xpack.installed=true, transform.node=true} into {es-data-hot-z1-2-2e7hi8dql06atwgh5pu05bqmp}{zG1R0rbIQl6X8Bt3A9tczw}{X4nnGcNsSOqw2tmJAS3Qvw}{10.0.34.45}{10.0.34.45:9300}{cdhilrstw}{ml.machine_memory=17179869184, rack=zone1, xpack.installed=true, transform.node=true, ml.max_open_jobs=20}]; nested: RemoteTransportException[[es-data-hot-z2-1-x0hxwanma57yajfoe61amoek8][10.0.34.36:9300][internal:index/shard/recovery/start_recovery]]; nested: RemoteTransportException[[File corruption occurred on recovery but checksums are ok]]; ], allocation_status[no_attempt]]]"
}
]
},
...