We currently run Elasticsearch 7.6.1 on three machines, all with the node roles dilm (data, ingest, machine learning, master).
Our index is configured with three primary shards and two replicas. Lately we've noticed that shards are not being allocated on one node.
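For reference, this is roughly how we spot the unassigned copies and check the node roles before asking for an explanation (a minimal sketch; the host and port below are placeholders for however you reach the cluster):

# list shards that are not allocated, plus the reason Elasticsearch records for them
curl -s 'http://localhost:9200/_cat/shards?v&h=index,shard,prirep,state,unassigned.reason,node' | grep UNASSIGNED
# confirm the roles (dilm) reported by each node
curl -s 'http://localhost:9200/_cat/nodes?v&h=name,node.role,ip'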
When I run GET /_cluster/allocation/explain, I get the following output:
{
"index" : "logstash-2021.10.09",
"shard" : 1,
"primary" : false,
"current_state" : "unassigned",
"unassigned_info" : {
"reason" : "ALLOCATION_FAILED",
"at" : "2021-10-09T01:08:47.724Z",
"failed_allocation_attempts" : 5,
"details" : """failed shard on node [FoRT9BWgSIeSTIpyjIkK7w]: shard failure, reason [index id[M2NmYzU1NWEtY2VkMi00NmUwLTk4ZTMtZGJhYmFlODBiMWZl] origin[REPLICA] seq#[203428]], failure IllegalArgumentException[DocValuesField "_seq_no" appears more than once in this document (only one value is allowed per field)]""",
"last_allocation_status" : "no_attempt"
},
"can_allocate" : "no",
"allocate_explanation" : "cannot allocate because allocation is not permitted to any of the nodes",
"node_allocation_decisions" : [
{
"node_id" : "FoRT9BWgSIeSTIpyjIkK7w",
"node_name" : "elasticsearch-master-1",
"transport_address" : "10.244.9.21:9300",
"node_attributes" : {
"ml.machine_memory" : "12884901888",
"ml.max_open_jobs" : "20",
"xpack.installed" : "true"
},
"node_decision" : "no",
"deciders" : [
{
"decider" : "max_retry",
"decision" : "NO",
"explanation" : """shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2021-10-09T01:08:47.724Z], failed_attempts[5], failed_nodes[[FoRT9BWgSIeSTIpyjIkK7w]], delayed=false, details[failed shard on node [FoRT9BWgSIeSTIpyjIkK7w]: shard failure, reason [index id[M2NmYzU1NWEtY2VkMi00NmUwLTk4ZTMtZGJhYmFlODBiMWZl] origin[REPLICA] seq#[203428]], failure IllegalArgumentException[DocValuesField "_seq_no" appears more than once in this document (only one value is allowed per field)]], allocation_status[no_attempt]]]"""
}
]
},
{
"node_id" : "GQBcp41gQaivG98GWnqyeA",
"node_name" : "elasticsearch-master-2",
"transport_address" : "10.244.11.21:9300",
"node_attributes" : {
"ml.machine_memory" : "12884901888",
"ml.max_open_jobs" : "20",
"xpack.installed" : "true"
},
"node_decision" : "no",
"deciders" : [
{
"decider" : "max_retry",
"decision" : "NO",
"explanation" : """shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2021-10-09T01:08:47.724Z], failed_attempts[5], failed_nodes[[FoRT9BWgSIeSTIpyjIkK7w]], delayed=false, details[failed shard on node [FoRT9BWgSIeSTIpyjIkK7w]: shard failure, reason [index id[M2NmYzU1NWEtY2VkMi00NmUwLTk4ZTMtZGJhYmFlODBiMWZl] origin[REPLICA] seq#[203428]], failure IllegalArgumentException[DocValuesField "_seq_no" appears more than once in this document (only one value is allowed per field)]], allocation_status[no_attempt]]]"""
},
{
"decider" : "same_shard",
"decision" : "NO",
"explanation" : "the shard cannot be allocated to the same node on which a copy of the shard already exists [[logstash-2021.10.09][1], node[GQBcp41gQaivG98GWnqyeA], [P], s[STARTED], a[id=J99PeqvlR5qhJOXxLiG9oQ]]"
}
]
},
{
"node_id" : "bI1It63aTzGd8REJwbr8Lw",
"node_name" : "elasticsearch-master-0",
"transport_address" : "10.244.15.41:9300",
"node_attributes" : {
"ml.machine_memory" : "12884901888",
"xpack.installed" : "true",
"ml.max_open_jobs" : "20"
},
"node_decision" : "no",
"deciders" : [
{
"decider" : "max_retry",
"decision" : "NO",
"explanation" : """shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2021-10-09T01:08:47.724Z], failed_attempts[5], failed_nodes[[FoRT9BWgSIeSTIpyjIkK7w]], delayed=false, details[failed shard on node [FoRT9BWgSIeSTIpyjIkK7w]: shard failure, reason [index id[M2NmYzU1NWEtY2VkMi00NmUwLTk4ZTMtZGJhYmFlODBiMWZl] origin[REPLICA] seq#[203428]], failure IllegalArgumentException[DocValuesField "_seq_no" appears more than once in this document (only one value is allowed per field)]], allocation_status[no_attempt]]]"""
},
{
"decider" : "same_shard",
"decision" : "NO",
"explanation" : "the shard cannot be allocated to the same node on which a copy of the shard already exists [[logstash-2021.10.09][1], node[bI1It63aTzGd8REJwbr8Lw], [R], s[STARTED], a[id=LYBfDO5jRwqyMkbgrEvJpw]]"
}
]
}
]
}
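The max_retry decider above points at a manual retry once the underlying cause is addressed; for completeness, this is the call it refers to (again a sketch, with the host as a placeholder):

# clears the failed-allocation counter and asks the master to retry assigning the shard
curl -s -X POST 'http://localhost:9200/_cluster/reroute?retry_failed=true'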
Here's the output of GET /_cluster/stats:
{
"_nodes" : {
"total" : 3,
"successful" : 3,
"failed" : 0
},
"cluster_name" : "elasticsearch",
"cluster_uuid" : "VlWTsbX-SPOWCCBjkjcbKA",
"timestamp" : 1634197751864,
"status" : "red",
"indices" : {
"count" : 32,
"shards" : {
"total" : 248,
"primaries" : 93,
"replication" : 1.6666666666666667,
"index" : {
"shards" : {
"min" : 2,
"max" : 9,
"avg" : 7.75
},
"primaries" : {
"min" : 1,
"max" : 3,
"avg" : 2.90625
},
"replication" : {
"min" : 1.0,
"max" : 2.0,
"avg" : 1.6458333333333335
}
}
},
"docs" : {
"count" : 35391644,
"deleted" : 65449
},
"store" : {
"size_in_bytes" : 33812395169
},
"fielddata" : {
"memory_size_in_bytes" : 0,
"evictions" : 0
},
"query_cache" : {
"memory_size_in_bytes" : 410743,
"total_count" : 1829538,
"hit_count" : 223667,
"miss_count" : 1605871,
"cache_size" : 564,
"cache_count" : 3854,
"evictions" : 3290
},
"completion" : {
"size_in_bytes" : 0
},
"segments" : {
"count" : 1255,
"memory_in_bytes" : 49950841,
"terms_memory_in_bytes" : 25573661,
"stored_fields_memory_in_bytes" : 22275880,
"term_vectors_memory_in_bytes" : 0,
"norms_memory_in_bytes" : 1334144,
"points_memory_in_bytes" : 0,
"doc_values_memory_in_bytes" : 767156,
"index_writer_memory_in_bytes" : 16312408,
"version_map_memory_in_bytes" : 31408,
"fixed_bit_set_memory_in_bytes" : 976,
"max_unsafe_auto_id_timestamp" : 1634170437024,
"file_sizes" : { }
}
},
"nodes" : {
"count" : {
"total" : 3,
"coordinating_only" : 0,
"data" : 3,
"ingest" : 3,
"master" : 3,
"ml" : 3,
"voting_only" : 0
},
"versions" : [
"7.6.1"
],
"os" : {
"available_processors" : 3,
"allocated_processors" : 3,
"names" : [
{
"name" : "Linux",
"count" : 3
}
],
"pretty_names" : [
{
"pretty_name" : "CentOS Linux 7 (Core)",
"count" : 3
}
],
"mem" : {
"total_in_bytes" : 50362867712,
"free_in_bytes" : 4381495296,
"used_in_bytes" : 45981372416,
"free_percent" : 9,
"used_percent" : 91
}
},
"process" : {
"cpu" : {
"percent" : 0
},
"open_file_descriptors" : {
"min" : 814,
"max" : 2341,
"avg" : 1328
}
},
"jvm" : {
"max_uptime_in_millis" : 12005603937,
"versions" : [
{
"version" : "13.0.2",
"vm_name" : "OpenJDK 64-Bit Server VM",
"vm_version" : "13.0.2+8",
"vm_vendor" : "AdoptOpenJDK",
"bundled_jdk" : true,
"using_bundled_jdk" : true,
"count" : 3
}
],
"mem" : {
"heap_used_in_bytes" : 10277882608,
"heap_max_in_bytes" : 19301203968
},
"threads" : 154
},
"fs" : {
"total_in_bytes" : 947980271616,
"free_in_bytes" : 910911021056,
"available_in_bytes" : 862542307328
},
"plugins" : [ ],
"network_types" : {
"transport_types" : {
"security4" : 3
},
"http_types" : {
"security4" : 3
}
},
"discovery_types" : {
"zen" : 3
},
"packaging_types" : [
{
"flavor" : "default",
"type" : "docker",
"count" : 3
}
],
"ingest" : {
"number_of_pipelines" : 2,
"processor_stats" : {
"gsub" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time_in_millis" : 0
},
"script" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time_in_millis" : 0
}
}
}
}
}
This problem appears to affect only one specific node, even on a separate index.
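A shard of another index can be inspected the same way by passing a body to the explain API (sketch; the index name below is only an illustration):

# ask why a specific replica shard of another index is unassigned
curl -s -X GET 'http://localhost:9200/_cluster/allocation/explain' \
  -H 'Content-Type: application/json' \
  -d '{ "index": "logstash-2021.10.10", "shard": 0, "primary": false }'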