Problems with shard allocation

We currently run Elasticsearch 7.6.1 on three machines; every node has the roles dilm (data, ingest, machine learning, master).

Our index has three primary shards and two replicas. Lately we've noticed that shards are not being allocated to one of the nodes.
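
For reference, the shard and replica counts can be confirmed through the index settings API; the response below is trimmed to the two settings of interest and shows what we expect to see:

GET logstash-2021.10.09/_settings?filter_path=*.settings.index.number_of_shards,*.settings.index.number_of_replicas

{
  "logstash-2021.10.09" : {
    "settings" : {
      "index" : {
        "number_of_shards" : "3",
        "number_of_replicas" : "2"
      }
    }
  }
}
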
When I run GET /_cluster/allocation/explain, I get the following output:

{
  "index" : "logstash-2021.10.09",
  "shard" : 1,
  "primary" : false,
  "current_state" : "unassigned",
  "unassigned_info" : {
    "reason" : "ALLOCATION_FAILED",
    "at" : "2021-10-09T01:08:47.724Z",
    "failed_allocation_attempts" : 5,
    "details" : """failed shard on node [FoRT9BWgSIeSTIpyjIkK7w]: shard failure, reason [index id[M2NmYzU1NWEtY2VkMi00NmUwLTk4ZTMtZGJhYmFlODBiMWZl] origin[REPLICA] seq#[203428]], failure IllegalArgumentException[DocValuesField "_seq_no" appears more than once in this document (only one value is allowed per field)]""",
    "last_allocation_status" : "no_attempt"
  },
  "can_allocate" : "no",
  "allocate_explanation" : "cannot allocate because allocation is not permitted to any of the nodes",
  "node_allocation_decisions" : [
    {
      "node_id" : "FoRT9BWgSIeSTIpyjIkK7w",
      "node_name" : "elasticsearch-master-1",
      "transport_address" : "10.244.9.21:9300",
      "node_attributes" : {
        "ml.machine_memory" : "12884901888",
        "ml.max_open_jobs" : "20",
        "xpack.installed" : "true"
      },
      "node_decision" : "no",
      "deciders" : [
        {
          "decider" : "max_retry",
          "decision" : "NO",
          "explanation" : """shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2021-10-09T01:08:47.724Z], failed_attempts[5], failed_nodes[[FoRT9BWgSIeSTIpyjIkK7w]], delayed=false, details[failed shard on node [FoRT9BWgSIeSTIpyjIkK7w]: shard failure, reason [index id[M2NmYzU1NWEtY2VkMi00NmUwLTk4ZTMtZGJhYmFlODBiMWZl] origin[REPLICA] seq#[203428]], failure IllegalArgumentException[DocValuesField "_seq_no" appears more than once in this document (only one value is allowed per field)]], allocation_status[no_attempt]]]"""
        }
      ]
    },
    {
      "node_id" : "GQBcp41gQaivG98GWnqyeA",
      "node_name" : "elasticsearch-master-2",
      "transport_address" : "10.244.11.21:9300",
      "node_attributes" : {
        "ml.machine_memory" : "12884901888",
        "ml.max_open_jobs" : "20",
        "xpack.installed" : "true"
      },
      "node_decision" : "no",
      "deciders" : [
        {
          "decider" : "max_retry",
          "decision" : "NO",
          "explanation" : """shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2021-10-09T01:08:47.724Z], failed_attempts[5], failed_nodes[[FoRT9BWgSIeSTIpyjIkK7w]], delayed=false, details[failed shard on node [FoRT9BWgSIeSTIpyjIkK7w]: shard failure, reason [index id[M2NmYzU1NWEtY2VkMi00NmUwLTk4ZTMtZGJhYmFlODBiMWZl] origin[REPLICA] seq#[203428]], failure IllegalArgumentException[DocValuesField "_seq_no" appears more than once in this document (only one value is allowed per field)]], allocation_status[no_attempt]]]"""
        },
        {
          "decider" : "same_shard",
          "decision" : "NO",
          "explanation" : "the shard cannot be allocated to the same node on which a copy of the shard already exists [[logstash-2021.10.09][1], node[GQBcp41gQaivG98GWnqyeA], [P], s[STARTED], a[id=J99PeqvlR5qhJOXxLiG9oQ]]"
        }
      ]
    },
    {
      "node_id" : "bI1It63aTzGd8REJwbr8Lw",
      "node_name" : "elasticsearch-master-0",
      "transport_address" : "10.244.15.41:9300",
      "node_attributes" : {
        "ml.machine_memory" : "12884901888",
        "xpack.installed" : "true",
        "ml.max_open_jobs" : "20"
      },
      "node_decision" : "no",
      "deciders" : [
        {
          "decider" : "max_retry",
          "decision" : "NO",
          "explanation" : """shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2021-10-09T01:08:47.724Z], failed_attempts[5], failed_nodes[[FoRT9BWgSIeSTIpyjIkK7w]], delayed=false, details[failed shard on node [FoRT9BWgSIeSTIpyjIkK7w]: shard failure, reason [index id[M2NmYzU1NWEtY2VkMi00NmUwLTk4ZTMtZGJhYmFlODBiMWZl] origin[REPLICA] seq#[203428]], failure IllegalArgumentException[DocValuesField "_seq_no" appears more than once in this document (only one value is allowed per field)]], allocation_status[no_attempt]]]"""
        },
        {
          "decider" : "same_shard",
          "decision" : "NO",
          "explanation" : "the shard cannot be allocated to the same node on which a copy of the shard already exists [[logstash-2021.10.09][1], node[bI1It63aTzGd8REJwbr8Lw], [R], s[STARTED], a[id=LYBfDO5jRwqyMkbgrEvJpw]]"
        }
      ]
    }
  ]
}

Here's the output of GET /_cluster/stats:

{
  "_nodes" : {
    "total" : 3,
    "successful" : 3,
    "failed" : 0
  },
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "VlWTsbX-SPOWCCBjkjcbKA",
  "timestamp" : 1634197751864,
  "status" : "red",
  "indices" : {
    "count" : 32,
    "shards" : {
      "total" : 248,
      "primaries" : 93,
      "replication" : 1.6666666666666667,
      "index" : {
        "shards" : {
          "min" : 2,
          "max" : 9,
          "avg" : 7.75
        },
        "primaries" : {
          "min" : 1,
          "max" : 3,
          "avg" : 2.90625
        },
        "replication" : {
          "min" : 1.0,
          "max" : 2.0,
          "avg" : 1.6458333333333335
        }
      }
    },
    "docs" : {
      "count" : 35391644,
      "deleted" : 65449
    },
    "store" : {
      "size_in_bytes" : 33812395169
    },
    "fielddata" : {
      "memory_size_in_bytes" : 0,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size_in_bytes" : 410743,
      "total_count" : 1829538,
      "hit_count" : 223667,
      "miss_count" : 1605871,
      "cache_size" : 564,
      "cache_count" : 3854,
      "evictions" : 3290
    },
    "completion" : {
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 1255,
      "memory_in_bytes" : 49950841,
      "terms_memory_in_bytes" : 25573661,
      "stored_fields_memory_in_bytes" : 22275880,
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory_in_bytes" : 1334144,
      "points_memory_in_bytes" : 0,
      "doc_values_memory_in_bytes" : 767156,
      "index_writer_memory_in_bytes" : 16312408,
      "version_map_memory_in_bytes" : 31408,
      "fixed_bit_set_memory_in_bytes" : 976,
      "max_unsafe_auto_id_timestamp" : 1634170437024,
      "file_sizes" : { }
    }
  },
  "nodes" : {
    "count" : {
      "total" : 3,
      "coordinating_only" : 0,
      "data" : 3,
      "ingest" : 3,
      "master" : 3,
      "ml" : 3,
      "voting_only" : 0
    },
    "versions" : [
      "7.6.1"
    ],
    "os" : {
      "available_processors" : 3,
      "allocated_processors" : 3,
      "names" : [
        {
          "name" : "Linux",
          "count" : 3
        }
      ],
      "pretty_names" : [
        {
          "pretty_name" : "CentOS Linux 7 (Core)",
          "count" : 3
        }
      ],
      "mem" : {
        "total_in_bytes" : 50362867712,
        "free_in_bytes" : 4381495296,
        "used_in_bytes" : 45981372416,
        "free_percent" : 9,
        "used_percent" : 91
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 0
      },
      "open_file_descriptors" : {
        "min" : 814,
        "max" : 2341,
        "avg" : 1328
      }
    },
    "jvm" : {
      "max_uptime_in_millis" : 12005603937,
      "versions" : [
        {
          "version" : "13.0.2",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "13.0.2+8",
          "vm_vendor" : "AdoptOpenJDK",
          "bundled_jdk" : true,
          "using_bundled_jdk" : true,
          "count" : 3
        }
      ],
      "mem" : {
        "heap_used_in_bytes" : 10277882608,
        "heap_max_in_bytes" : 19301203968
      },
      "threads" : 154
    },
    "fs" : {
      "total_in_bytes" : 947980271616,
      "free_in_bytes" : 910911021056,
      "available_in_bytes" : 862542307328
    },
    "plugins" : [ ],
    "network_types" : {
      "transport_types" : {
        "security4" : 3
      },
      "http_types" : {
        "security4" : 3
      }
    },
    "discovery_types" : {
      "zen" : 3
    },
    "packaging_types" : [
      {
        "flavor" : "default",
        "type" : "docker",
        "count" : 3
      }
    ],
    "ingest" : {
      "number_of_pipelines" : 2,
      "processor_stats" : {
        "gsub" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 0
        },
        "script" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 0
        }
      }
    }
  }
}

This problem appears to affect only one specific node; we see it even for a separate index.
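
If it helps, a quick way to see which shard copies are unassigned, and why, is the cat shards API, for example:

GET _cat/shards/logstash-*?v&h=index,shard,prirep,state,node,unassigned.reason

For the shard above, the state is UNASSIGNED and the reason is ALLOCATION_FAILED, matching the allocation explain output.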

Welcome to our community! :smiley:

Is that correct?

Yes, that is correct. It makes sense, as two copies of the same shard cannot be assigned to the same node, and with one primary and two replicas every one of our three nodes has to hold a copy of each shard.

The real problem appears to be this, though, right?

IllegalArgumentException[DocValuesField "_seq_no" appears more than once in this document (only one value is allowed per field)]
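
Since the shard has already hit the retry limit of 5, once the underlying cause is dealt with the failed allocations have to be retried manually, exactly as the decider message suggests:

POST /_cluster/reroute?retry_failed=true

If the replica keeps failing with the same _seq_no error, one common way to get a clean copy is to drop that replica and let it be rebuilt from the primary, for example by temporarily lowering and then restoring the replica count for the affected index:

PUT logstash-2021.10.09/_settings
{ "index" : { "number_of_replicas" : 1 } }

PUT logstash-2021.10.09/_settings
{ "index" : { "number_of_replicas" : 2 } }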
