Problems with shard allocation

We currently run Elasticsearch 7.6.1 on three machines, all with the roles dilm (data, ingest, ml, master).

Our index has three primary shards and two replicas. Lately we've noticed that shards are not being allocated on one of the nodes.
When I run the GET /_cluster/allocation/explain request, I get the response below.
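
For reference, the explain request can be scoped to the problem shard; the body below is a sketch with the values taken from the response (a bodyless GET also works and explains the first unassigned shard it finds):

GET /_cluster/allocation/explain
{
  "index": "logstash-2021.10.09",
  "shard": 1,
  "primary": false
}

And the response: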

{
  "index" : "logstash-2021.10.09",
  "shard" : 1,
  "primary" : false,
  "current_state" : "unassigned",
  "unassigned_info" : {
    "reason" : "ALLOCATION_FAILED",
    "at" : "2021-10-09T01:08:47.724Z",
    "failed_allocation_attempts" : 5,
    "details" : """failed shard on node [FoRT9BWgSIeSTIpyjIkK7w]: shard failure, reason [index id[M2NmYzU1NWEtY2VkMi00NmUwLTk4ZTMtZGJhYmFlODBiMWZl] origin[REPLICA] seq#[203428]], failure IllegalArgumentException[DocValuesField "_seq_no" appears more than once in this document (only one value is allowed per field)]""",
    "last_allocation_status" : "no_attempt"
  },
  "can_allocate" : "no",
  "allocate_explanation" : "cannot allocate because allocation is not permitted to any of the nodes",
  "node_allocation_decisions" : [
    {
      "node_id" : "FoRT9BWgSIeSTIpyjIkK7w",
      "node_name" : "elasticsearch-master-1",
      "transport_address" : "10.244.9.21:9300",
      "node_attributes" : {
        "ml.machine_memory" : "12884901888",
        "ml.max_open_jobs" : "20",
        "xpack.installed" : "true"
      },
      "node_decision" : "no",
      "deciders" : [
        {
          "decider" : "max_retry",
          "decision" : "NO",
          "explanation" : """shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2021-10-09T01:08:47.724Z], failed_attempts[5], failed_nodes[[FoRT9BWgSIeSTIpyjIkK7w]], delayed=false, details[failed shard on node [FoRT9BWgSIeSTIpyjIkK7w]: shard failure, reason [index id[M2NmYzU1NWEtY2VkMi00NmUwLTk4ZTMtZGJhYmFlODBiMWZl] origin[REPLICA] seq#[203428]], failure IllegalArgumentException[DocValuesField "_seq_no" appears more than once in this document (only one value is allowed per field)]], allocation_status[no_attempt]]]"""
        }
      ]
    },
    {
      "node_id" : "GQBcp41gQaivG98GWnqyeA",
      "node_name" : "elasticsearch-master-2",
      "transport_address" : "10.244.11.21:9300",
      "node_attributes" : {
        "ml.machine_memory" : "12884901888",
        "ml.max_open_jobs" : "20",
        "xpack.installed" : "true"
      },
      "node_decision" : "no",
      "deciders" : [
        {
          "decider" : "max_retry",
          "decision" : "NO",
          "explanation" : """shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2021-10-09T01:08:47.724Z], failed_attempts[5], failed_nodes[[FoRT9BWgSIeSTIpyjIkK7w]], delayed=false, details[failed shard on node [FoRT9BWgSIeSTIpyjIkK7w]: shard failure, reason [index id[M2NmYzU1NWEtY2VkMi00NmUwLTk4ZTMtZGJhYmFlODBiMWZl] origin[REPLICA] seq#[203428]], failure IllegalArgumentException[DocValuesField "_seq_no" appears more than once in this document (only one value is allowed per field)]], allocation_status[no_attempt]]]"""
        },
        {
          "decider" : "same_shard",
          "decision" : "NO",
          "explanation" : "the shard cannot be allocated to the same node on which a copy of the shard already exists [[logstash-2021.10.09][1], node[GQBcp41gQaivG98GWnqyeA], [P], s[STARTED], a[id=J99PeqvlR5qhJOXxLiG9oQ]]"
        }
      ]
    },
    {
      "node_id" : "bI1It63aTzGd8REJwbr8Lw",
      "node_name" : "elasticsearch-master-0",
      "transport_address" : "10.244.15.41:9300",
      "node_attributes" : {
        "ml.machine_memory" : "12884901888",
        "xpack.installed" : "true",
        "ml.max_open_jobs" : "20"
      },
      "node_decision" : "no",
      "deciders" : [
        {
          "decider" : "max_retry",
          "decision" : "NO",
          "explanation" : """shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2021-10-09T01:08:47.724Z], failed_attempts[5], failed_nodes[[FoRT9BWgSIeSTIpyjIkK7w]], delayed=false, details[failed shard on node [FoRT9BWgSIeSTIpyjIkK7w]: shard failure, reason [index id[M2NmYzU1NWEtY2VkMi00NmUwLTk4ZTMtZGJhYmFlODBiMWZl] origin[REPLICA] seq#[203428]], failure IllegalArgumentException[DocValuesField "_seq_no" appears more than once in this document (only one value is allowed per field)]], allocation_status[no_attempt]]]"""
        },
        {
          "decider" : "same_shard",
          "decision" : "NO",
          "explanation" : "the shard cannot be allocated to the same node on which a copy of the shard already exists [[logstash-2021.10.09][1], node[bI1It63aTzGd8REJwbr8Lw], [R], s[STARTED], a[id=LYBfDO5jRwqyMkbgrEvJpw]]"
        }
      ]
    }
  ]
}
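
The max_retry decider above says allocation has already failed 5 times and points at retrying it manually; for reference, the call it recommends is simply:

POST /_cluster/reroute?retry_failed=true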

Here's the output of GET /_cluster/stats:

{
  "_nodes" : {
    "total" : 3,
    "successful" : 3,
    "failed" : 0
  },
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "VlWTsbX-SPOWCCBjkjcbKA",
  "timestamp" : 1634197751864,
  "status" : "red",
  "indices" : {
    "count" : 32,
    "shards" : {
      "total" : 248,
      "primaries" : 93,
      "replication" : 1.6666666666666667,
      "index" : {
        "shards" : {
          "min" : 2,
          "max" : 9,
          "avg" : 7.75
        },
        "primaries" : {
          "min" : 1,
          "max" : 3,
          "avg" : 2.90625
        },
        "replication" : {
          "min" : 1.0,
          "max" : 2.0,
          "avg" : 1.6458333333333335
        }
      }
    },
    "docs" : {
      "count" : 35391644,
      "deleted" : 65449
    },
    "store" : {
      "size_in_bytes" : 33812395169
    },
    "fielddata" : {
      "memory_size_in_bytes" : 0,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size_in_bytes" : 410743,
      "total_count" : 1829538,
      "hit_count" : 223667,
      "miss_count" : 1605871,
      "cache_size" : 564,
      "cache_count" : 3854,
      "evictions" : 3290
    },
    "completion" : {
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 1255,
      "memory_in_bytes" : 49950841,
      "terms_memory_in_bytes" : 25573661,
      "stored_fields_memory_in_bytes" : 22275880,
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory_in_bytes" : 1334144,
      "points_memory_in_bytes" : 0,
      "doc_values_memory_in_bytes" : 767156,
      "index_writer_memory_in_bytes" : 16312408,
      "version_map_memory_in_bytes" : 31408,
      "fixed_bit_set_memory_in_bytes" : 976,
      "max_unsafe_auto_id_timestamp" : 1634170437024,
      "file_sizes" : { }
    }
  },
  "nodes" : {
    "count" : {
      "total" : 3,
      "coordinating_only" : 0,
      "data" : 3,
      "ingest" : 3,
      "master" : 3,
      "ml" : 3,
      "voting_only" : 0
    },
    "versions" : [
      "7.6.1"
    ],
    "os" : {
      "available_processors" : 3,
      "allocated_processors" : 3,
      "names" : [
        {
          "name" : "Linux",
          "count" : 3
        }
      ],
      "pretty_names" : [
        {
          "pretty_name" : "CentOS Linux 7 (Core)",
          "count" : 3
        }
      ],
      "mem" : {
        "total_in_bytes" : 50362867712,
        "free_in_bytes" : 4381495296,
        "used_in_bytes" : 45981372416,
        "free_percent" : 9,
        "used_percent" : 91
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 0
      },
      "open_file_descriptors" : {
        "min" : 814,
        "max" : 2341,
        "avg" : 1328
      }
    },
    "jvm" : {
      "max_uptime_in_millis" : 12005603937,
      "versions" : [
        {
          "version" : "13.0.2",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "13.0.2+8",
          "vm_vendor" : "AdoptOpenJDK",
          "bundled_jdk" : true,
          "using_bundled_jdk" : true,
          "count" : 3
        }
      ],
      "mem" : {
        "heap_used_in_bytes" : 10277882608,
        "heap_max_in_bytes" : 19301203968
      },
      "threads" : 154
    },
    "fs" : {
      "total_in_bytes" : 947980271616,
      "free_in_bytes" : 910911021056,
      "available_in_bytes" : 862542307328
    },
    "plugins" : [ ],
    "network_types" : {
      "transport_types" : {
        "security4" : 3
      },
      "http_types" : {
        "security4" : 3
      }
    },
    "discovery_types" : {
      "zen" : 3
    },
    "packaging_types" : [
      {
        "flavor" : "default",
        "type" : "docker",
        "count" : 3
      }
    ],
    "ingest" : {
      "number_of_pipelines" : 2,
      "processor_stats" : {
        "gsub" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 0
        },
        "script" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 0
        }
      }
    }
  }
}

This problem appears to affect only one specific node, and it happens even with a separate index.
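
For reference, the per-node shard layout can be checked with the cat shards API; this is just a sketch, and the column selection is optional:

GET /_cat/shards/logstash-*?v&h=index,shard,prirep,state,unassigned.reason,node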

Welcome to our community! :smiley:

Is that correct?

Yes, this is correct. It makes sense: with three nodes and two replicas there are three copies of each shard, and since two copies of the same shard cannot be assigned to the same node, every node has to hold one copy.

The real problem appears to be this though, right?