Elasticsearch reached the limit of ongoing initial primary recoveries

Hello, I've been monitoring my cluster's performance and found that the cluster health has turned red.
When I checked with GET _cluster/allocation/explain?pretty, I got this result:

"node_allocation_decisions" : [
    {
      "node_id" : "node_id",
      "node_name" : "node-data-01",
      "transport_address" : "ip:port",
      "node_attributes" : {
        "xpack.installed" : "true",
        "transform.node" : "false"
      },
      "node_decision" : "throttled",
      "weight_ranking" : 1,
      "deciders" : [
        {
          "decider" : "throttling",
          "decision" : "THROTTLE",
          "explanation" : "reached the limit of ongoing initial primary recoveries [4], cluster setting [cluster.routing.allocation.node_initial_primaries_recoveries=4]"
        }
      ]
    }

Based on the error message, I tried to change the cluster setting with this request:

PUT _cluster/settings
{
  "transient": {
    "cluster.routing.allocation.node_initial_primaries_recoveries": "5"
  }
}

I also ran POST /_cluster/reroute?retry_failed=true,
but the problem is still there. Does anyone have any clue about this?

Thanks in advance

What is the full output of the cluster stats API?

Thanks for the answer, @Christian_Dahlqvist. Here are my cluster stats:

{
  "_nodes" : {
    "total" : 7,
    "successful" : 7,
    "failed" : 0
  },
  "cluster_name" : "uat",
  "cluster_uuid" : "uuid",
  "timestamp" : 1637126841332,
  "status" : "yellow",
  "indices" : {
    "count" : 2514,
    "shards" : {
      "total" : 3250,
      "primaries" : 3214,
      "replication" : 0.01120099564405725,
      "index" : {
        "shards" : {
          "min" : 1,
          "max" : 3,
          "avg" : 1.292760540970565
        },
        "primaries" : {
          "min" : 1,
          "max" : 3,
          "avg" : 1.2784407319013524
        },
        "replication" : {
          "min" : 0.0,
          "max" : 1.0,
          "avg" : 0.014319809069212411
        }
      }
    },
    "docs" : {
      "count" : 1862269802,
      "deleted" : 4125828
    },
    "store" : {
      "size_in_bytes" : 327103220148,
      "reserved_in_bytes" : 0
    },
    "fielddata" : {
      "memory_size_in_bytes" : 4152,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size_in_bytes" : 5298776,
      "total_count" : 248706,
      "hit_count" : 27741,
      "miss_count" : 220965,
      "cache_size" : 1445,
      "cache_count" : 10067,
      "evictions" : 8622
    },
    "completion" : {
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 19193,
      "memory_in_bytes" : 199912930,
      "terms_memory_in_bytes" : 141097712,
      "stored_fields_memory_in_bytes" : 30182504,
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory_in_bytes" : 18956224,
      "points_memory_in_bytes" : 0,
      "doc_values_memory_in_bytes" : 9676490,
      "index_writer_memory_in_bytes" : 576597686,
      "version_map_memory_in_bytes" : 140238,
      "fixed_bit_set_memory_in_bytes" : 14248,
      "max_unsafe_auto_id_timestamp" : 1637021510573,
      "file_sizes" : { }
    },
    "mappings" : {
      "field_types" : [
        {
          "name" : "alias",
          "count" : 19,
          "index_count" : 3
        },
        {
          "name" : "boolean",
          "count" : 1636,
          "index_count" : 888
        },
        {
          "name" : "byte",
          "count" : 3,
          "index_count" : 3
        },
        {
          "name" : "constant_keyword",
          "count" : 2,
          "index_count" : 2
        },
        {
          "name" : "date",
          "count" : 3295,
          "index_count" : 2426
        },
        {
          "name" : "double",
          "count" : 413,
          "index_count" : 19
        },
        {
          "name" : "float",
          "count" : 551,
          "index_count" : 92
        },
        {
          "name" : "geo_point",
          "count" : 25,
          "index_count" : 4
        },
        {
          "name" : "half_float",
          "count" : 5,
          "index_count" : 1
        },
        {
          "name" : "integer",
          "count" : 2,
          "index_count" : 2
        },
        {
          "name" : "ip",
          "count" : 261,
          "index_count" : 64
        },
        {
          "name" : "keyword",
          "count" : 60831,
          "index_count" : 2476
        },
        {
          "name" : "long",
          "count" : 13008,
          "index_count" : 1494
        },
        {
          "name" : "nested",
          "count" : 3,
          "index_count" : 3
        },
        {
          "name" : "object",
          "count" : 15828,
          "index_count" : 2001
        },
        {
          "name" : "scaled_float",
          "count" : 408,
          "index_count" : 62
        },
        {
          "name" : "short",
          "count" : 98,
          "index_count" : 1
        },
        {
          "name" : "text",
          "count" : 53541,
          "index_count" : 2452
        }
      ]
    },
    "analysis" : {
      "char_filter_types" : [ ],
      "tokenizer_types" : [ ],
      "filter_types" : [ ],
      "analyzer_types" : [ ],
      "built_in_char_filters" : [ ],
      "built_in_tokenizers" : [ ],
      "built_in_filters" : [ ],
      "built_in_analyzers" : [ ]
    },
    "versions" : [
      {
        "version" : "6.5.4",
        "index_count" : 664,
        "primary_shard_count" : 1362,
        "total_primary_bytes" : 161695725301
      },
      {
        "version" : "7.11.2",
        "index_count" : 1900,
        "primary_shard_count" : 1902,
        "total_primary_bytes" : 165242458376
      }
    ]
  },
  "nodes" : {
    "count" : {
      "total" : 7,
      "coordinating_only" : 0,
      "data" : 4,
      "data_cold" : 0,
      "data_content" : 0,
      "data_hot" : 0,
      "data_warm" : 0,
      "ingest" : 4,
      "master" : 1,
      "ml" : 0,
      "remote_cluster_client" : 0,
      "transform" : 0,
      "voting_only" : 0
    },
    "versions" : [
      "7.11.2"
    ],
    "os" : {
      "available_processors" : 152,
      "allocated_processors" : 152,
      "names" : [
        {
          "name" : "Linux",
          "count" : 7
        }
      ],
      "pretty_names" : [
        {
          "pretty_name" : "Red Hat Enterprise Linux Server 7.9 (Maipo)",
          "count" : 7
        }
      ],
      "mem" : {
        "total_in_bytes" : 370188652544,
        "free_in_bytes" : 7645855744,
        "used_in_bytes" : 362542796800,
        "free_percent" : 2,
        "used_percent" : 98
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 2
      },
      "open_file_descriptors" : {
        "min" : 447,
        "max" : 6301,
        "avg" : 3809
      }
    },
    "jvm" : {
      "max_uptime_in_millis" : 1051705497,
      "versions" : [
        {
          "version" : "1.8.0_144",
          "vm_name" : "Java HotSpot(TM) 64-Bit Server VM",
          "vm_version" : "25.144-b01",
          "vm_vendor" : "Oracle Corporation",
          "bundled_jdk" : true,
          "using_bundled_jdk" : false,
          "count" : 3
        },
        {
          "version" : "15.0.1",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "15.0.1+9",
          "vm_vendor" : "AdoptOpenJDK",
          "bundled_jdk" : true,
          "using_bundled_jdk" : true,
          "count" : 4
        }
      ],
      "mem" : {
        "heap_used_in_bytes" : 17290953776,
        "heap_max_in_bytes" : 39553204224
      },
      "threads" : 1026
    },
    "fs" : {
      "total_in_bytes" : 790272499712,
      "free_in_bytes" : 601138151424,
      "available_in_bytes" : 560947294208
    },
    "plugins" : [
      {
        "name" : "repository-s3",
        "version" : "7.11.2",
        "elasticsearch_version" : "7.11.2",
        "java_version" : "1.8",
        "description" : "The S3 repository plugin adds S3 repositories",
        "classname" : "org.elasticsearch.repositories.s3.S3RepositoryPlugin",
        "extended_plugins" : [ ],
        "has_native_controller" : false,
        "licensed" : false,
        "type" : "isolated"
      }
    ],
    "network_types" : {
      "transport_types" : {
        "security4" : 7
      },
      "http_types" : {
        "security4" : 7
      }
    },
    "discovery_types" : {
      "zen" : 7
    },
    "packaging_types" : [
      {
        "flavor" : "default",
        "type" : "tar",
        "count" : 7
      }
    ],
    "ingest" : {
      "number_of_pipelines" : 28,
      "processor_stats" : {
        "conditional" : {
          "count" : 347758535,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 244798
        },
        "convert" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 0
        },
        "date" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 0
        },
        "dot_expander" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 0
        },
        "grok" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 0
        },
        "gsub" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 0
        },
        "json" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 0
        },
        "pipeline" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 0
        },
        "remove" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 0
        },
        "rename" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 0
        },
        "script" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 0
        },
        "set" : {
          "count" : 24598991,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 35591
        },
        "split" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 0
        }
      }
    }
  }
}

In the stats I sent, the health had become yellow, but now it has become red again.

Where is the cluster hosted? What type of hardware and storage are you using? What is the use case?

The cluster is hosted on a virtual machine, using HDD for storage.
The use case: daily indices, which stay in the initializing state and do not start for a very long time.

You have a lot of very small shards, and that is very inefficient. Given that you only have 4 data nodes, you are also approaching the limit, so I would recommend you look to reduce the number of daily indices and shards in the cluster. It is generally recommended to aim for an average shard size of at least 10GB, and yours are, if I calculate correctly, only around 100MB in size.

The fact that you are indexing into a lot of small indices and have slow disks probably also contributes to the problem. What do disk I/O and iowait look like on the data nodes?

Should I look at the I/O in Stack Monitoring, or is there an API to get this info?

The app teams want to send daily indices with daily index names, so I can't do much about this. In fact, my cluster once reached 6k shards, while this problem is happening at around 3k shards.
And yes, of course I am trying to tune the sharding, but that is still in progress. The issue I'm trying to solve right now is this one, because the cluster health still hasn't returned to green.

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.