All the shards are being assigned to a single node

Hi..

We have a cluster with 6 data nodes, holding around 2000 indices with about 9500 shards.
We are using the cluster settings below and have enabled rebalancing for all shards so that they are distributed across the cluster.

{
  "persistent" : {
    "cluster" : {
      "routing" : {
        "rebalance" : {
          "enable" : "all"
        },
        "allocation" : {
          "allow_rebalance" : "always",
          "cluster_concurrent_rebalance" : "10",
          "node_initial_primaries_recoveries" : "20",
          "enable" : "all"
        }
      },
      "max_shards_per_node" : "20000"
    },
    "xpack" : {
      "monitoring" : {
        "collection" : {
          "enabled" : "true"
        }
      }
    }
  },
  "transient" : {
    "cluster" : {
      "routing" : {
        "allocation" : {
          "enable" : "all",
          "exclude" : {
            "_name" : ""
          }
        }
      }
    }
  }
}

But most of the shards are being assigned to data-node1, and the rest of the nodes have very few shards allocated.
Please find the _cat/allocation result below:

shards disk.indices disk.used disk.avail disk.total disk.percent node
  4336          2tb       2tb    375.6gb      2.4tb           84 data-node1
  1174      813.2gb   834.7gb      1.6tb      2.4tb           33 data-node2
  1115          1tb       1tb      1.3tb      2.4tb           44 data-node3
   907      500.1gb   518.7gb      1.9tb      2.4tb           20 data-node4
   724      613.9gb   635.6gb      1.8tb      2.4tb           25 data-node5
  1209      781.9gb   803.3gb      1.6tb      2.4tb           32 data-node6

Please suggest what has to be done to have my cluster equally balanced.

Thanks,
Sanjay Reddy

What is the full output of the cluster stats API?

Hi @Christian_Dahlqvist,

Thanks for your response.
Please find the _cluster/stats output below

{
  "_nodes" : {
    "total" : 10,
    "successful" : 10,
    "failed" : 0
  },
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "VHTnPH73uB2echXL5kwQ",
  "timestamp" : 1677765367930,
  "status" : "green",
  "indices" : {
    "count" : 2607,
    "shards" : {
      "total" : 9465,
      "primaries" : 9455,
      "replication" : 0.0010576414595452142,
      "index" : {
        "shards" : {
          "min" : 1,
          "max" : 5,
          "avg" : 3.630609896432681
        },
        "primaries" : {
          "min" : 1,
          "max" : 5,
          "avg" : 3.6267740698120443
        },
        "replication" : {
          "min" : 0.0,
          "max" : 1.0,
          "avg" : 0.0038358266206367474
        }
      }
    },
    "docs" : {
      "count" : 8289623365,
      "deleted" : 281019071
    },
    "store" : {
      "size" : "5.7tb",
      "size_in_bytes" : 6332913723434,
      "total_data_set_size" : "5.7tb",
      "total_data_set_size_in_bytes" : 6332913723434,
      "reserved" : "0b",
      "reserved_in_bytes" : 0
    },
    "fielddata" : {
      "memory_size" : "461.8mb",
      "memory_size_in_bytes" : 484290512,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size" : "1.8gb",
      "memory_size_in_bytes" : 1954442972,
      "total_count" : 2307688387,
      "hit_count" : 450921572,
      "miss_count" : 1856766815,
      "cache_size" : 882596,
      "cache_count" : 13705828,
      "evictions" : 12823232
    },
    "completion" : {
      "size" : "0b",
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 99107,
      "memory" : "4gb",
      "memory_in_bytes" : 4389254278,
      "terms_memory" : "3.8gb",
      "terms_memory_in_bytes" : 4094197672,
      "stored_fields_memory" : "86.2mb",
      "stored_fields_memory_in_bytes" : 90477496,
      "term_vectors_memory" : "0b",
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory" : "22.1mb",
      "norms_memory_in_bytes" : 23268160,
      "points_memory" : "0b",
      "points_memory_in_bytes" : 0,
      "doc_values_memory" : "172.9mb",
      "doc_values_memory_in_bytes" : 181310950,
      "index_writer_memory" : "48.1mb",
      "index_writer_memory_in_bytes" : 50449000,
      "version_map_memory" : "1.3mb",
      "version_map_memory_in_bytes" : 1380531,
      "fixed_bit_set" : "1.4kb",
      "fixed_bit_set_memory_in_bytes" : 1448,
      "max_unsafe_auto_id_timestamp" : 1677736849135,
      "file_sizes" : { }
    },
    "mappings" : {
      "field_types" : [
        {
          "name" : "boolean",
          "count" : 613,
          "index_count" : 336,
          "script_count" : 0
        },
        {
          "name" : "constant_keyword",
          "count" : 33,
          "index_count" : 11,
          "script_count" : 0
        },
        {
          "name" : "date",
          "count" : 7145,
          "index_count" : 2577,
          "script_count" : 0
        },
        {
          "name" : "float",
          "count" : 3076,
          "index_count" : 519,
          "script_count" : 0
        },
        {
          "name" : "geo_point",
          "count" : 2378,
          "index_count" : 2249,
          "script_count" : 0
        },
        {
          "name" : "half_float",
          "count" : 4372,
          "index_count" : 2186,
          "script_count" : 0
        },
        {
          "name" : "ip",
          "count" : 2453,
          "index_count" : 2260,
          "script_count" : 0
        },
        {
          "name" : "keyword",
          "count" : 166966,
          "index_count" : 2591,
          "script_count" : 0
        },
        {
          "name" : "long",
          "count" : 13052,
          "index_count" : 2521,
          "script_count" : 0
        },
        {
          "name" : "nested",
          "count" : 3,
          "index_count" : 3,
          "script_count" : 0
        },
        {
          "name" : "object",
          "count" : 17774,
          "index_count" : 2268,
          "script_count" : 0
        },
        {
          "name" : "text",
          "count" : 165997,
          "index_count" : 2589,
          "script_count" : 0
        },
        {
          "name" : "version",
          "count" : 3,
          "index_count" : 3,
          "script_count" : 0
        }
      ],
      "runtime_field_types" : [ ]
    },
    "analysis" : {
      "char_filter_types" : [ ],
      "tokenizer_types" : [ ],
      "filter_types" : [ ],
      "analyzer_types" : [ ],
      "built_in_char_filters" : [ ],
      "built_in_tokenizers" : [ ],
      "built_in_filters" : [ ],
      "built_in_analyzers" : [ ]
    },
    "versions" : [
      {
        "version" : "7.16.2",
        "index_count" : 65,
        "primary_shard_count" : 317,
        "total_primary_size" : "323.2gb",
        "total_primary_bytes" : 347089190328
      },
      {
        "version" : "7.16.3",
        "index_count" : 2542,
        "primary_shard_count" : 9138,
        "total_primary_size" : "5.4tb",
        "total_primary_bytes" : 5985779024133
      }
    ]
  },
  "nodes" : {
    "count" : {
      "total" : 10,
      "coordinating_only" : 3,
      "data" : 6,
      "data_cold" : 0,
      "data_content" : 0,
      "data_frozen" : 0,
      "data_hot" : 0,
      "data_warm" : 0,
      "ingest" : 0,
      "master" : 3,
      "ml" : 0,
      "remote_cluster_client" : 0,
      "transform" : 0,
      "voting_only" : 0
    },
    "versions" : [
      "7.16.3"
    ],
    "os" : {
      "available_processors" : 80,
      "allocated_processors" : 80,
      "names" : [
        {
          "name" : "Linux",
          "count" : 10
        }
      ],
      "pretty_names" : [
        {
          "pretty_name" : "Ubuntu 20.04.3 LTS",
          "count" : 5
        },
        {
          "pretty_name" : "Ubuntu 20.04.4 LTS",
          "count" : 3
        },
        {
          "pretty_name" : "Ubuntu 20.04.5 LTS",
          "count" : 2
        }
      ],
      "architectures" : [
        {
          "arch" : "amd64",
          "count" : 10
        }
      ],
      "mem" : {
        "total" : "621.9gb",
        "total_in_bytes" : 667768614912,
        "free" : "107.5gb",
        "free_in_bytes" : 115495567360,
        "used" : "514.3gb",
        "used_in_bytes" : 552273047552,
        "free_percent" : 17,
        "used_percent" : 83
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 96
      },
      "open_file_descriptors" : {
        "min" : 587,
        "max" : 46746,
        "avg" : 10763
      }
    },
    "jvm" : {
      "max_uptime" : "15.7d",
      "max_uptime_in_millis" : 1357439835,
      "versions" : [
        {
          "version" : "17.0.1",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "17.0.1+12",
          "vm_vendor" : "Eclipse Adoptium",
          "bundled_jdk" : true,
          "using_bundled_jdk" : true,
          "count" : 10
        }
      ],
      "mem" : {
        "heap_used" : "160.5gb",
        "heap_used_in_bytes" : 172407627288,
        "heap_max" : "320gb",
        "heap_max_in_bytes" : 343597383680
      },
      "threads" : 1356
    },
    "fs" : {
      "total" : "15.2tb",
      "total_in_bytes" : 16807466500096,
      "free" : "9.3tb",
      "free_in_bytes" : 10298586378240,
      "available" : "9.3tb",
      "available_in_bytes" : 10298552823808
    },
    "plugins" : [
      {
        "name" : "repository-s3",
        "version" : "7.16.3",
        "elasticsearch_version" : "7.16.3",
        "java_version" : "1.8",
        "description" : "The S3 repository plugin adds S3 repositories",
        "classname" : "org.elasticsearch.repositories.s3.S3RepositoryPlugin",
        "extended_plugins" : [ ],
        "has_native_controller" : false,
        "licensed" : false,
        "type" : "isolated"
      }
    ],
    "network_types" : {
      "transport_types" : {
        "security4" : 10
      },
      "http_types" : {
        "security4" : 10
      }
    },
    "discovery_types" : {
      "zen" : 10
    },
    "packaging_types" : [
      {
        "flavor" : "default",
        "type" : "deb",
        "count" : 10
      }
    ],
    "ingest" : {
      "number_of_pipelines" : 2,
      "processor_stats" : {
        "gsub" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "script" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        }
      }
    }
  }
}

You have far too many shards that are also too small and it's likely causing issues.

What sort of data is this?

I totally agree with that, @warkolm. In my case, I had increased the "cluster_concurrent_rebalance" value from the default to 10 for faster rebalancing of shards. With 10 shards rebalancing at a time, it seems to have overlooked the shard count, and most of the shards were being stored on one node.
When I reduced it to 2, all the shards became balanced across the cluster.

I'll definitely work on reducing the number of shards in the cluster.

Thanks very much for your time.