All the shards are being assigned to a single node

Hi..

We have a 6 data node cluster and we have around 2000 indices with 9500 shards.
We have the below cluster settings and have enabled all the shards to be re-balanced to distribute the shards across the cluster.

{
  "persistent" : {
    "cluster" : {
      "routing" : {
        "rebalance" : {
          "enable" : "all"
        },
        "allocation" : {
          "allow_rebalance" : "always",
          "cluster_concurrent_rebalance" : "10",
          "node_initial_primaries_recoveries" : "20",
          "enable" : "all"
        }
      },
      "max_shards_per_node" : "20000"
    },
    "xpack" : {
      "monitoring" : {
        "collection" : {
          "enabled" : "true"
        }
      }
    }
  },
  "transient" : {
    "cluster" : {
      "routing" : {
        "allocation" : {
          "enable" : "all",
          "exclude" : {
            "_name" : ""
          }
        }
      }
    }
  }
}

But most of the shards are being assigned to data-node1, and the rest of the nodes have very few shards allocated.
Please find the _cat/allocation result below:

shards disk.indices disk.used disk.avail disk.total disk.percent node
  4336          2tb       2tb    375.6gb      2.4tb           84 data-node1
  1174      813.2gb   834.7gb      1.6tb      2.4tb           33 data-node2
  1115          1tb       1tb      1.3tb      2.4tb           44 data-node3
   907      500.1gb   518.7gb      1.9tb      2.4tb           20 data-node4
   724      613.9gb   635.6gb      1.8tb      2.4tb           25 data-node5
  1209      781.9gb   803.3gb      1.6tb      2.4tb           32 data-node6

Please suggest what has to be done to have my cluster equally balanced.

Thanks,
Sanjay Reddy

What is the full output of the cluster stats API?

Hi @Christian_Dahlqvist,

Thanks for your response.
Please find the _cluster/stats output below:

{
  "_nodes" : {
    "total" : 10,
    "successful" : 10,
    "failed" : 0
  },
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "VHTnPH73uB2echXL5kwQ",
  "timestamp" : 1677765367930,
  "status" : "green",
  "indices" : {
    "count" : 2607,
    "shards" : {
      "total" : 9465,
      "primaries" : 9455,
      "replication" : 0.0010576414595452142,
      "index" : {
        "shards" : {
          "min" : 1,
          "max" : 5,
          "avg" : 3.630609896432681
        },
        "primaries" : {
          "min" : 1,
          "max" : 5,
          "avg" : 3.6267740698120443
        },
        "replication" : {
          "min" : 0.0,
          "max" : 1.0,
          "avg" : 0.0038358266206367474
        }
      }
    },
    "docs" : {
      "count" : 8289623365,
      "deleted" : 281019071
    },
    "store" : {
      "size" : "5.7tb",
      "size_in_bytes" : 6332913723434,
      "total_data_set_size" : "5.7tb",
      "total_data_set_size_in_bytes" : 6332913723434,
      "reserved" : "0b",
      "reserved_in_bytes" : 0
    },
    "fielddata" : {
      "memory_size" : "461.8mb",
      "memory_size_in_bytes" : 484290512,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size" : "1.8gb",
      "memory_size_in_bytes" : 1954442972,
      "total_count" : 2307688387,
      "hit_count" : 450921572,
      "miss_count" : 1856766815,
      "cache_size" : 882596,
      "cache_count" : 13705828,
      "evictions" : 12823232
    },
    "completion" : {
      "size" : "0b",
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 99107,
      "memory" : "4gb",
      "memory_in_bytes" : 4389254278,
      "terms_memory" : "3.8gb",
      "terms_memory_in_bytes" : 4094197672,
      "stored_fields_memory" : "86.2mb",
      "stored_fields_memory_in_bytes" : 90477496,
      "term_vectors_memory" : "0b",
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory" : "22.1mb",
      "norms_memory_in_bytes" : 23268160,
      "points_memory" : "0b",
      "points_memory_in_bytes" : 0,
      "doc_values_memory" : "172.9mb",
      "doc_values_memory_in_bytes" : 181310950,
      "index_writer_memory" : "48.1mb",
      "index_writer_memory_in_bytes" : 50449000,
      "version_map_memory" : "1.3mb",
      "version_map_memory_in_bytes" : 1380531,
      "fixed_bit_set" : "1.4kb",
      "fixed_bit_set_memory_in_bytes" : 1448,
      "max_unsafe_auto_id_timestamp" : 1677736849135,
      "file_sizes" : { }
    },
    "mappings" : {
      "field_types" : [
        {
          "name" : "boolean",
          "count" : 613,
          "index_count" : 336,
          "script_count" : 0
        },
        {
          "name" : "constant_keyword",
          "count" : 33,
          "index_count" : 11,
          "script_count" : 0
        },
        {
          "name" : "date",
          "count" : 7145,
          "index_count" : 2577,
          "script_count" : 0
        },
        {
          "name" : "float",
          "count" : 3076,
          "index_count" : 519,
          "script_count" : 0
        },
        {
          "name" : "geo_point",
          "count" : 2378,
          "index_count" : 2249,
          "script_count" : 0
        },
        {
          "name" : "half_float",
          "count" : 4372,
          "index_count" : 2186,
          "script_count" : 0
        },
        {
          "name" : "ip",
          "count" : 2453,
          "index_count" : 2260,
          "script_count" : 0
        },
        {
          "name" : "keyword",
          "count" : 166966,
          "index_count" : 2591,
          "script_count" : 0
        },
        {
          "name" : "long",
          "count" : 13052,
          "index_count" : 2521,
          "script_count" : 0
        },
        {
          "name" : "nested",
          "count" : 3,
          "index_count" : 3,
          "script_count" : 0
        },
        {
          "name" : "object",
          "count" : 17774,
          "index_count" : 2268,
          "script_count" : 0
        },
        {
          "name" : "text",
          "count" : 165997,
          "index_count" : 2589,
          "script_count" : 0
        },
        {
          "name" : "version",
          "count" : 3,
          "index_count" : 3,
          "script_count" : 0
        }
      ],
      "runtime_field_types" : [ ]
    },
    "analysis" : {
      "char_filter_types" : [ ],
      "tokenizer_types" : [ ],
      "filter_types" : [ ],
      "analyzer_types" : [ ],
      "built_in_char_filters" : [ ],
      "built_in_tokenizers" : [ ],
      "built_in_filters" : [ ],
      "built_in_analyzers" : [ ]
    },
    "versions" : [
      {
        "version" : "7.16.2",
        "index_count" : 65,
        "primary_shard_count" : 317,
        "total_primary_size" : "323.2gb",
        "total_primary_bytes" : 347089190328
      },
      {
        "version" : "7.16.3",
        "index_count" : 2542,
        "primary_shard_count" : 9138,
        "total_primary_size" : "5.4tb",
        "total_primary_bytes" : 5985779024133
      }
    ]
  },
  "nodes" : {
    "count" : {
      "total" : 10,
      "coordinating_only" : 3,
      "data" : 6,
      "data_cold" : 0,
      "data_content" : 0,
      "data_frozen" : 0,
      "data_hot" : 0,
      "data_warm" : 0,
      "ingest" : 0,
      "master" : 3,
      "ml" : 0,
      "remote_cluster_client" : 0,
      "transform" : 0,
      "voting_only" : 0
    },
    "versions" : [
      "7.16.3"
    ],
    "os" : {
      "available_processors" : 80,
      "allocated_processors" : 80,
      "names" : [
        {
          "name" : "Linux",
          "count" : 10
        }
      ],
      "pretty_names" : [
        {
          "pretty_name" : "Ubuntu 20.04.3 LTS",
          "count" : 5
        },
        {
          "pretty_name" : "Ubuntu 20.04.4 LTS",
          "count" : 3
        },
        {
          "pretty_name" : "Ubuntu 20.04.5 LTS",
          "count" : 2
        }
      ],
      "architectures" : [
        {
          "arch" : "amd64",
          "count" : 10
        }
      ],
      "mem" : {
        "total" : "621.9gb",
        "total_in_bytes" : 667768614912,
        "free" : "107.5gb",
        "free_in_bytes" : 115495567360,
        "used" : "514.3gb",
        "used_in_bytes" : 552273047552,
        "free_percent" : 17,
        "used_percent" : 83
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 96
      },
      "open_file_descriptors" : {
        "min" : 587,
        "max" : 46746,
        "avg" : 10763
      }
    },
    "jvm" : {
      "max_uptime" : "15.7d",
      "max_uptime_in_millis" : 1357439835,
      "versions" : [
        {
          "version" : "17.0.1",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "17.0.1+12",
          "vm_vendor" : "Eclipse Adoptium",
          "bundled_jdk" : true,
          "using_bundled_jdk" : true,
          "count" : 10
        }
      ],
      "mem" : {
        "heap_used" : "160.5gb",
        "heap_used_in_bytes" : 172407627288,
        "heap_max" : "320gb",
        "heap_max_in_bytes" : 343597383680
      },
      "threads" : 1356
    },
    "fs" : {
      "total" : "15.2tb",
      "total_in_bytes" : 16807466500096,
      "free" : "9.3tb",
      "free_in_bytes" : 10298586378240,
      "available" : "9.3tb",
      "available_in_bytes" : 10298552823808
    },
    "plugins" : [
      {
        "name" : "repository-s3",
        "version" : "7.16.3",
        "elasticsearch_version" : "7.16.3",
        "java_version" : "1.8",
        "description" : "The S3 repository plugin adds S3 repositories",
        "classname" : "org.elasticsearch.repositories.s3.S3RepositoryPlugin",
        "extended_plugins" : [ ],
        "has_native_controller" : false,
        "licensed" : false,
        "type" : "isolated"
      }
    ],
    "network_types" : {
      "transport_types" : {
        "security4" : 10
      },
      "http_types" : {
        "security4" : 10
      }
    },
    "discovery_types" : {
      "zen" : 10
    },
    "packaging_types" : [
      {
        "flavor" : "default",
        "type" : "deb",
        "count" : 10
      }
    ],
    "ingest" : {
      "number_of_pipelines" : 2,
      "processor_stats" : {
        "gsub" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "script" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        }
      }
    }
  }
}

You have far too many shards that are also too small and it's likely causing issues.

What sort of data is this?

I totally agree with that, @warkolm. In my case, I had increased the "cluster_concurrent_rebalance" value from the default to 10 for faster rebalancing of shards. As 10 shards were being rebalanced at a time, it overlooked the shard count and all the shards were being stored on one node.
After I reduced it to 2, all the shards are now balanced across the cluster.

I'll definitely work on reducing the number of shards in the cluster.

Thanks very much for your time.

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.