Elasticsearch Cluster Rebalance

After a cluster restart, all of my indices ended up with their primary shards on a single node (data02). How can I rebalance the cluster, and how can I get the primary shards spread across different nodes with the default settings?
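
For context, the current shard distribution can be checked with the _cat shards API (the column selection and the my_index-* pattern below are just examples, not requirements):

    GET _cat/shards/my_index-*?v&h=index,shard,prirep,state,node

Every row with prirep set to p currently shows data02 in the node column.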

Elasticsearch is 5.6.3. Below is the response from GET /_cluster/allocation/explain?include_yes_decisions=true:

    {
      "index": "my_index-2019.06.08",
      "shard": 0,
      "primary": true,
      "current_state": "started",
      "current_node": {
        "id": "W3pA3iD_T7CW9jbII78qZA",
        "name": "data02",
        "transport_address": "10.10.51.22:9300",
        "weight_ranking": 2
      },
      "can_remain_on_current_node": "yes",
      "can_rebalance_cluster": "no",
      "can_rebalance_cluster_decisions": [
        {
          "decider": "rebalance_only_when_active",
          "decision": "YES",
          "explanation": "rebalancing is allowed as all replicas are active in the cluster"
        },
        {
          "decider": "cluster_rebalance",
          "decision": "NO",
          "explanation": "the cluster has unassigned shards and cluster setting [cluster.routing.allocation.allow_rebalance] is set to [indices_all_active]"
        },
        {
          "decider": "concurrent_rebalance",
          "decision": "THROTTLE",
          "explanation": "reached the limit of concurrently rebalancing shards [4], cluster setting [cluster.routing.allocation.cluster_concurrent_rebalance=2]"
        },
        {
          "decider": "enable",
          "decision": "YES",
          "explanation": "all rebalancing is allowed"
        },
        {
          "decider": "snapshot_in_progress",
          "decision": "YES",
          "explanation": "no snapshots are currently running"
        }
      ],
      "can_rebalance_to_other_node": "no",
      "rebalance_explanation": "rebalancing is not allowed",
      "node_allocation_decisions": [
        {
          "node_id": "akXRnbSAQZGzi-PVIdKmKQ",
          "node_name": "data03",
          "transport_address": "10.10.51.23:9300",
          "node_decision": "no",
          "weight_ranking": 1,
          "deciders": [
            {
              "decider": "max_retry",
              "decision": "YES",
              "explanation": "shard has no previous failures"
            },
            {
              "decider": "replica_after_primary_active",
              "decision": "YES",
              "explanation": "shard is primary and can be allocated"
            },
            {
              "decider": "enable",
              "decision": "YES",
              "explanation": "all allocations are allowed"
            },
            {
              "decider": "node_version",
              "decision": "YES",
              "explanation": "target node version [5.6.3] is the same or newer than source node version [5.6.3]"
            },
            {
              "decider": "snapshot_in_progress",
              "decision": "YES",
              "explanation": "no snapshots are currently running"
            },
            {
              "decider": "filter",
              "decision": "YES",
              "explanation": "node passes include/exclude/require filters"
            },
            {
              "decider": "same_shard",
              "decision": "NO",
              "explanation": "the shard cannot be allocated to the same node on which a copy of the shard already exists [[my_index-2019.06.08][0], node[akXRnbSAQZGzi-PVIdKmKQ], [R], s[STARTED], a[id=CvVywjIcS8-KoB6nMB_mxQ]]"
            },
            {
              "decider": "disk_threshold",
              "decision": "YES",
              "explanation": "enough disk for shard on node, free: [3tb], shard size: [13.1gb], free after allocating shard: [3tb]"
            },
            {
              "decider": "throttling",
              "decision": "THROTTLE",
              "explanation": "reached the limit of incoming shard recoveries [2], cluster setting [cluster.routing.allocation.node_concurrent_incoming_recoveries=2] (can also be set via [cluster.routing.allocation.node_concurrent_recoveries])"
            },
            {
              "decider": "shards_limit",
              "decision": "YES",
              "explanation": "total shard limits are disabled: [index: -1, cluster: -1] <= 0"
            },
            {
              "decider": "awareness",
              "decision": "YES",
              "explanation": "allocation awareness is not enabled, set cluster setting [cluster.routing.allocation.awareness.attributes] to enable it"
            }
          ]
        },
        {
          "node_id": "2qEyND9UQOCJ9n_N43Pj3w",
          "node_name": "data04",
          "transport_address": "10.10.51.24:9300",
          "node_decision": "worse_balance",
          "weight_ranking": 3,
          "deciders": [
            {
              "decider": "max_retry",
              "decision": "YES",
              "explanation": "shard has no previous failures"
            },
            {
              "decider": "replica_after_primary_active",
              "decision": "YES",
              "explanation": "shard is primary and can be allocated"
            },
            {
              "decider": "enable",
              "decision": "YES",
              "explanation": "all allocations are allowed"
            },
            {
              "decider": "node_version",
              "decision": "YES",
              "explanation": "target node version [5.6.3] is the same or newer than source node version [5.6.3]"
            },
            {
              "decider": "snapshot_in_progress",
              "decision": "YES",
              "explanation": "no snapshots are currently running"
            },
            {
              "decider": "filter",
              "decision": "YES",
              "explanation": "node passes include/exclude/require filters"
            },
            {
              "decider": "same_shard",
              "decision": "YES",
              "explanation": "the shard does not exist on the same node"
            },
            {
              "decider": "disk_threshold",
              "decision": "YES",
              "explanation": "enough disk for shard on node, free: [3.8tb], shard size: [13.1gb], free after allocating shard: [3.8tb]"
            },
            {
              "decider": "throttling",
              "decision": "THROTTLE",
              "explanation": "reached the limit of outgoing shard recoveries [4] on the node [2qEyND9UQOCJ9n_N43Pj3w] which holds the primary, cluster setting [cluster.routing.allocation.node_concurrent_outgoing_recoveries=2] (can also be set via [cluster.routing.allocation.node_concurrent_recoveries])"
            },
            {
              "decider": "shards_limit",
              "decision": "YES",
              "explanation": "total shard limits are disabled: [index: -1, cluster: -1] <= 0"
            },
            {
              "decider": "awareness",
              "decision": "YES",
              "explanation": "allocation awareness is not enabled, set cluster setting [cluster.routing.allocation.awareness.attributes] to enable it"
            }
          ]
        },
        {
          "node_id": "ofWeBwLwSZGvWUDfDdZTyQ",
          "node_name": "data01",
          "transport_address": "10.10.51.21:9300",
          "node_decision": "worse_balance",
          "weight_ranking": 4,
          "deciders": [
            same as data04
          ]
        }
      ]
    }

It looks like the unassigned shards were the cause: the cluster was running in 'yellow' state, and with [cluster.routing.allocation.allow_rebalance] at its default of [indices_all_active], rebalancing is blocked until every shard is assigned.
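
To see which shards are unassigned and why, requests like these can be used (calling the allocation explain API with an empty body reports on the first unassigned shard it finds):

    GET _cat/shards?v&h=index,shard,prirep,state,unassigned.reason

    GET /_cluster/allocation/explain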

I used the POST command below to assign those shards (on 5.x the reroute command for an unassigned replica is allocate_replica), and once the cluster turned 'green' it started managing the rebalance on its own:

    POST _cluster/reroute?retry_failed=true
    {
      "commands": [
        {
          "allocate_replica": {
            "index": "my_index",
            "shard": 0,
            "node": "<NODE_NAME>"
          }
        }
      ]
    }
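
If rebalancing still proceeds too slowly because of the THROTTLE decisions shown above, the relevant limits can be raised temporarily, and a single shard can also be moved by hand with the reroute move command. This is only a sketch; the values and node names are examples:

    PUT _cluster/settings
    {
      "transient": {
        "cluster.routing.allocation.cluster_concurrent_rebalance": 4,
        "cluster.routing.allocation.node_concurrent_recoveries": 4
      }
    }

    POST _cluster/reroute
    {
      "commands": [
        {
          "move": {
            "index": "my_index-2019.06.08",
            "shard": 0,
            "from_node": "data02",
            "to_node": "data01"
          }
        }
      ]
    }

Remember to set the transient settings back to null afterwards so the cluster returns to its defaults.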

Ref -

  1. https://help.relativity.com/9.0/Content/Relativity/Data_Grid/Primary_shard_allocation_and_manual_rebalancing.htm
  2. https://logz.io/blog/elasticsearch-cheat-sheet/
  3. https://www.datadoghq.com/blog/elasticsearch-unassigned-shards/
