Logs to investigate constant shard initialization

Hello,

I would like to enable more verbose logs to help investigate what is causing constant shard initialization tasks in my cluster.

I've set the following without any luck:
org.elasticsearch.cluster.routing: TRACE
org.elasticsearch.transport: DEBUG
org.elasticsearch.discovery: DEBUG

Is there another logger that could provide more information on why there are always such tasks in the cluster? Also, what are the indicators that would show that it was caused by rebalance or node availability?

Welcome to our community! :smiley:

Can we start with the output from the _cluster/stats?pretty&human API, to get a better idea of your cluster?

Hello @warkolm,

here is the information you've requested:

{
  "_nodes" : {
    "total" : 82,
    "successful" : 82,
    "failed" : 0
  },
  "cluster_name" : "es-cluster",
  "cluster_uuid" : "kelkfjewflsjan-m--4gIA",
  "timestamp" : 1630887322520,
  "status" : "yellow",
  "indices" : {
    "count" : 3302,
    "shards" : {
      "total" : 12892,
      "primaries" : 6474,
      "replication" : 0.991350015446401,
      "index" : {
        "shards" : {
          "min" : 1,
          "max" : 120,
          "avg" : 3.9043004239854633
        },
        "primaries" : {
          "min" : 1,
          "max" : 60,
          "avg" : 1.9606299212598426
        },
        "replication" : {
          "min" : 0.0,
          "max" : 2.0,
          "avg" : 0.9933121340601655
        }
      }
    },
    "docs" : {
      "count" : 190691346275,
      "deleted" : 14123071412
    },
    "store" : {
      "size" : "257.4tb",
      "size_in_bytes" : 283081033649221
    },
    "fielddata" : {
      "memory_size" : "771.5gb",
      "memory_size_in_bytes" : 828408349128,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size" : "245.2gb",
      "memory_size_in_bytes" : 263360361663,
      "total_count" : 52790536308,
      "hit_count" : 2448634476,
      "miss_count" : 50341901832,
      "cache_size" : 2190101,
      "cache_count" : 891705001,
      "evictions" : 889514900
    },
    "completion" : {
      "size" : "0b",
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 311551,
      "memory" : "223.3gb",
      "memory_in_bytes" : 239813767943,
      "terms_memory" : "112.7gb",
      "terms_memory_in_bytes" : 121034671162,
      "stored_fields_memory" : "71.2gb",
      "stored_fields_memory_in_bytes" : 76453497472,
      "term_vectors_memory" : "0b",
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory" : "64.8mb",
      "norms_memory_in_bytes" : 67994304,
      "points_memory" : "38gb",
      "points_memory_in_bytes" : 40891775859,
      "doc_values_memory" : "1.2gb",
      "doc_values_memory_in_bytes" : 1365829146,
      "index_writer_memory" : "3.2gb",
      "index_writer_memory_in_bytes" : 3466346404,
      "version_map_memory" : "206.7mb",
      "version_map_memory_in_bytes" : 216829898,
      "fixed_bit_set" : "79.6mb",
      "fixed_bit_set_memory_in_bytes" : 83486560,
      "max_unsafe_auto_id_timestamp" : 1630887238442,
      "file_sizes" : { }
    }
  },
  "nodes" : {
    "count" : {
      "total" : 82,
      "coordinating_only" : 0,
      "data" : 79,
      "ingest" : 79,
      "master" : 3,
      "ml" : 82,
      "voting_only" : 0
    },
    "versions" : [
      "7.4.2"
    ],
    "os" : {
      "available_processors" : 1968,
      "allocated_processors" : 1968,
      "names" : [
        {
          "name" : "Linux",
          "count" : 82
        }
      ],
      "pretty_names" : [
        {
          "pretty_name" : "CentOS Linux 7 (Core)",
          "count" : 82
        }
      ],
      "mem" : {
        "total" : "10tb",
        "total_in_bytes" : 11076833697792,
        "free" : "477.7gb",
        "free_in_bytes" : 512974430208,
        "used" : "9.6tb",
        "used_in_bytes" : 10563859267584,
        "free_percent" : 5,
        "used_percent" : 95
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 1274
      },
      "open_file_descriptors" : {
        "min" : 2355,
        "max" : 6472,
        "avg" : 5451
      }
    },
    "jvm" : {
      "max_uptime" : "10.4d",
      "max_uptime_in_millis" : 903373624,
      "versions" : [
        {
          "version" : "13.0.1",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "13.0.1+9",
          "vm_vendor" : "AdoptOpenJDK",
          "bundled_jdk" : true,
          "using_bundled_jdk" : true,
          "count" : 82
        }
      ],
      "mem" : {
        "heap_used" : "1.8tb",
        "heap_used_in_bytes" : 2004458201224,
        "heap_max" : "2.5tb",
        "heap_max_in_bytes" : 2814747082752
      },
      "threads" : 18192
    },
    "fs" : {
      "total" : "404.9tb",
      "total_in_bytes" : 445241552904192,
      "free" : "145.4tb",
      "free_in_bytes" : 159914219859968,
      "available" : "144.6tb",
      "available_in_bytes" : 159032375832576
    },
    "plugins" : [
      {
        "name" : "repository-s3",
        "version" : "7.4.2",
        "elasticsearch_version" : "7.4.2",
        "java_version" : "1.8",
        "description" : "The S3 repository plugin adds S3 repositories",
        "classname" : "org.elasticsearch.repositories.s3.S3RepositoryPlugin",
        "extended_plugins" : [ ],
        "has_native_controller" : false
      }
    ],
    "network_types" : {
      "transport_types" : {
        "security4" : 82
      },
      "http_types" : {
        "security4" : 82
      }
    },
    "discovery_types" : {
      "zen" : 82
    },
    "packaging_types" : [
      {
        "flavor" : "default",
        "type" : "docker",
        "count" : 82
      }
    ]
  }
}

I suspect that the index balance setting is configured too strictly:

"cluster.routing.allocation.balance.index" : "2.7f"

Another possibility is node problems causing the cluster to declare them offline.

Anyway, it would be great to know which logger records the events that trigger these shard allocations.

Ok first up, 7.4 is EOL. Please upgrade Elasticsearch and your JVM.

Other than that I don't know specific loggers that can provide this level of information.
Are there any relevant entries in your master logs?

Thanks, @warkolm, removing data and adding more nodes seems to have helped, which raises the question of rebalancing stability as we would expect most rebalances to occur on nodes above the low disk watermark. We also noticed that the rebalance was causing nodes to cross the high disk watermark.

Anyways, we will plan the upgrade as we might gain improvements on this part.