Logs to investigate constant shard initialization

Maycon_Santos · September 3, 2021, 4:58pm

Hello,

I would like to enable more verbose logs to help investigate what is causing constant shard initialization tasks in my cluster.

I've set the following without any luck:
org.elasticsearch.cluster.routing: TRACE
org.elasticsearch.transport: DEBUG
org.elasticsearch.discovery: DEBUG

Is there another logger that could provide more information on why there are always such tasks in the cluster? Also, what are the indicators that would show that it was caused by rebalance or node availability?

warkolm · September 5, 2021, 10:27pm

Welcome to our community!

Can we start with the output from the _cluster/stats?pretty&human API, to get a better idea of your cluster?

Maycon_Santos · September 6, 2021, 7:04am

Hello @warkolm,

here is the information you've requested:

{
  "_nodes" : {
    "total" : 82,
    "successful" : 82,
    "failed" : 0
  },
  "cluster_name" : "es-cluster",
  "cluster_uuid" : "kelkfjewflsjan-m--4gIA",
  "timestamp" : 1630887322520,
  "status" : "yellow",
  "indices" : {
    "count" : 3302,
    "shards" : {
      "total" : 12892,
      "primaries" : 6474,
      "replication" : 0.991350015446401,
      "index" : {
        "shards" : {
          "min" : 1,
          "max" : 120,
          "avg" : 3.9043004239854633
        },
        "primaries" : {
          "min" : 1,
          "max" : 60,
          "avg" : 1.9606299212598426
        },
        "replication" : {
          "min" : 0.0,
          "max" : 2.0,
          "avg" : 0.9933121340601655
        }
      }
    },
    "docs" : {
      "count" : 190691346275,
      "deleted" : 14123071412
    },
    "store" : {
      "size" : "257.4tb",
      "size_in_bytes" : 283081033649221
    },
    "fielddata" : {
      "memory_size" : "771.5gb",
      "memory_size_in_bytes" : 828408349128,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size" : "245.2gb",
      "memory_size_in_bytes" : 263360361663,
      "total_count" : 52790536308,
      "hit_count" : 2448634476,
      "miss_count" : 50341901832,
      "cache_size" : 2190101,
      "cache_count" : 891705001,
      "evictions" : 889514900
    },
    "completion" : {
      "size" : "0b",
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 311551,
      "memory" : "223.3gb",
      "memory_in_bytes" : 239813767943,
      "terms_memory" : "112.7gb",
      "terms_memory_in_bytes" : 121034671162,
      "stored_fields_memory" : "71.2gb",
      "stored_fields_memory_in_bytes" : 76453497472,
      "term_vectors_memory" : "0b",
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory" : "64.8mb",
      "norms_memory_in_bytes" : 67994304,
      "points_memory" : "38gb",
      "points_memory_in_bytes" : 40891775859,
      "doc_values_memory" : "1.2gb",
      "doc_values_memory_in_bytes" : 1365829146,
      "index_writer_memory" : "3.2gb",
      "index_writer_memory_in_bytes" : 3466346404,
      "version_map_memory" : "206.7mb",
      "version_map_memory_in_bytes" : 216829898,
      "fixed_bit_set" : "79.6mb",
      "fixed_bit_set_memory_in_bytes" : 83486560,
      "max_unsafe_auto_id_timestamp" : 1630887238442,
      "file_sizes" : { }
    }
  },
  "nodes" : {
    "count" : {
      "total" : 82,
      "coordinating_only" : 0,
      "data" : 79,
      "ingest" : 79,
      "master" : 3,
      "ml" : 82,
      "voting_only" : 0
    },
    "versions" : [
      "7.4.2"
    ],
    "os" : {
      "available_processors" : 1968,
      "allocated_processors" : 1968,
      "names" : [
        {
          "name" : "Linux",
          "count" : 82
        }
      ],
      "pretty_names" : [
        {
          "pretty_name" : "CentOS Linux 7 (Core)",
          "count" : 82
        }
      ],
      "mem" : {
        "total" : "10tb",
        "total_in_bytes" : 11076833697792,
        "free" : "477.7gb",
        "free_in_bytes" : 512974430208,
        "used" : "9.6tb",
        "used_in_bytes" : 10563859267584,
        "free_percent" : 5,
        "used_percent" : 95
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 1274
      },
      "open_file_descriptors" : {
        "min" : 2355,
        "max" : 6472,
        "avg" : 5451
      }
    },
    "jvm" : {
      "max_uptime" : "10.4d",
      "max_uptime_in_millis" : 903373624,
      "versions" : [
        {
          "version" : "13.0.1",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "13.0.1+9",
          "vm_vendor" : "AdoptOpenJDK",
          "bundled_jdk" : true,
          "using_bundled_jdk" : true,
          "count" : 82
        }
      ],
      "mem" : {
        "heap_used" : "1.8tb",
        "heap_used_in_bytes" : 2004458201224,
        "heap_max" : "2.5tb",
        "heap_max_in_bytes" : 2814747082752
      },
      "threads" : 18192
    },
    "fs" : {
      "total" : "404.9tb",
      "total_in_bytes" : 445241552904192,
      "free" : "145.4tb",
      "free_in_bytes" : 159914219859968,
      "available" : "144.6tb",
      "available_in_bytes" : 159032375832576
    },
    "plugins" : [
      {
        "name" : "repository-s3",
        "version" : "7.4.2",
        "elasticsearch_version" : "7.4.2",
        "java_version" : "1.8",
        "description" : "The S3 repository plugin adds S3 repositories",
        "classname" : "org.elasticsearch.repositories.s3.S3RepositoryPlugin",
        "extended_plugins" : [ ],
        "has_native_controller" : false
      }
    ],
    "network_types" : {
      "transport_types" : {
        "security4" : 82
      },
      "http_types" : {
        "security4" : 82
      }
    },
    "discovery_types" : {
      "zen" : 82
    },
    "packaging_types" : [
      {
        "flavor" : "default",
        "type" : "docker",
        "count" : 82
      }
    ]
  }
}

I suspect the cause is a very strict index balance configuration:

"cluster.routing.allocation.balance.index" : "2.7f"

Another possibility is node problems causing the cluster to declare them offline.

Anyways, it would be great to understand the log source for tracking the events that trigger these shard allocations.

warkolm · September 6, 2021, 11:11pm

Ok first up, 7.4 is EOL. Please upgrade Elasticsearch and your JVM.

Other than that I don't know specific loggers that can provide this level of information.
Are there any relevant entries in your master logs?

Maycon_Santos · September 15, 2021, 11:59am

Thanks, @warkolm, removing data and adding more nodes seems to have helped, which raises the question of rebalancing stability as we would expect most rebalances to occur on nodes above the low disk watermark. We also noticed that the rebalance was causing nodes to cross the high disk watermark.

Anyways, we will plan the upgrade as we might gain improvements on this part.

system · October 13, 2021, 11:59am

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.

Topic		Replies	Views
ES instance restart causing shard initialization Elasticsearch	7	2085	July 5, 2017
Shards keep re-initializing Elasticsearch	8	1492	June 24, 2020
My cluster frequently has shards on initializing_shards Elasticsearch	13	567	February 16, 2023
Initializing shards everytime! Elasticsearch	11	1690	July 5, 2017
Unusual logging Elasticsearch	1	345	July 6, 2017

Logs to investigate constant shard initialization

Related topics