Thank you
The output is below, but the problem was resolved by excluding the nodes from the cluster and including them back a few hours later, so the number of shards is now the same on all nodes.
The events that preceded it may explain why it happened.
I have 4 warm nodes in the cluster and they got overloaded.
This is also an interesting situation: if an index has a policy that moves it to a warm node when it is, for example, 2 days old, and then performs a DELETE after another 4 days, the delete is not performed if the move to warm is not performed. So indices that would normally be deleted after 6 days remain on the hot nodes and create this asymmetric shard distribution.
{
"_nodes" : {
"total" : 49,
"successful" : 49,
"failed" : 0
},
"cluster_name" : "o2-cz-cem",
"cluster_uuid" : "OAIGGQ4QTqiz4i_tgdFx3g",
"timestamp" : 1668402292568,
"status" : "green",
"indices" : {
"count" : 2589,
"shards" : {
"total" : 6910,
"primaries" : 4043,
"replication" : 0.7091268859757606,
"index" : {
"shards" : {
"min" : 1,
"max" : 10,
"avg" : 2.668984163769795
},
"primaries" : {
"min" : 1,
"max" : 10,
"avg" : 1.5616067979915025
},
"replication" : {
"min" : 0.0,
"max" : 1.0,
"avg" : 0.9061413673232909
}
}
},
"docs" : {
"count" : 97849433267,
"deleted" : 23754684
},
"store" : {
"size" : "36.8tb",
"size_in_bytes" : 40474758411128,
"total_data_set_size" : "36.8tb",
"total_data_set_size_in_bytes" : 40474758411128,
"reserved" : "0b",
"reserved_in_bytes" : 0
},
"fielddata" : {
"memory_size" : "27gb",
"memory_size_in_bytes" : 29015723980,
"evictions" : 0
},
"query_cache" : {
"memory_size" : "33.1gb",
"memory_size_in_bytes" : 35577828271,
"total_count" : 2499707452,
"hit_count" : 33507667,
"miss_count" : 2466199785,
"cache_size" : 90414,
"cache_count" : 859526,
"evictions" : 769112
},
"completion" : {
"size" : "0b",
"size_in_bytes" : 0
},
"segments" : {
"count" : 81227,
"memory" : "1.5gb",
"memory_in_bytes" : 1683680760,
"terms_memory" : "1.2gb",
"terms_memory_in_bytes" : 1329788888,
"stored_fields_memory" : "124.1mb",
"stored_fields_memory_in_bytes" : 130174152,
"term_vectors_memory" : "0b",
"term_vectors_memory_in_bytes" : 0,
"norms_memory" : "33.1mb",
"norms_memory_in_bytes" : 34746816,
"points_memory" : "0b",
"points_memory_in_bytes" : 0,
"doc_values_memory" : "180.2mb",
"doc_values_memory_in_bytes" : 188970904,
"index_writer_memory" : "3gb",
"index_writer_memory_in_bytes" : 3267327274,
"version_map_memory" : "21.1mb",
"version_map_memory_in_bytes" : 22125762,
"fixed_bit_set" : "2.3gb",
"fixed_bit_set_memory_in_bytes" : 2557768664,
"max_unsafe_auto_id_timestamp" : 1668402120162,
"file_sizes" : { }
},
"mappings" : {
"field_types" : [
{
"name" : "alias",
"count" : 53,
"index_count" : 14,
"script_count" : 0
},
{
"name" : "binary",
"count" : 8,
"index_count" : 8,
"script_count" : 0
},
{
"name" : "boolean",
"count" : 1635,
"index_count" : 234,
"script_count" : 0
},
{
"name" : "byte",
"count" : 15,
"index_count" : 15,
"script_count" : 0
},
{
"name" : "constant_keyword",
"count" : 46,
"index_count" : 16,
"script_count" : 0
},
{
"name" : "date",
"count" : 7092,
"index_count" : 2303,
"script_count" : 0
},
{
"name" : "date_nanos",
"count" : 2,
"index_count" : 2,
"script_count" : 0
},
{
"name" : "date_range",
"count" : 2,
"index_count" : 2,
"script_count" : 0
},
{
"name" : "double",
"count" : 3703,
"index_count" : 17,
"script_count" : 0
},
{
"name" : "double_range",
"count" : 2,
"index_count" : 2,
"script_count" : 0
},
{
"name" : "flattened",
"count" : 144,
"index_count" : 12,
"script_count" : 0
},
{
"name" : "float",
"count" : 3531,
"index_count" : 277,
"script_count" : 0
},
{
"name" : "float_range",
"count" : 2,
"index_count" : 2,
"script_count" : 0
},
{
"name" : "geo_point",
"count" : 248,
"index_count" : 124,
"script_count" : 0
},
{
"name" : "geo_shape",
"count" : 5,
"index_count" : 5,
"script_count" : 0
},
{
"name" : "half_float",
"count" : 72,
"index_count" : 16,
"script_count" : 0
},
{
"name" : "integer",
"count" : 1157,
"index_count" : 295,
"script_count" : 0
},
{
"name" : "integer_range",
"count" : 2,
"index_count" : 2,
"script_count" : 0
},
{
"name" : "ip",
"count" : 284,
"index_count" : 19,
"script_count" : 0
},
{
"name" : "ip_range",
"count" : 2,
"index_count" : 2,
"script_count" : 0
},
{
"name" : "keyword",
"count" : 173856,
"index_count" : 2217,
"script_count" : 0
},
{
"name" : "long",
"count" : 43016,
"index_count" : 1503,
"script_count" : 0
},
{
"name" : "long_range",
"count" : 2,
"index_count" : 2,
"script_count" : 0
},
{
"name" : "match_only_text",
"count" : 780,
"index_count" : 12,
"script_count" : 0
},
{
"name" : "nested",
"count" : 311,
"index_count" : 59,
"script_count" : 0
},
{
"name" : "object",
"count" : 78289,
"index_count" : 639,
"script_count" : 0
},
{
"name" : "scaled_float",
"count" : 1740,
"index_count" : 12,
"script_count" : 0
},
{
"name" : "shape",
"count" : 2,
"index_count" : 2,
"script_count" : 0
},
{
"name" : "short",
"count" : 2,
"index_count" : 2,
"script_count" : 0
},
{
"name" : "text",
"count" : 57229,
"index_count" : 1006,
"script_count" : 0
},
{
"name" : "unsigned_long",
"count" : 80,
"index_count" : 40,
"script_count" : 0
},
{
"name" : "version",
"count" : 4,
"index_count" : 4,
"script_count" : 0
},
{
"name" : "wildcard",
"count" : 204,
"index_count" : 12,
"script_count" : 0
}
],
"runtime_field_types" : [
{
"name" : "keyword",
"count" : 25,
"index_count" : 13,
"scriptless_count" : 25,
"shadowed_count" : 25,
"lang" : [ ],
"lines_max" : 0,
"lines_total" : 0,
"chars_max" : 0,
"chars_total" : 0,
"source_max" : 0,
"source_total" : 0,
"doc_max" : 0,
"doc_total" : 0
}
]
},
"analysis" : {
"char_filter_types" : [ ],
"tokenizer_types" : [ ],
"filter_types" : [ ],
"analyzer_types" : [ ],
"built_in_char_filters" : [ ],
"built_in_tokenizers" : [ ],
"built_in_filters" : [ ],
"built_in_analyzers" : [ ]
},
"versions" : [
{
"version" : "6.4.2",
"index_count" : 14,
"primary_shard_count" : 14,
"total_primary_size" : "177mb",
"total_primary_bytes" : 185689169
},
{
"version" : "6.8.5",
"index_count" : 2,
"primary_shard_count" : 2,
"total_primary_size" : "44.3mb",
"total_primary_bytes" : 46548931
},
{
"version" : "7.4.2",
"index_count" : 54,
"primary_shard_count" : 54,
"total_primary_size" : "2.5gb",
"total_primary_bytes" : 2697589090
},
{
"version" : "7.9.0",
"index_count" : 392,
"primary_shard_count" : 401,
"total_primary_size" : "96.3gb",
"total_primary_bytes" : 103458363825
},
{
"version" : "7.13.3",
"index_count" : 393,
"primary_shard_count" : 413,
"total_primary_size" : "114.9gb",
"total_primary_bytes" : 123433679941
},
{
"version" : "7.17.0",
"index_count" : 1734,
"primary_shard_count" : 3159,
"total_primary_size" : "28.8tb",
"total_primary_bytes" : 31747985489456
}
]
},
"nodes" : {
"count" : {
"total" : 49,
"coordinating_only" : 2,
"data" : 0,
"data_cold" : 0,
"data_content" : 38,
"data_frozen" : 0,
"data_hot" : 38,
"data_warm" : 4,
"ingest" : 42,
"master" : 3,
"ml" : 44,
"remote_cluster_client" : 2,
"transform" : 42,
"voting_only" : 0
},
"versions" : [
"7.17.0"
],
"os" : {
"available_processors" : 356,
"allocated_processors" : 356,
"names" : [
{
"name" : "Linux",
"count" : 49
}
],
"pretty_names" : [
{
"pretty_name" : "CentOS Linux 7 (Core)",
"count" : 48
},
{
"pretty_name" : "Ubuntu 20.04.3 LTS",
"count" : 1
}
],
"architectures" : [
{
"arch" : "amd64",
"count" : 49
}
],
"mem" : {
"total" : "1.4tb",
"total_in_bytes" : 1619387199488,
"free" : "87.5gb",
"free_in_bytes" : 94023417856,
"used" : "1.3tb",
"used_in_bytes" : 1525363781632,
"free_percent" : 6,
"used_percent" : 94
}
},
"process" : {
"cpu" : {
"percent" : 109
},
"open_file_descriptors" : {
"min" : 1331,
"max" : 3335,
"avg" : 2794
}
},
"jvm" : {
"max_uptime" : "283.5d",
"max_uptime_in_millis" : 24495166521,
"versions" : [
{
"version" : "17.0.1",
"vm_name" : "OpenJDK 64-Bit Server VM",
"vm_version" : "17.0.1+12",
"vm_vendor" : "Eclipse Adoptium",
"bundled_jdk" : true,
"using_bundled_jdk" : true,
"count" : 49
}
],
"mem" : {
"heap_used" : "305gb",
"heap_used_in_bytes" : 327509691232,
"heap_max" : "672gb",
"heap_max_in_bytes" : 721554505728
},
"threads" : 7237
},
"fs" : {
"total" : "82tb",
"total_in_bytes" : 90232745926656,
"free" : "44.7tb",
"free_in_bytes" : 49177679351808,
"available" : "43.9tb",
"available_in_bytes" : 48332613103616
},
"plugins" : [ ],
"network_types" : {
"transport_types" : {
"security4" : 49
},
"http_types" : {
"security4" : 49
}
},
"discovery_types" : {
"zen" : 49
},
"packaging_types" : [
{
"flavor" : "default",
"type" : "rpm",
"count" : 48
},
{
"flavor" : "default",
"type" : "docker",
"count" : 1
}
],
"ingest" : {
"number_of_pipelines" : 23,
"processor_stats" : {
"conditional" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"convert" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"geoip" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"grok" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"gsub" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"remove" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"rename" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"script" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"set" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"set_security_user" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
}
}
}
}
}
You can now see that the shard counts are perfectly the same on each node:
shards disk.indices disk.used disk.avail disk.total disk.percent node
176 1.1tb 1.2tb 731.1gb 1.9tb 62 tela01prahkz
176 1.2tb 1.3tb 613.6gb 1.9tb 68 tela02prahkz
176 583.5gb 675gb 1.2tb 1.9tb 34 tela03prahkz
176 1tb 1.1tb 811.9gb 1.9tb 58 tela04prahkz
176 884.7gb 982.7gb 984.7gb 1.9tb 49 tela05prahkz
176 398.6gb 490.8gb 1.4tb 1.9tb 24 tela06prahkz
176 311.3gb 402.1gb 1.5tb 1.9tb 20 tela07prahkz
176 630gb 720.7gb 1.2tb 1.9tb 36 tela08prahkz
176 799.3gb 891.4gb 1tb 1.9tb 45 tela09prahkz
176 841.3gb 846.6gb 1.1tb 1.9tb 42 tela10prahkz
176 990.7gb 996.5gb 985.7gb 1.9tb 50 tela11prahkz
176 929.7gb 935.1gb 1tb 1.9tb 47 tela12prahkz
176 561.4gb 565.7gb 1.3tb 1.9tb 28 tela13prahkz
176 429.7gb 435.2gb 1.5tb 1.9tb 21 tela14prahkz
176 872gb 877gb 1tb 1.9tb 44 tela15prahkz
176 1tb 1tb 917.2gb 1.9tb 53 tela16prahkz
176 1.7tb 1.7tb 219.5gb 1.9tb 88 tela17prahkz
176 569.3gb 573.3gb 1.3tb 1.9tb 28 tela18prahkz
176 1.1tb 1.1tb 759.2gb 1.9tb 61 tela19prahkz
176 1tb 1tb 943.3gb 1.9tb 52 tela20prahkz
176 878.5gb 883.3gb 1tb 1.9tb 44 tela21prahkz
176 815gb 820.6gb 1.1tb 1.9tb 41 tela22prahkz
176 564.4gb 569.8gb 1.3tb 1.9tb 28 tela23prahkz
176 536.8gb 542.3gb 1.4tb 1.9tb 27 tela24prahkz
176 1tb 1tb 927.2gb 1.9tb 53 tela25prahkz
176 836.4gb 841.2gb 1.1tb 1.9tb 42 tela26prahkz
176 1012.3gb 1017.4gb 964.8gb 1.9tb 51 tela27prahkz
176 738.4gb 743.3gb 1.2tb 1.9tb 37 tela28prahkz
176 990.2gb 995.8gb 986.4gb 1.9tb 50 tela29prahkz
175 423.4gb 427.2gb 1.5tb 1.9tb 21 tela30prahkz
176 647.8gb 652.9gb 1.3tb 1.9tb 32 tela31prahkz
176 514.7gb 519.1gb 1.4tb 1.9tb 25 tela32prahkz
176 608.5gb 615.5gb 1.3tb 1.9tb 30 tela33prahkz
176 467.7gb 472.2gb 1.5tb 1.9tb 23 tela34prahkz
176 707.5gb 713gb 1.2tb 1.9tb 35 tela35prahkz
176 905.4gb 910.4gb 1tb 1.9tb 44 tela36prahkz
176 548.1gb 552.6gb 1.4tb 1.9tb 27 tela37prahkz
176 1.3tb 1.3tb 641.8gb 1.9tb 68 tela38prahkz
56 1.6tb 1.6tb 285.6gb 1.9tb 85 tela51prahkz
55 1.6tb 1.6tb 290gb 1.9tb 85 tela52prahkz
56 1.6tb 1.6tb 285.6gb 1.9tb 85 tela53prahkz
56 1.6tb 1.6tb 283.5gb 1.9tb 85 tela54prahkz