Shards Failure - Kibana Dashboard

Hi Team,

We are using ELK version 7.16.3, and whenever the time frame is more than 30 days I get shard failures in my dashboard.

node_disconnected_exception at shard 0 index sd-daily-20220422-00105-000004 node ODanHuRARUSZsFk2QU7wtw

Type

node_disconnected_exception

Reason

[elk-master-0][172.16.20.10:9300][indices:data/read/search[phase/query]] disconnected

Response code
{
"took": 5073,
"timed_out": false,
"_shards": {
"total": 3,
"successful": 2,
"skipped": 0,
"failed": 1,
"failures": [
{
"shard": 0,
"index": "sd-daily-20220422-00105-000004",
"node": "ODanHuRARUSZsFk2QU7wtw",
"reason": {
"type": "node_disconnected_exception",
"reason": "[elk-master-0][172.16.20.10:9300][indices:data/read/search[phase/query]] disconnected"
}
}
]
},

What is the output from the _cluster/stats?pretty&human API?

{
"_nodes" : {
"total" : 3,
"successful" : 3,
"failed" : 0
},
"cluster_name" : "elk-gd",
"cluster_uuid" : "f0hXcMLaQo-RlnM1zJrRqQ",
"timestamp" : 1667801136032,
"status" : "green",
"indices" : {
"count" : 1673,
"shards" : {
"total" : 2487,
"primaries" : 2331,
"replication" : 0.06692406692406692,
"index" : {
"shards" : {
"min" : 1,
"max" : 6,
"avg" : 1.4865511057979677
},
"primaries" : {
"min" : 1,
"max" : 3,
"avg" : 1.393305439330544
},
"replication" : {
"min" : 0.0,
"max" : 1.0,
"avg" : 0.08846383741781232
}
}
},
"docs" : {
"count" : 1774970083,
"deleted" : 4479637
},
"store" : {
"size" : "572.4gb",
"size_in_bytes" : 614621542072,
"total_data_set_size" : "572.4gb",
"total_data_set_size_in_bytes" : 614621542072,
"reserved" : "0b",
"reserved_in_bytes" : 0
},
"fielddata" : {
"memory_size" : "1mb",
"memory_size_in_bytes" : 1132944,
"evictions" : 0
},
"query_cache" : {
"memory_size" : "12.8mb",
"memory_size_in_bytes" : 13439539,
"total_count" : 6754554,
"hit_count" : 210691,
"miss_count" : 6543863,
"cache_size" : 3237,
"cache_count" : 10458,
"evictions" : 7221
},
"completion" : {
"size" : "0b",
"size_in_bytes" : 0
},
"segments" : {
"count" : 17862,
"memory" : "239.1mb",
"memory_in_bytes" : 250772330,
"terms_memory" : "167.5mb",
"terms_memory_in_bytes" : 175647856,
"stored_fields_memory" : "10mb",
"stored_fields_memory_in_bytes" : 10514464,
"term_vectors_memory" : "0b",
"term_vectors_memory_in_bytes" : 0,
"norms_memory" : "11.2mb",
"norms_memory_in_bytes" : 11809344,
"points_memory" : "0b",
"points_memory_in_bytes" : 0,
"doc_values_memory" : "50.3mb",
"doc_values_memory_in_bytes" : 52800666,
"index_writer_memory" : "1.1gb",
"index_writer_memory_in_bytes" : 1198878944,
"version_map_memory" : "21.9mb",
"version_map_memory_in_bytes" : 23008231,
"fixed_bit_set" : "13.7mb",
"fixed_bit_set_memory_in_bytes" : 14374768,
"max_unsafe_auto_id_timestamp" : 1667800451011,
"file_sizes" : { }
},
"mappings" : {
"field_types" : [
{
"name" : "alias",
"count" : 3092,
"index_count" : 1355,
"script_count" : 0
},
{
"name" : "binary",
"count" : 5,
"index_count" : 5,
"script_count" : 0
},
{
"name" : "boolean",
"count" : 41385,
"index_count" : 1522,
"script_count" : 0
},
{
"name" : "byte",
"count" : 1316,
"index_count" : 1316,
"script_count" : 0
},
{
"name" : "constant_keyword",
"count" : 2663,
"index_count" : 888,
"script_count" : 0
},
{
"name" : "date",
"count" : 66455,
"index_count" : 1643,
"script_count" : 0
},
{
"name" : "date_nanos",
"count" : 2,
"index_count" : 2,
"script_count" : 0
},
{
"name" : "date_range",
"count" : 5,
"index_count" : 5,
"script_count" : 0
},
{
"name" : "double",
"count" : 5260,
"index_count" : 66,
"script_count" : 0
},
{
"name" : "double_range",
"count" : 2,
"index_count" : 2,
"script_count" : 0
},
{
"name" : "flattened",
"count" : 10894,
"index_count" : 922,
"script_count" : 0
},
{
"name" : "float",
"count" : 19877,
"index_count" : 1396,
"script_count" : 0
},
{
"name" : "float_range",
"count" : 2,
"index_count" : 2,
"script_count" : 0
},
{
"name" : "geo_point",
"count" : 11296,
"index_count" : 1357,
"script_count" : 0
},
{
"name" : "geo_shape",
"count" : 2,
"index_count" : 2,
"script_count" : 0
},
{
"name" : "half_float",
"count" : 2,
"index_count" : 2,
"script_count" : 0
},
{
"name" : "histogram",
"count" : 1279,
"index_count" : 1279,
"script_count" : 0
},
{
"name" : "integer",
"count" : 295,
"index_count" : 99,
"script_count" : 0
},
{
"name" : "integer_range",
"count" : 2,
"index_count" : 2,
"script_count" : 0
},
{
"name" : "ip",
"count" : 24352,
"index_count" : 1358,
"script_count" : 0
},
{
"name" : "ip_range",
"count" : 5,
"index_count" : 5,
"script_count" : 0
},
{
"name" : "keyword",
"count" : 1335162,
"index_count" : 1643,
"script_count" : 0
},
{
"name" : "long",
"count" : 296206,
"index_count" : 1573,
"script_count" : 0
},
{
"name" : "long_range",
"count" : 2,
"index_count" : 2,
"script_count" : 0
},
{
"name" : "match_only_text",
"count" : 51212,
"index_count" : 884,
"script_count" : 0
},
{
"name" : "nested",
"count" : 11652,
"index_count" : 926,
"script_count" : 0
},
{
"name" : "object",
"count" : 483247,
"index_count" : 1516,
"script_count" : 0
},
{
"name" : "scaled_float",
"count" : 14302,
"index_count" : 1327,
"script_count" : 0
},
{
"name" : "shape",
"count" : 2,
"index_count" : 2,
"script_count" : 0
},
{
"name" : "short",
"count" : 3139,
"index_count" : 33,
"script_count" : 0
},
{
"name" : "text",
"count" : 37668,
"index_count" : 1643,
"script_count" : 0
},
{
"name" : "version",
"count" : 4,
"index_count" : 4,
"script_count" : 0
},
{
"name" : "wildcard",
"count" : 13263,
"index_count" : 887,
"script_count" : 0
}
],
"runtime_field_types" : [
{
"name" : "keyword",
"count" : 3,
"index_count" : 3,
"scriptless_count" : 0,
"shadowed_count" : 0,
"lang" : [
"painless"
],
"lines_max" : 1,
"lines_total" : 3,
"chars_max" : 31,
"chars_total" : 93,
"source_max" : 0,
"source_total" : 0,
"doc_max" : 1,
"doc_total" : 3
}
]
},
"analysis" : {
"char_filter_types" : [ ],
"tokenizer_types" : [ ],
"filter_types" : [ ],
"analyzer_types" : [ ],
"built_in_char_filters" : [ ],
"built_in_tokenizers" : [ ],
"built_in_filters" : [ ],
"built_in_analyzers" : [
{
"name" : "simple",
"count" : 6,
"index_count" : 3
}
]
},
"versions" : [
{
"version" : "7.10.0",
"index_count" : 11,
"primary_shard_count" : 11,
"total_primary_size" : "406.3mb",
"total_primary_bytes" : 426052031
},
{
"version" : "7.11.1",
"index_count" : 7,
"primary_shard_count" : 9,
"total_primary_size" : "24.1gb",
"total_primary_bytes" : 25974561508
},
{
"version" : "7.13.0",
"index_count" : 141,
"primary_shard_count" : 171,
"total_primary_size" : "57gb",
"total_primary_bytes" : 61303241295
},
{
"version" : "7.16.2",
"index_count" : 1514,
"primary_shard_count" : 2140,
"total_primary_size" : "487.7gb",
"total_primary_bytes" : 523718585823
}
]
},
"nodes" : {
"count" : {
"total" : 3,
"coordinating_only" : 0,
"data" : 3,
"data_cold" : 3,
"data_content" : 3,
"data_frozen" : 3,
"data_hot" : 3,
"data_warm" : 3,
"ingest" : 3,
"master" : 3,
"ml" : 3,
"remote_cluster_client" : 3,
"transform" : 3,
"voting_only" : 0
},
"versions" : [
"7.16.2"
],
"os" : {
"available_processors" : 3,
"allocated_processors" : 3,
"names" : [
{
"name" : "Linux",
"count" : 3
}
],
"pretty_names" : [
{
"pretty_name" : "Ubuntu 20.04.3 LTS",
"count" : 3
}
],
"architectures" : [
{
"arch" : "amd64",
"count" : 3
}
],
"mem" : {
"total" : "24gb",
"total_in_bytes" : 25769803776,
"free" : "617.3mb",
"free_in_bytes" : 647376896,
"used" : "23.3gb",
"used_in_bytes" : 25122426880,
"free_percent" : 3,
"used_percent" : 97
}
},
"process" : {
"cpu" : {
"percent" : 42
},
"open_file_descriptors" : {
"min" : 7882,
"max" : 8534,
"avg" : 8142
}
},
"jvm" : {
"max_uptime" : "2.4d",
"max_uptime_in_millis" : 210987876,
"versions" : [
{
"version" : "17.0.1",
"vm_name" : "OpenJDK 64-Bit Server VM",
"vm_version" : "17.0.1+12",
"vm_vendor" : "Eclipse Adoptium",
"bundled_jdk" : true,
"using_bundled_jdk" : true,
"count" : 3
}
],
"mem" : {
"heap_used" : "11.4gb",
"heap_used_in_bytes" : 12243000832,
"heap_max" : "18gb",
"heap_max_in_bytes" : 19327352832
},
"threads" : 505
},
"fs" : {
"total" : "882.6gb",
"total_in_bytes" : 947778945024,
"free" : "305.7gb",
"free_in_bytes" : 328301461504,
"available" : "305.7gb",
"available_in_bytes" : 328251129856
},
"plugins" : [ ],
"network_types" : {
"transport_types" : {
"security4" : 3
},
"http_types" : {
"security4" : 3
}
},
"discovery_types" : {
"zen" : 3
},
"packaging_types" : [
{
"flavor" : "default",
"type" : "docker",
"count" : 3
}
],
"ingest" : {
"number_of_pipelines" : 45,
"processor_stats" : {
"append" : {
"count" : 99245,
"failed" : 0,
"current" : 0,
"time" : "53ms",
"time_in_millis" : 53
},
"conditional" : {
"count" : 892430,
"failed" : 0,
"current" : 0,
"time" : "7.6s",
"time_in_millis" : 7617
},
"convert" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"csv" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"date" : {
"count" : 194635,
"failed" : 0,
"current" : 0,
"time" : "3.9s",
"time_in_millis" : 3953
},
"geoip" : {
"count" : 463665,
"failed" : 0,
"current" : 0,
"time" : "13.5s",
"time_in_millis" : 13519
},
"grok" : {
"count" : 488970,
"failed" : 455,
"current" : 0,
"time" : "6.8s",
"time_in_millis" : 6833
},
"gsub" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"pipeline" : {
"count" : 564940,
"failed" : 0,
"current" : 0,
"time" : "7s",
"time_in_millis" : 7011
},
"remove" : {
"count" : 665149,
"failed" : 0,
"current" : 0,
"time" : "891ms",
"time_in_millis" : 891
},
"rename" : {
"count" : 649730,
"failed" : 0,
"current" : 0,
"time" : "520ms",
"time_in_millis" : 520
},
"script" : {
"count" : 99245,
"failed" : 0,
"current" : 0,
"time" : "2.4s",
"time_in_millis" : 2435
},
"set" : {
"count" : 379647,
"failed" : 0,
"current" : 0,
"time" : "2.2s",
"time_in_millis" : 2250
},
"set_security_user" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"uppercase" : {
"count" : 62,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"uri_parts" : {
"count" : 99245,
"failed" : 5688,
"current" : 0,
"time" : "316ms",
"time_in_millis" : 316
},
"urldecode" : {
"count" : 96383,
"failed" : 0,
"current" : 0,
"time" : "194ms",
"time_in_millis" : 194
},
"user_agent" : {
"count" : 299588,
"failed" : 61970,
"current" : 0,
"time" : "2.8s",
"time_in_millis" : 2802
}
}
}
}
}

Please format your code/logs/config using the </> button, or Markdown-style backticks. It makes things easier to read, which helps us help you :slight_smile:

This is probably why: you have far too many shards for your nodes and the amount of data in your cluster.

You should look at how you can reduce that count.

Hi Warkolm, do you mean reduce the shard count? I am getting this error only for this dashboard. The error messages are like "Request error: node_not_connected_exception, Node not connected" and "Request error: i_o_exception, Invalid string; unexpected character: 129 hex: 81".

The issue is likely related to having too many shards for your cluster to manage efficiently; reducing the count reduces the load required to manage them.