Shards Failure - Kibana Dashboard

Hi Team,

We are using ELK version 7.16.3, and whenever the dashboard time range is more than 30 days, I get shard failures in my dashboard.

node_disconnected_exception at shard 0 index sd-daily-20220422-00105-000004 node ODanHuRARUSZsFk2QU7wtw

Type

node_disconnected_exception

Reason

[elk-master-0][172.16.20.10:9300][indices:data/read/search[phase/query]] disconnected

Response code
{
"took": 5073,
"timed_out": false,
"_shards": {
"total": 3,
"successful": 2,
"skipped": 0,
"failed": 1,
"failures": [
{
"shard": 0,
"index": "sd-daily-20220422-00105-000004",
"node": "ODanHuRARUSZsFk2QU7wtw",
"reason": {
"type": "node_disconnected_exception",
"reason": "[elk-master-0][172.16.20.10:9300][indices:data/read/search[phase/query]] disconnected"
}
}
]
},

What is the output from the _cluster/stats?pretty&human API?

{
"_nodes" : {
"total" : 3,
"successful" : 3,
"failed" : 0
},
"cluster_name" : "elk-gd",
"cluster_uuid" : "f0hXcMLaQo-RlnM1zJrRqQ",
"timestamp" : 1667801136032,
"status" : "green",
"indices" : {
"count" : 1673,
"shards" : {
"total" : 2487,
"primaries" : 2331,
"replication" : 0.06692406692406692,
"index" : {
"shards" : {
"min" : 1,
"max" : 6,
"avg" : 1.4865511057979677
},
"primaries" : {
"min" : 1,
"max" : 3,
"avg" : 1.393305439330544
},
"replication" : {
"min" : 0.0,
"max" : 1.0,
"avg" : 0.08846383741781232
}
}
},
"docs" : {
"count" : 1774970083,
"deleted" : 4479637
},
"store" : {
"size" : "572.4gb",
"size_in_bytes" : 614621542072,
"total_data_set_size" : "572.4gb",
"total_data_set_size_in_bytes" : 614621542072,
"reserved" : "0b",
"reserved_in_bytes" : 0
},
"fielddata" : {
"memory_size" : "1mb",
"memory_size_in_bytes" : 1132944,
"evictions" : 0
},
"query_cache" : {
"memory_size" : "12.8mb",
"memory_size_in_bytes" : 13439539,
"total_count" : 6754554,
"hit_count" : 210691,
"miss_count" : 6543863,
"cache_size" : 3237,
"cache_count" : 10458,
"evictions" : 7221
},
"completion" : {
"size" : "0b",
"size_in_bytes" : 0
},
"segments" : {
"count" : 17862,
"memory" : "239.1mb",
"memory_in_bytes" : 250772330,
"terms_memory" : "167.5mb",
"terms_memory_in_bytes" : 175647856,
"stored_fields_memory" : "10mb",
"stored_fields_memory_in_bytes" : 10514464,
"term_vectors_memory" : "0b",
"term_vectors_memory_in_bytes" : 0,
"norms_memory" : "11.2mb",
"norms_memory_in_bytes" : 11809344,
"points_memory" : "0b",
"points_memory_in_bytes" : 0,
"doc_values_memory" : "50.3mb",
"doc_values_memory_in_bytes" : 52800666,
"index_writer_memory" : "1.1gb",
"index_writer_memory_in_bytes" : 1198878944,
"version_map_memory" : "21.9mb",
"version_map_memory_in_bytes" : 23008231,
"fixed_bit_set" : "13.7mb",
"fixed_bit_set_memory_in_bytes" : 14374768,
"max_unsafe_auto_id_timestamp" : 1667800451011,
"file_sizes" : { }
},
"mappings" : {
"field_types" : [
{
"name" : "alias",
"count" : 3092,
"index_count" : 1355,
"script_count" : 0
},
{
"name" : "binary",
"count" : 5,
"index_count" : 5,
"script_count" : 0
},
{
"name" : "boolean",
"count" : 41385,
"index_count" : 1522,
"script_count" : 0
},
{
"name" : "byte",
"count" : 1316,
"index_count" : 1316,
"script_count" : 0
},
{
"name" : "constant_keyword",
"count" : 2663,
"index_count" : 888,
"script_count" : 0
},
{
"name" : "date",
"count" : 66455,
"index_count" : 1643,
"script_count" : 0
},
{
"name" : "date_nanos",
"count" : 2,
"index_count" : 2,
"script_count" : 0
},
{
"name" : "date_range",
"count" : 5,
"index_count" : 5,
"script_count" : 0
},
{
"name" : "double",
"count" : 5260,
"index_count" : 66,
"script_count" : 0
},
{
"name" : "double_range",
"count" : 2,
"index_count" : 2,
"script_count" : 0
},
{
"name" : "flattened",
"count" : 10894,
"index_count" : 922,
"script_count" : 0
},
{
"name" : "float",
"count" : 19877,
"index_count" : 1396,
"script_count" : 0
},
{
"name" : "float_range",
"count" : 2,
"index_count" : 2,
"script_count" : 0
},
{
"name" : "geo_point",
"count" : 11296,
"index_count" : 1357,
"script_count" : 0
},
{
"name" : "geo_shape",
"count" : 2,
"index_count" : 2,
"script_count" : 0
},
{
"name" : "half_float",
"count" : 2,
"index_count" : 2,
"script_count" : 0
},
{
"name" : "histogram",
"count" : 1279,
"index_count" : 1279,
"script_count" : 0
},
{
"name" : "integer",
"count" : 295,
"index_count" : 99,
"script_count" : 0
},
{
"name" : "integer_range",
"count" : 2,
"index_count" : 2,
"script_count" : 0
},
{
"name" : "ip",
"count" : 24352,
"index_count" : 1358,
"script_count" : 0
},
{
"name" : "ip_range",
"count" : 5,
"index_count" : 5,
"script_count" : 0
},
{
"name" : "keyword",
"count" : 1335162,
"index_count" : 1643,
"script_count" : 0
},
{
"name" : "long",
"count" : 296206,
"index_count" : 1573,
"script_count" : 0
},
{
"name" : "long_range",
"count" : 2,
"index_count" : 2,
"script_count" : 0
},
{
"name" : "match_only_text",
"count" : 51212,
"index_count" : 884,
"script_count" : 0
},
{
"name" : "nested",
"count" : 11652,
"index_count" : 926,
"script_count" : 0
},
{
"name" : "object",
"count" : 483247,
"index_count" : 1516,
"script_count" : 0
},
{
"name" : "scaled_float",
"count" : 14302,
"index_count" : 1327,
"script_count" : 0
},
{
"name" : "shape",
"count" : 2,
"index_count" : 2,
"script_count" : 0
},
{
"name" : "short",
"count" : 3139,
"index_count" : 33,
"script_count" : 0
},
{
"name" : "text",
"count" : 37668,
"index_count" : 1643,
"script_count" : 0
},
{
"name" : "version",
"count" : 4,
"index_count" : 4,
"script_count" : 0
},
{
"name" : "wildcard",
"count" : 13263,
"index_count" : 887,
"script_count" : 0
}
],
"runtime_field_types" : [
{
"name" : "keyword",
"count" : 3,
"index_count" : 3,
"scriptless_count" : 0,
"shadowed_count" : 0,
"lang" : [
"painless"
],
"lines_max" : 1,
"lines_total" : 3,
"chars_max" : 31,
"chars_total" : 93,
"source_max" : 0,
"source_total" : 0,
"doc_max" : 1,
"doc_total" : 3
}
]
},
"analysis" : {
"char_filter_types" : [ ],
"tokenizer_types" : [ ],
"filter_types" : [ ],
"analyzer_types" : [ ],
"built_in_char_filters" : [ ],
"built_in_tokenizers" : [ ],
"built_in_filters" : [ ],
"built_in_analyzers" : [
{
"name" : "simple",
"count" : 6,
"index_count" : 3
}
]
},
"versions" : [
{
"version" : "7.10.0",
"index_count" : 11,
"primary_shard_count" : 11,
"total_primary_size" : "406.3mb",
"total_primary_bytes" : 426052031
},
{
"version" : "7.11.1",
"index_count" : 7,
"primary_shard_count" : 9,
"total_primary_size" : "24.1gb",
"total_primary_bytes" : 25974561508
},
{
"version" : "7.13.0",
"index_count" : 141,
"primary_shard_count" : 171,
"total_primary_size" : "57gb",
"total_primary_bytes" : 61303241295
},
{
"version" : "7.16.2",
"index_count" : 1514,
"primary_shard_count" : 2140,
"total_primary_size" : "487.7gb",
"total_primary_bytes" : 523718585823
}
]
},
"nodes" : {
"count" : {
"total" : 3,
"coordinating_only" : 0,
"data" : 3,
"data_cold" : 3,
"data_content" : 3,
"data_frozen" : 3,
"data_hot" : 3,
"data_warm" : 3,
"ingest" : 3,
"master" : 3,
"ml" : 3,
"remote_cluster_client" : 3,
"transform" : 3,
"voting_only" : 0
},
"versions" : [
"7.16.2"
],
"os" : {
"available_processors" : 3,
"allocated_processors" : 3,
"names" : [
{
"name" : "Linux",
"count" : 3
}
],
"pretty_names" : [
{
"pretty_name" : "Ubuntu 20.04.3 LTS",
"count" : 3
}
],
"architectures" : [
{
"arch" : "amd64",
"count" : 3
}
],
"mem" : {
"total" : "24gb",
"total_in_bytes" : 25769803776,
"free" : "617.3mb",
"free_in_bytes" : 647376896,
"used" : "23.3gb",
"used_in_bytes" : 25122426880,
"free_percent" : 3,
"used_percent" : 97
}
},
"process" : {
"cpu" : {
"percent" : 42
},
"open_file_descriptors" : {
"min" : 7882,
"max" : 8534,
"avg" : 8142
}
},
"jvm" : {
"max_uptime" : "2.4d",
"max_uptime_in_millis" : 210987876,
"versions" : [
{
"version" : "17.0.1",
"vm_name" : "OpenJDK 64-Bit Server VM",
"vm_version" : "17.0.1+12",
"vm_vendor" : "Eclipse Adoptium",
"bundled_jdk" : true,
"using_bundled_jdk" : true,
"count" : 3
}
],
"mem" : {
"heap_used" : "11.4gb",
"heap_used_in_bytes" : 12243000832,
"heap_max" : "18gb",
"heap_max_in_bytes" : 19327352832
},
"threads" : 505
},
"fs" : {
"total" : "882.6gb",
"total_in_bytes" : 947778945024,
"free" : "305.7gb",
"free_in_bytes" : 328301461504,
"available" : "305.7gb",
"available_in_bytes" : 328251129856
},
"plugins" : [ ],
"network_types" : {
"transport_types" : {
"security4" : 3
},
"http_types" : {
"security4" : 3
}
},
"discovery_types" : {
"zen" : 3
},
"packaging_types" : [
{
"flavor" : "default",
"type" : "docker",
"count" : 3
}
],
"ingest" : {
"number_of_pipelines" : 45,
"processor_stats" : {
"append" : {
"count" : 99245,
"failed" : 0,
"current" : 0,
"time" : "53ms",
"time_in_millis" : 53
},
"conditional" : {
"count" : 892430,
"failed" : 0,
"current" : 0,
"time" : "7.6s",
"time_in_millis" : 7617
},
"convert" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"csv" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"date" : {
"count" : 194635,
"failed" : 0,
"current" : 0,
"time" : "3.9s",
"time_in_millis" : 3953
},
"geoip" : {
"count" : 463665,
"failed" : 0,
"current" : 0,
"time" : "13.5s",
"time_in_millis" : 13519
},
"grok" : {
"count" : 488970,
"failed" : 455,
"current" : 0,
"time" : "6.8s",
"time_in_millis" : 6833
},
"gsub" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"pipeline" : {
"count" : 564940,
"failed" : 0,
"current" : 0,
"time" : "7s",
"time_in_millis" : 7011
},
"remove" : {
"count" : 665149,
"failed" : 0,
"current" : 0,
"time" : "891ms",
"time_in_millis" : 891
},
"rename" : {
"count" : 649730,
"failed" : 0,
"current" : 0,
"time" : "520ms",
"time_in_millis" : 520
},
"script" : {
"count" : 99245,
"failed" : 0,
"current" : 0,
"time" : "2.4s",
"time_in_millis" : 2435
},
"set" : {
"count" : 379647,
"failed" : 0,
"current" : 0,
"time" : "2.2s",
"time_in_millis" : 2250
},
"set_security_user" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"uppercase" : {
"count" : 62,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"uri_parts" : {
"count" : 99245,
"failed" : 5688,
"current" : 0,
"time" : "316ms",
"time_in_millis" : 316
},
"urldecode" : {
"count" : 96383,
"failed" : 0,
"current" : 0,
"time" : "194ms",
"time_in_millis" : 194
},
"user_agent" : {
"count" : 299588,
"failed" : 61970,
"current" : 0,
"time" : "2.8s",
"time_in_millis" : 2802
}
}
}
}
}

Please format your code/logs/config using the </> button, or markdown style back ticks. It helps to make things easy to read which helps us help you :slight_smile:

This is probably the cause: you have far too many shards for the number of nodes and the amount of data in your cluster.

You should look at how you can reduce that count.

Hi Warkolm, do you mean I should reduce the shard count? I am getting this error only for this dashboard. The error messages are "Request error: node_not_connected_exception, Node not connected" and "Request error: i_o_exception, Invalid string; unexpected character: 129 hex: 81".

The issue is likely related to having too many shards for your cluster to manage efficiently. Reducing the shard count reduces the load required to manage them.

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.