Elastic Cluster Performance Issue

Hi All,

We have the following pipeline set up: Beats agents --> AWS MSK --> Logstash --> Elasticsearch. Application teams send their application logs through this pipeline. We have 8 AWS EC2 instances running in an Auto Scaling group, on which we have installed and configured Logstash to consume data from the Kafka topics. We have multiple pipelines configured in the pipelines.yml file, and these 8 Logstash instances consume from those topics and send the logs to Elasticsearch. We started onboarding applications in production and ran into performance issues with the Elasticsearch cluster under peak load. We would like help with the questions below. In production we have 9 nodes: 3 master nodes (each with 2GB), 3 hot nodes, and 3 warm nodes.

1. We often experience lag in cluster performance and end up with a cluster restart. We would like to know why the master node is restarted even when there is no peak in CPU or RAM usage on any of the nodes in the cluster. What is the root cause of this discrepancy, and how can we fix it?

2. Compute resources such as RAM and CPU are also heavily utilized, even though Elasticsearch has consumed only about half of the overall data storage. We are concerned about what will happen when we approach maximum storage usage.

Regards,
Clyton

I have a few questions:

  1. Which version of Elasticsearch are you using?
  2. What is the full output of the cluster stats API?
  3. Which nodes are Logstash set up to send indexing requests to?
  4. What type of instances and storage are the different tiers using?
  5. How many indices and shards are you actively indexing into?

Which version of Elasticsearch are you using?
We use 7.11.2.

Which nodes are Logstash set up to send indexing requests to?
We have installed Logstash on AWS EC2 instances running in an Auto Scaling group. These are standalone Logstash servers.

What type of instances and storage are the different tiers using?
For the hot nodes, we are using AWS data high.IO.I3 instances running across 3 zones, with 15GB of RAM and 480GB of storage allocated to each instance.
For the warm nodes, we are using AWS data high.storage.d3 instances running across 3 zones, with 15GB of RAM and 2.81TB of storage allocated to each instance.
For the master nodes, we are using AWS master.r5d instances running across 3 zones, with 1GB of RAM and 4GB of storage allocated to each instance.

How many indices and shards are you actively indexing into?
Indices 988 - Shards 2120

What is the full output of the cluster stats API?
<
{
"_nodes" : {
"total" : 9,
"successful" : 9,
"failed" : 0
},
"cluster_name" : "<redacted>",
"cluster_uuid" : "<redacted>",
"timestamp" : 1637739677999,
"status" : "<redacted>",
"indices" : {
"count" : 988,
"shards" : {
"total" : 2109,
"primaries" : 1059,
"replication" : 0.9915014164305949,
"index" : {
"shards" : {
"min" : 2,
"max" : 6,
"avg" : 2.1346153846153846
},
"primaries" : {
"min" : 1,
"max" : 3,
"avg" : 1.0718623481781377
},
"replication" : {
"min" : 0.0,
"max" : 1.0,
"avg" : 0.9959514170040485
}
}
},
"docs" : {
"count" : 1178299437,
"deleted" : 10960
},
"store" : {
"size_in_bytes" : 4790520703365,
"reserved_in_bytes" : 0
},
"fielddata" : {
"memory_size_in_bytes" : 819512,
"evictions" : 0
},
"query_cache" : {
"memory_size_in_bytes" : 113831864,
"total_count" : 4677144,
"hit_count" : 116577,
"miss_count" : 4560567,
"cache_size" : 3347,
"cache_count" : 4255,
"evictions" : 908
},
"completion" : {
"size_in_bytes" : 0
},
"segments" : {
"count" : 9313,
"memory_in_bytes" : 208876574,
"terms_memory_in_bytes" : 158630848,
"stored_fields_memory_in_bytes" : 5947800,
"term_vectors_memory_in_bytes" : 0,
"norms_memory_in_bytes" : 20496448,
"points_memory_in_bytes" : 0,
"doc_values_memory_in_bytes" : 23801478,
"index_writer_memory_in_bytes" : 832913588,
"version_map_memory_in_bytes" : 19719832,
"fixed_bit_set_memory_in_bytes" : 35758104,
"max_unsafe_auto_id_timestamp" : 1637736746195,
"file_sizes" : { }
},
"mappings" : {
"field_types" : [
{
"name" : "alias",
"count" : 1074,
"index_count" : 358
},
{
"name" : "boolean",
"count" : 22447,
"index_count" : 466
},
{
"name" : "byte",
"count" : 404,
"index_count" : 404
},
{
"name" : "date",
"count" : 21276,
"index_count" : 527
},
{
"name" : "date_range",
"count" : 5,
"index_count" : 5
},
{
"name" : "double",
"count" : 60187,
"index_count" : 369
},
{
"name" : "float",
"count" : 77167,
"index_count" : 415
},
{
"name" : "geo_point",
"count" : 2863,
"index_count" : 409
},
{
"name" : "histogram",
"count" : 41,
"index_count" : 41
},
{
"name" : "integer",
"count" : 8,
"index_count" : 8
},
{
"name" : "ip",
"count" : 8226,
"index_count" : 409
},
{
"name" : "keyword",
"count" : 368699,
"index_count" : 527
},
{
"name" : "long",
"count" : 814054,
"index_count" : 523
},
{
"name" : "nested",
"count" : 68,
"index_count" : 18
},
{
"name" : "object",
"count" : 877253,
"index_count" : 527
},
{
"name" : "scaled_float",
"count" : 49107,
"index_count" : 404
},
{
"name" : "text",
"count" : 26115,
"index_count" : 527
}
]
},
"analysis" : {
"char_filter_types" : [ ],
"tokenizer_types" : [ ],
"filter_types" : [ ],
"analyzer_types" : [ ],
"built_in_char_filters" : [ ],
"built_in_tokenizers" : [ ],
"built_in_filters" : [ ],
"built_in_analyzers" : [
{
"name" : "simple",
"count" : 10,
"index_count" : 5
}
]
},
"versions" : [
{
"version" : "7.11.2",
"index_count" : 989,
"primary_shard_count" : 1060,
"total_primary_bytes" : 2420304443780
}
]
},
"nodes" : {
"count" : {
"total" : 9,
"coordinating_only" : 0,
"data" : 0,
"data_cold" : 0,
"data_content" : 3,
"data_hot" : 3,
"data_warm" : 3,
"ingest" : 3,
"master" : 3,
"ml" : 0,
"remote_cluster_client" : 9,
"transform" : 3,
"voting_only" : 0
},
"versions" : [
"7.11.2"
],
"os" : {
"available_processors" : 25,
"allocated_processors" : 25,
"names" : [
{
"name" : "Linux",
"count" : 9
}
],
"pretty_names" : [
{
"pretty_name" : "CentOS Linux 8",
"count" : 9
}
],
"mem" : {
"total_in_bytes" : 148176371712,
"free_in_bytes" : 12069072896,
"used_in_bytes" : 136107298816,
"free_percent" : 8,
"used_percent" : 92
}
},
"process" : {
"cpu" : {
"percent" : 43
},
"open_file_descriptors" : {
"min" : 485,
"max" : 4579,
"avg" : 2206
}
},
"jvm" : {
"max_uptime_in_millis" : 4837564717,
"versions" : [
{
"version" : "15.0.1",
"vm_name" : "OpenJDK 64-Bit Server VM",
"vm_version" : "15.0.1+9",
"vm_vendor" : "AdoptOpenJDK",
"bundled_jdk" : true,
"using_bundled_jdk" : true,
"count" : 9
}
],
"mem" : {
"heap_used_in_bytes" : 20015079224,
"heap_max_in_bytes" : 73836527616
},
"threads" : 1173
},
"fs" : {
"total_in_bytes" : 12292196401152,
"free_in_bytes" : 7434669338624,
"available_in_bytes" : 7434669338624
},
"plugins" : [
{
"name" : "repository-s3",
"version" : "7.11.2",
"elasticsearch_version" : "7.11.2",
"java_version" : "1.8",
"description" : "The S3 repository plugin adds S3 repositories",
"classname" : "org.elasticsearch.repositories.s3.S3RepositoryPlugin",
"extended_plugins" : [ ],
"has_native_controller" : false,
"licensed" : false,
"type" : "isolated"
}
],
"network_types" : {
"transport_types" : {
"security4" : 9
},
"http_types" : {
"security4" : 9
}
},
"discovery_types" : {
"zen" : 9
},
"packaging_types" : [
{
"flavor" : "default",
"type" : "docker",
"count" : 9
}
],
"ingest" : {
"number_of_pipelines" : 7,
"processor_stats" : {
"conditional" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time_in_millis" : 0
},
"geoip" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time_in_millis" : 0
},
"gsub" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time_in_millis" : 0
},
"pipeline" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time_in_millis" : 0
},
"script" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time_in_millis" : 0
},
"user_agent" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time_in_millis" : 0
}
}
}
}
}
/>