GC issues causing high CPU

Hello,

I have set up a 2-node cluster for our production environment, using the t3a.xlarge instance type. APM and log analysis are currently enabled for our cluster, and we frequently observe high CPU usage. On checking the logs, I saw the following messages being written frequently. (We have 15 days of data retention.)

[gc][813203] overhead, spent [273ms] collecting in the last [1s]

[gc][813310] overhead, spent [366ms] collecting in the last [1s]

Should we use a different instance type to resolve the issue?

What is the output from the _cluster/stats?pretty&human API?

{
"_nodes" : {
"total" : 2,
"successful" : 2,
"failed" : 0
},
"cluster_name" : "prod",
"cluster_uuid" : "tBg5ZA3rTV66DcM_GzR4zA",
"timestamp" : 1655358303053,
"status" : "green",
"indices" : {
"count" : 501,
"shards" : {
"total" : 1002,
"primaries" : 501,
"replication" : 1.0,
"index" : {
"shards" : {
"min" : 2,
"max" : 2,
"avg" : 2.0
},
"primaries" : {
"min" : 1,
"max" : 1,
"avg" : 1.0
},
"replication" : {
"min" : 1.0,
"max" : 1.0,
"avg" : 1.0
}
}
},
"docs" : {
"count" : 929011407,
"deleted" : 790
},
"store" : {
"size" : "506.3gb",
"size_in_bytes" : 543699572244,
"total_data_set_size" : "506.3gb",
"total_data_set_size_in_bytes" : 543699572244,
"reserved" : "0b",
"reserved_in_bytes" : 0
},
"fielddata" : {
"memory_size" : "62kb",
"memory_size_in_bytes" : 63576,
"evictions" : 0
},
"query_cache" : {
"memory_size" : "979.7kb",
"memory_size_in_bytes" : 1003241,
"total_count" : 650025,
"hit_count" : 1469,
"miss_count" : 648556,
"cache_size" : 1001,
"cache_count" : 1053,
"evictions" : 52
},
"completion" : {
"size" : "0b",
"size_in_bytes" : 0
},
"segments" : {
"count" : 12047,
"memory" : "142.9mb",
"memory_in_bytes" : 149877076,
"terms_memory" : "117.2mb",
"terms_memory_in_bytes" : 122901136,
"stored_fields_memory" : "6.1mb",
"stored_fields_memory_in_bytes" : 6467144,
"term_vectors_memory" : "0b",
"term_vectors_memory_in_bytes" : 0,
"norms_memory" : "1.8mb",
"norms_memory_in_bytes" : 1970496,
"points_memory" : "0b",
"points_memory_in_bytes" : 0,
"doc_values_memory" : "17.6mb",
"doc_values_memory_in_bytes" : 18538300,
"index_writer_memory" : "219.9mb",
"index_writer_memory_in_bytes" : 230624216,
"version_map_memory" : "2.6kb",
"version_map_memory_in_bytes" : 2689,
"fixed_bit_set" : "112.9mb",
"fixed_bit_set_memory_in_bytes" : 118392704,
"max_unsafe_auto_id_timestamp" : 1655356237648,
"file_sizes" : { }
},
"mappings" : {
"field_types" : [
{
"name" : "alias",
"count" : 13888,
"index_count" : 413,
"script_count" : 0
},
{
"name" : "binary",
"count" : 1,
"index_count" : 1,
"script_count" : 0
},
{
"name" : "boolean",
"count" : 53956,
"index_count" : 420,
"script_count" : 0
},
{
"name" : "byte",
"count" : 414,
"index_count" : 414,
"script_count" : 0
},
{
"name" : "constant_keyword",
"count" : 1241,
"index_count" : 414,
"script_count" : 0
},
{
"name" : "date",
"count" : 67760,
"index_count" : 470,
"script_count" : 0
},
{
"name" : "date_nanos",
"count" : 1,
"index_count" : 1,
"script_count" : 0
},
{
"name" : "date_range",
"count" : 1,
"index_count" : 1,
"script_count" : 0
},
{
"name" : "double",
"count" : 14246,
"index_count" : 386,
"script_count" : 0
},
{
"name" : "double_range",
"count" : 1,
"index_count" : 1,
"script_count" : 0
},
{
"name" : "flattened",
"count" : 11550,
"index_count" : 385,
"script_count" : 0
},
{
"name" : "float",
"count" : 13845,
"index_count" : 418,
"script_count" : 0
},
{
"name" : "float_range",
"count" : 1,
"index_count" : 1,
"script_count" : 0
},
{
"name" : "geo_point",
"count" : 3662,
"index_count" : 414,
"script_count" : 0
},
{
"name" : "geo_shape",
"count" : 1,
"index_count" : 1,
"script_count" : 0
},
{
"name" : "half_float",
"count" : 1,
"index_count" : 1,
"script_count" : 0
},
{
"name" : "histogram",
"count" : 28,
"index_count" : 28,
"script_count" : 0
},
{
"name" : "integer",
"count" : 1,
"index_count" : 1,
"script_count" : 0
},
{
"name" : "integer_range",
"count" : 1,
"index_count" : 1,
"script_count" : 0
},
{
"name" : "ip",
"count" : 51599,
"index_count" : 415,
"script_count" : 0
},
{
"name" : "ip_range",
"count" : 386,
"index_count" : 386,
"script_count" : 0
},
{
"name" : "keyword",
"count" : 1657206,
"index_count" : 470,
"script_count" : 0
},
{
"name" : "long",
"count" : 402626,
"index_count" : 468,
"script_count" : 0
},
{
"name" : "long_range",
"count" : 1,
"index_count" : 1,
"script_count" : 0
},
{
"name" : "nested",
"count" : 1159,
"index_count" : 389,
"script_count" : 0
},
{
"name" : "object",
"count" : 337108,
"index_count" : 469,
"script_count" : 0
},
{
"name" : "scaled_float",
"count" : 669,
"index_count" : 413,
"script_count" : 0
},
{
"name" : "shape",
"count" : 1,
"index_count" : 1,
"script_count" : 0
},
{
"name" : "short",
"count" : 39656,
"index_count" : 386,
"script_count" : 0
},
{
"name" : "text",
"count" : 43946,
"index_count" : 469,
"script_count" : 0
},
{
"name" : "wildcard",
"count" : 385,
"index_count" : 385,
"script_count" : 0
}
],
"runtime_field_types" : [ ]
},
"analysis" : {
"char_filter_types" : [ ],
"tokenizer_types" : [ ],
"filter_types" : [ ],
"analyzer_types" : [ ],
"built_in_char_filters" : [ ],
"built_in_tokenizers" : [ ],
"built_in_filters" : [ ],
"built_in_analyzers" : [ ]
},
"versions" : [
{
"version" : "7.14.1",
"index_count" : 501,
"primary_shard_count" : 501,
"total_primary_size" : "253gb",
"total_primary_bytes" : 271666909010
}
]
},
"nodes" : {
"count" : {
"total" : 2,
"coordinating_only" : 0,
"data" : 2,
"data_cold" : 2,
"data_content" : 2,
"data_frozen" : 2,
"data_hot" : 2,
"data_warm" : 2,
"ingest" : 2,
"master" : 2,
"ml" : 2,
"remote_cluster_client" : 2,
"transform" : 2,
"voting_only" : 0
},
"versions" : [
"7.14.1"
],
"os" : {
"available_processors" : 8,
"allocated_processors" : 8,
"names" : [
{
"name" : "Linux",
"count" : 2
}
],
"pretty_names" : [
{
"pretty_name" : "CentOS Linux 7 (Core)",
"count" : 2
}
],
"architectures" : [
{
"arch" : "amd64",
"count" : 2
}
],
"mem" : {
"total" : "30.7gb",
"total_in_bytes" : 33015570432,
"free" : "315.8mb",
"free_in_bytes" : 331194368,
"used" : "30.4gb",
"used_in_bytes" : 32684376064,
"free_percent" : 1,
"used_percent" : 99
}
},
"process" : {
"cpu" : {
"percent" : 92
},
"open_file_descriptors" : {
"min" : 6297,
"max" : 6629,
"avg" : 6463
}
},
"jvm" : {
"max_uptime" : "9.4d",
"max_uptime_in_millis" : 817443937,
"versions" : [
{
"version" : "16.0.2",
"vm_name" : "OpenJDK 64-Bit Server VM",
"vm_version" : "16.0.2+7",
"vm_vendor" : "Eclipse Foundation",
"bundled_jdk" : true,
"using_bundled_jdk" : true,
"count" : 2
}
],
"mem" : {
"heap_used" : "10.9gb",
"heap_used_in_bytes" : 11743826552,
"heap_max" : "17gb",
"heap_max_in_bytes" : 18253611008
},
"threads" : 372
},
"fs" : {
"total" : "999.7gb",
"total_in_bytes" : 1073479680000,
"free" : "470.5gb",
"free_in_bytes" : 505206370304,
"available" : "470.5gb",
"available_in_bytes" : 505206370304
},
"plugins" : [ ],
"network_types" : {
"transport_types" : {
"security4" : 2
},
"http_types" : {
"security4" : 2
}
},
"discovery_types" : {
"zen" : 2
},
"packaging_types" : [
{
"flavor" : "default",
"type" : "rpm",
"count" : 2
}
],
"ingest" : {
"number_of_pipelines" : 28,
"processor_stats" : {
"append" : {
"count" : 40808,
"failed" : 0,
"current" : 0,
"time" : "112ms",
"time_in_millis" : 112
},
"conditional" : {
"count" : 232491123,
"failed" : 0,
"current" : 0,
"time" : "1.2h",
"time_in_millis" : 4549319
},
"convert" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"date" : {
"count" : 20404,
"failed" : 0,
"current" : 0,
"time" : "2.9s",
"time_in_millis" : 2961
},
"geoip" : {
"count" : 195352782,
"failed" : 0,
"current" : 0,
"time" : "47.1m",
"time_in_millis" : 2827331
},
"grok" : {
"count" : 20404,
"failed" : 0,
"current" : 0,
"time" : "503ms",
"time_in_millis" : 503
},
"gsub" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"pipeline" : {
"count" : 781411124,
"failed" : 0,
"current" : 1,
"time" : "3.3h",
"time_in_millis" : 11992363
},
"remove" : {
"count" : 20404,
"failed" : 0,
"current" : 0,
"time" : "110ms",
"time_in_millis" : 110
},
"rename" : {
"count" : 20404,
"failed" : 0,
"current" : 0,
"time" : "167ms",
"time_in_millis" : 167
},
"script" : {
"count" : 40808,
"failed" : 0,
"current" : 0,
"time" : "1s",
"time_in_millis" : 1010
},
"set" : {
"count" : 40808,
"failed" : 0,
"current" : 0,
"time" : "1s",
"time_in_millis" : 1070
},
"set_security_user" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"user_agent" : {
"count" : 195352782,
"failed" : 0,
"current" : 0,
"time" : "39m",
"time_in_millis" : 2345554
}
}
}
}
}

I would recommend instances without burstable CPU as GC can be CPU intensive and take longer if you run out of CPU credits or overload your cluster.

I would also recommend adding a dedicated master node in order to be highly available. A minimum of 3 master-eligible nodes is required for this. If this node is configured to be voting-only, it may be able to run on a small t3a instance, as it does not hold data or serve queries. Make sure you do not send data or queries to it, though.

Thanks for the reply, Christian. I would like to know whether Elastic recommends any specific Java version for 7.14.1, as we are using openjdk version "1.8.0_312". Also, as per the Elastic recommendation for a 2-node cluster, one node should be set to node.master="false", but in our case we have set both nodes to true. When we restart the master, the cluster turns red and the cluster health API returns:

{
"error" : {
"root_cause" : [
{
"type" : "master_not_discovered_exception",
"reason" : null
}
],
"type" : "master_not_discovered_exception",
"reason" : null
},
"status" : 503
}

Should we set the other way around?

A two node cluster can never be highly available, so whenever you lose a master eligible node your cluster will go red. That is why you need to add a third node as per the link I provided.

If we move to a 3-node cluster setup and the current master node goes down, I believe it should ideally elect one of the other nodes as master. I have this config in elasticsearch.yml on all 3 nodes:

discovery.seed_hosts: ["node-1","node-2", "node-3"]
cluster.initial_master_nodes: ["node-1","node-2", "node-3"]
node.data: true
node.master: true

Is there any other config I should add so that the election process happens and the cluster does not go into a red state even if 1 node goes down? I saw in the Elastic docs that this setting should be configured in order to require 2 master-eligible nodes:

discovery.zen.minimum_master_nodes: 2

This no longer applies from version 7.x onwards.

Thanks for the help, Christian. I will make the changes and see how the cluster performs.

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.