I have a cluster with 3 master nodes, 6 coordinating nodes, and 18 data nodes. The elected master node is at 95% CPU usage. I'm looking through the logs and all I see is what it has always been doing: deleting a lot of indices, creating indices, updating number_of_replicas to [17], and failing to delete indices. The indices have 1 primary shard and then auto-expand replicas to the number of nodes in the cluster. Is there a way to figure out what causes this spike in CPU? With relational databases I can see high CPU attached to a session, but with Elasticsearch I have no idea what, besides the main Elasticsearch process, is consuming the CPU. Any ideas?
Which version of Elasticsearch are you using? What is the full output of the cluster stats API?
6.3.0 for this cluster
{
"_nodes" : {
"total" : 27,
"successful" : 27,
"failed" : 0
},
"cluster_name" : "collections-es-prod-sv1-clust1",
"timestamp" : 1587051365623,
"status" : "yellow",
"indices" : {
"count" : 752,
"shards" : {
"total" : 13218,
"primaries" : 794,
"replication" : 15.64735516372796,
"index" : {
"shards" : {
"min" : 1,
"max" : 18,
"avg" : 17.57712765957447
},
"primaries" : {
"min" : 1,
"max" : 3,
"avg" : 1.0558510638297873
},
"replication" : {
"min" : 0.0,
"max" : 17.0,
"avg" : 16.465425531914892
}
}
},
"docs" : {
"count" : 695439887,
"deleted" : 3809340
},
"store" : {
"size_in_bytes" : 380576073343
},
"fielddata" : {
"memory_size_in_bytes" : 477149208,
"evictions" : 0
},
"query_cache" : {
"memory_size_in_bytes" : 693712980,
"total_count" : 309429821560,
"hit_count" : 102252621322,
"miss_count" : 207177200238,
"cache_size" : 89081,
"cache_count" : 194851129,
"evictions" : 194762048
},
"completion" : {
"size_in_bytes" : 0
},
"segments" : {
"count" : 16021,
"memory_in_bytes" : 1644048161,
"terms_memory_in_bytes" : 1337830886,
"stored_fields_memory_in_bytes" : 102774216,
"term_vectors_memory_in_bytes" : 0,
"norms_memory_in_bytes" : 40175680,
"points_memory_in_bytes" : 61448087,
"doc_values_memory_in_bytes" : 101819292,
"index_writer_memory_in_bytes" : 249337559,
"version_map_memory_in_bytes" : 3132648,
"fixed_bit_set_memory_in_bytes" : 12512280,
"max_unsafe_auto_id_timestamp" : 1586998799305,
"file_sizes" : { }
}
},
"nodes" : {
"count" : {
"total" : 27,
"data" : 18,
"coordinating_only" : 6,
"master" : 3,
"ingest" : 18
},
"versions" : [
"6.3.0"
],
"os" : {
"available_processors" : 630,
"allocated_processors" : 630,
"names" : [
{
"name" : "Linux",
"count" : 27
}
],
"mem" : {
"total_in_bytes" : 777469087744,
"free_in_bytes" : 8972398592,
"used_in_bytes" : 768496689152,
"free_percent" : 1,
"used_percent" : 99
}
},
"process" : {
"cpu" : {
"percent" : 340
},
"open_file_descriptors" : {
"min" : 797,
"max" : 4271,
"avg" : 3121
}
},
"jvm" : {
"max_uptime_in_millis" : 14601826655,
"versions" : [
{
"version" : "1.8.0_181",
"vm_name" : "Java HotSpot(TM) 64-Bit Server VM",
"vm_version" : "25.181-b13",
"vm_vendor" : "Oracle Corporation",
"count" : 26
},
{
"version" : "1.8.0_191",
"vm_name" : "Java HotSpot(TM) 64-Bit Server VM",
"vm_version" : "25.191-b12",
"vm_vendor" : "Oracle Corporation",
"count" : 1
}
],
"mem" : {
"heap_used_in_bytes" : 184591717688,
"heap_max_in_bytes" : 398571601920
},
"threads" : 7627
},
"fs" : {
"total_in_bytes" : 1423069433856,
"free_in_bytes" : 896091295744,
"available_in_bytes" : 823166480384
},
"plugins" : [ ],
"network_types" : {
"transport_types" : {
"netty4" : 27
},
"http_types" : {
"netty4" : 27
}
}
}
}
curl -XGET "http://localhost:9200/_cluster/stats/nodes/_master?pretty"
{
"_nodes" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"cluster_name" : "collections-es-prod-sv1-clust1",
"timestamp" : 1587051652295,
"status" : "yellow",
"indices" : {
"count" : 0,
"shards" : { },
"docs" : {
"count" : 0,
"deleted" : 0
},
"store" : {
"size_in_bytes" : 0
},
"fielddata" : {
"memory_size_in_bytes" : 0,
"evictions" : 0
},
"query_cache" : {
"memory_size_in_bytes" : 0,
"total_count" : 0,
"hit_count" : 0,
"miss_count" : 0,
"cache_size" : 0,
"cache_count" : 0,
"evictions" : 0
},
"completion" : {
"size_in_bytes" : 0
},
"segments" : {
"count" : 0,
"memory_in_bytes" : 0,
"terms_memory_in_bytes" : 0,
"stored_fields_memory_in_bytes" : 0,
"term_vectors_memory_in_bytes" : 0,
"norms_memory_in_bytes" : 0,
"points_memory_in_bytes" : 0,
"doc_values_memory_in_bytes" : 0,
"index_writer_memory_in_bytes" : 0,
"version_map_memory_in_bytes" : 0,
"fixed_bit_set_memory_in_bytes" : 0,
"max_unsafe_auto_id_timestamp" : -9223372036854775808,
"file_sizes" : { }
}
},
"nodes" : {
"count" : {
"total" : 1,
"data" : 0,
"coordinating_only" : 0,
"master" : 1,
"ingest" : 0
},
"versions" : [
"6.3.0"
],
"os" : {
"available_processors" : 2,
"allocated_processors" : 2,
"names" : [
{
"name" : "Linux",
"count" : 1
}
],
"mem" : {
"total_in_bytes" : 6250946560,
"free_in_bytes" : 369209344,
"used_in_bytes" : 5881737216,
"free_percent" : 6,
"used_percent" : 94
}
},
"process" : {
"cpu" : {
"percent" : 70
},
"open_file_descriptors" : {
"min" : 798,
"max" : 798,
"avg" : 798
}
},
"jvm" : {
"max_uptime_in_millis" : 14602113827,
"versions" : [
{
"version" : "1.8.0_181",
"vm_name" : "Java HotSpot(TM) 64-Bit Server VM",
"vm_version" : "25.181-b13",
"vm_vendor" : "Oracle Corporation",
"count" : 1
}
],
"mem" : {
"heap_used_in_bytes" : 2154657120,
"heap_max_in_bytes" : 3203792896
},
"threads" : 110
},
"fs" : {
"total_in_bytes" : 52706275328,
"free_in_bytes" : 52641660928,
"available_in_bytes" : 49940742144
},
"plugins" : [ ],
"network_types" : {
"transport_types" : {
"netty4" : 1
},
"http_types" : {
"netty4" : 1
}
}
}
}
curl -XGET "http://localhost:9200/_cluster/stats/nodes/_all?pretty"
{
"_nodes" : {
"total" : 27,
"successful" : 27,
"failed" : 0
},
"cluster_name" : "collections-es-prod-sv1-clust1",
"timestamp" : 1587051838246,
"status" : "yellow",
"indices" : {
"count" : 859,
"shards" : {
"total" : 15171,
"primaries" : 901,
"replication" : 15.83795782463929,
"index" : {
"shards" : {
"min" : 2,
"max" : 18,
"avg" : 17.661233993015134
},
"primaries" : {
"min" : 1,
"max" : 3,
"avg" : 1.048894062863795
},
"replication" : {
"min" : 1.0,
"max" : 17.0,
"avg" : 16.563445867287545
}
}
},
"docs" : {
"count" : 696965474,
"deleted" : 3814663
},
"store" : {
"size_in_bytes" : 392347372859
},
"fielddata" : {
"memory_size_in_bytes" : 512541992,
"evictions" : 0
},
"query_cache" : {
"memory_size_in_bytes" : 816635041,
"total_count" : 309444757573,
"hit_count" : 102256816510,
"miss_count" : 207187941063,
"cache_size" : 114273,
"cache_count" : 194895773,
"evictions" : 194781500
},
"completion" : {
"size_in_bytes" : 0
},
"segments" : {
"count" : 17608,
"memory_in_bytes" : 1710541425,
"terms_memory_in_bytes" : 1386618300,
"stored_fields_memory_in_bytes" : 106743624,
"term_vectors_memory_in_bytes" : 0,
"norms_memory_in_bytes" : 44505280,
"points_memory_in_bytes" : 62676509,
"doc_values_memory_in_bytes" : 109997712,
"index_writer_memory_in_bytes" : 201429124,
"version_map_memory_in_bytes" : 2254908,
"fixed_bit_set_memory_in_bytes" : 14392480,
"max_unsafe_auto_id_timestamp" : 1586998799305,
"file_sizes" : { }
}
},
"nodes" : {
"count" : {
"total" : 27,
"data" : 18,
"coordinating_only" : 6,
"master" : 3,
"ingest" : 18
},
"versions" : [
"6.3.0"
],
"os" : {
"available_processors" : 630,
"allocated_processors" : 630,
"names" : [
{
"name" : "Linux",
"count" : 27
}
],
"mem" : {
"total_in_bytes" : 777469087744,
"free_in_bytes" : 8858959872,
"used_in_bytes" : 768610127872,
"free_percent" : 1,
"used_percent" : 99
}
},
"process" : {
"cpu" : {
"percent" : 228
},
"open_file_descriptors" : {
"min" : 797,
"max" : 4684,
"avg" : 3398
}
},
"jvm" : {
"max_uptime_in_millis" : 14602299571,
"versions" : [
{
"version" : "1.8.0_181",
"vm_name" : "Java HotSpot(TM) 64-Bit Server VM",
"vm_version" : "25.181-b13",
"vm_vendor" : "Oracle Corporation",
"count" : 26
},
{
"version" : "1.8.0_191",
"vm_name" : "Java HotSpot(TM) 64-Bit Server VM",
"vm_version" : "25.191-b12",
"vm_vendor" : "Oracle Corporation",
"count" : 1
}
],
"mem" : {
"heap_used_in_bytes" : 190709018496,
"heap_max_in_bytes" : 398571601920
},
"threads" : 7622
},
"fs" : {
"total_in_bytes" : 1423069433856,
"free_in_bytes" : 865727467520,
"available_in_bytes" : 792802656256
},
"plugins" : [ ],
"network_types" : {
"transport_types" : {
"netty4" : 27
},
"http_types" : {
"netty4" : 27
}
}
}
}
If the dedicated master node is experiencing high CPU, this often means that it is busy updating the cluster state and propagating these changes across the cluster. This can be caused by having a lot of shards (which seems to be the case here) or by frequently updating mappings.
I would also recommend checking in the logs whether the master node is suffering from long or frequent GC.
What numbers are you looking at in the data I sent you?
"indices" : {
"count" : 859,
"shards" : {
"total" : 15171,
and
"store" : {
"size_in_bytes" : 392347372859
},
It looks like you have a lot of very small shards, which can be quite inefficient.
You mentioned creating and deleting indices. Is this something that happens frequently?
Yes, you are correct. A lot of small indices keep getting added and deleted in batches every 15 minutes. It wasn't as much of a problem before, but all of a sudden CPU has spiked over the last couple of days, so I'm not sure what tipped it over. It would be nice if there were a way in Elasticsearch to get more details on exactly what queries cause the high CPU usage, but I suspect you are correct that the master node is just doing too much of everything given the number of shards.
What is the use case? How come you are using Elasticsearch in this very unusual way?
This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.