I'm trying to figure out why our cluster seems to be perpetually maxed out. Node 3 does the majority of the ingesting and is consistently around 90 - 95% CPU utilization. The other two nodes vary from 30 - 85% CPU utilization. Within the last month or so, we have started regularly experiencing a few unassigned shards. The cause is always allocation failure, and manual allocation solves the problem. Our Elastic cluster is self-managed and runs on VMs in a dedicated VMware cluster. We use it as a SIEM as well as for certificate and uptime monitoring. We have a Fleet server managing about 250 Elastic agents. These are mostly Windows hosts using the System, Windows, and Sysmon integrations. Some Linux agents use the System integration. One host uses the System, Cisco Meraki, and Microsoft Defender for Endpoint integrations. Another host uses the System, TCP Custom Logs, and vSphere integrations. A couple of DHCP servers use the MS DHCP integration.
I've included the output of GET _cluster/stats below. I can post the output of other commands if needed. Any suggestions to point me in the right direction would be very helpful.
Thank you in advance.
{
"_nodes": {
"total": 3,
"successful": 3,
"failed": 0
},
"cluster_name": "xxxxxxxxxxx",
"cluster_uuid": "xxxxxxxxxxxxxxxxxxxxx",
"timestamp": 1714487275838,
"status": "green",
"indices": {
"count": 280,
"shards": {
"total": 561,
"primaries": 280,
"replication": 1.0035714285714286,
"index": {
"shards": {
"min": 2,
"max": 3,
"avg": 2.0035714285714286
},
"primaries": {
"min": 1,
"max": 1,
"avg": 1
},
"replication": {
"min": 1,
"max": 2,
"avg": 1.0035714285714286
}
}
},
"docs": {
"count": 11128208032,
"deleted": 141714
},
"store": {
"size": "6.5tb",
"size_in_bytes": 7162796933088,
"total_data_set_size": "6.5tb",
"total_data_set_size_in_bytes": 7162796933088,
"reserved": "0b",
"reserved_in_bytes": 0
},
"fielddata": {
"memory_size": "638.3kb",
"memory_size_in_bytes": 653648,
"evictions": 9619,
"global_ordinals": {
"build_time": "3.8h",
"build_time_in_millis": 13934138
}
},
"query_cache": {
"memory_size": "2.3gb",
"memory_size_in_bytes": 2518797659,
"total_count": 581852207,
"hit_count": 63986884,
"miss_count": 517865323,
"cache_size": 173123,
"cache_count": 2252620,
"evictions": 2079497
},
"completion": {
"size": "0b",
"size_in_bytes": 0
},
"segments": {
"count": 14151,
"memory": "0b",
"memory_in_bytes": 0,
"terms_memory": "0b",
"terms_memory_in_bytes": 0,
"stored_fields_memory": "0b",
"stored_fields_memory_in_bytes": 0,
"term_vectors_memory": "0b",
"term_vectors_memory_in_bytes": 0,
"norms_memory": "0b",
"norms_memory_in_bytes": 0,
"points_memory": "0b",
"points_memory_in_bytes": 0,
"doc_values_memory": "0b",
"doc_values_memory_in_bytes": 0,
"index_writer_memory": "110.9mb",
"index_writer_memory_in_bytes": 116341154,
"version_map_memory": "910.5kb",
"version_map_memory_in_bytes": 932353,
"fixed_bit_set": "5mb",
"fixed_bit_set_memory_in_bytes": 5325704,
"max_unsafe_auto_id_timestamp": 1714479426485,
"file_sizes": {}
},
...
truncated
"versions": [
"8.12.2"
],
"os": {
"available_processors": 40,
"allocated_processors": 40,
"names": [
{
"name": "Linux",
"count": 3
}
],
"pretty_names": [
{
"pretty_name": "Ubuntu 22.04.4 LTS",
"count": 3
}
],
"architectures": [
{
"arch": "amd64",
"count": 3
}
],
"mem": {
"total": "58.7gb",
"total_in_bytes": 63050027008,
"adjusted_total": "58.7gb",
"adjusted_total_in_bytes": 63050027008,
"free": "1.9gb",
"free_in_bytes": 2135818240,
"used": "56.7gb",
"used_in_bytes": 60914208768,
"free_percent": 3,
"used_percent": 97
}
},
"process": {
"cpu": {
"percent": 265
},
"open_file_descriptors": {
"min": 2766,
"max": 2974,
"avg": 2884
}
},
"jvm": {
"max_uptime": "40d",
"max_uptime_in_millis": 3463242910,
"versions": [
{
"version": "21.0.2",
"vm_name": "OpenJDK 64-Bit Server VM",
"vm_version": "21.0.2+13-58",
"vm_vendor": "Oracle Corporation",
"bundled_jdk": true,
"using_bundled_jdk": true,
"count": 3
}
],
"mem": {
"heap_used": "21.3gb",
"heap_used_in_bytes": 22924166096,
"heap_max": "30gb",
"heap_max_in_bytes": 32212254720
},
"threads": 960
},
"fs": {
"total": "8.8tb",
"total_in_bytes": 9736877236224,
"free": "2.3tb",
"free_in_bytes": 2572628639744,
"available": "1.8tb",
"available_in_bytes": 2077798400000
},
"plugins": [],
"network_types": {
"transport_types": {
"security4": 3
},
"http_types": {
"security4": 3
}
},
"discovery_types": {
"multi-node": 3
},
"packaging_types": [
{
"flavor": "default",
"type": "deb",
"count": 3
}
],
"ingest": {
"number_of_pipelines": 199,
"processor_stats": {
"append": {
"count": 11979076674,
"failed": 0,
"current": 0,
"time": "1.5d",
"time_in_millis": 133445345
},
...
truncated