Hi everyone,
My ES cluster has become very slow, even though I have not changed the data volume or the number of shards.
Our ES cluster runs version 8.6.0 with these nodes:
- es01: "data_content","data_hot","ingest","master","transform"
- es02: "data_content","data_hot","ingest","master","transform"
- es03: "master", "voting_only"
- es04: "data_cold"
On es04 I see these WARN messages repeatedly:
[2023-02-08T14:20:21,909][WARN ][o.e.t.OutboundHandler ] [es04] sending transport message [Response{316186378}{false}{false}{false}{class org.elasticsearch.action.support.broadcast.node.TransportBroadcastByNodeAction$NodeResponse}] of size [857237] on [Netty4TcpChannel{localAddress=/<es04_ip>:9300, remoteAddress=/<es01_ip>:29994, profile=default}] took [5231ms] which is above the warn threshold of [5000ms] with success [true]
[2023-02-08T14:20:32,165][WARN ][o.e.t.OutboundHandler ] [es04] sending transport message [Response{316189298}{false}{false}{false}{class org.elasticsearch.action.support.broadcast.node.TransportBroadcastByNodeAction$NodeResponse}] of size [857237] on [Netty4TcpChannel{localAddress=/<es04_ip>:9300, remoteAddress=/<es01_ip>:29996, profile=default}] took [5507ms] which is above the warn threshold of [5000ms] with success [true]
On es01 I get these warnings:
[2023-02-08T13:16:30,590][WARN ][o.e.c.InternalClusterInfoService] [es01] failed to retrieve stats for node [Nmd_-r9DRFSheCUqLn8jPw] org.elasticsearch.transport.ReceiveTimeoutTransportException: [es04][<es04_ip>:9300][cluster:monitor/nodes/stats[n]] request_id [315291918] timed out after [15008ms]
[2023-02-08T13:16:42,222][WARN ][o.e.t.TransportService ] [es01] Received response for a request that has timed out, sent [26.8s/26815ms] ago, timed out [11.8s/11807ms] ago, action [cluster:monitor/nodes/stats[n]], node [{es04}{Nmd_-r9DRFSheCUqLn8jPw}{80PziYyxTHeGsjEtC-DDtg}{es04}{<es04_ip>}{<es04_ip>:9300}{c}{xpack.installed=true}], id [315291918]
The output of the _cluster/stats?pretty&human API:
{
"_nodes": {
"total": 4,
"successful": 4,
"failed": 0
},
"cluster_name": "<my_cluster>",
"cluster_uuid": "KlOFqtglR6a8BDUVMN3_Dw",
"timestamp": 1676101153599,
"status": "green",
"indices": {
"count": 389,
"shards": {
"total": 537,
"primaries": 389,
"replication": 0.38046272493573263,
"index": {
"shards": {
"min": 1,
"max": 2,
"avg": 1.3804627249357326
},
"primaries": {
"min": 1,
"max": 1,
"avg": 1
},
"replication": {
"min": 0,
"max": 1,
"avg": 0.38046272493573263
}
}
},
"docs": {
"count": 19565100046,
"deleted": 314738
},
"store": {
"size": "7.9tb",
"size_in_bytes": 8713808466017,
"total_data_set_size": "7.9tb",
"total_data_set_size_in_bytes": 8713808466017,
"reserved": "0b",
"reserved_in_bytes": 0
},
"fielddata": {
"memory_size": "53.2mb",
"memory_size_in_bytes": 55833664,
"evictions": 0
},
"query_cache": {
"memory_size": "29.8mb",
"memory_size_in_bytes": 31261813,
"total_count": 116574573,
"hit_count": 6427948,
"miss_count": 110146625,
"cache_size": 23034,
"cache_count": 47878,
"evictions": 24844
},
"completion": {
"size": "0b",
"size_in_bytes": 0
},
"segments": {
"count": 8322,
"memory": "0b",
"memory_in_bytes": 0,
"terms_memory": "0b",
"terms_memory_in_bytes": 0,
"stored_fields_memory": "0b",
"stored_fields_memory_in_bytes": 0,
"term_vectors_memory": "0b",
"term_vectors_memory_in_bytes": 0,
"norms_memory": "0b",
"norms_memory_in_bytes": 0,
"points_memory": "0b",
"points_memory_in_bytes": 0,
"doc_values_memory": "0b",
"doc_values_memory_in_bytes": 0,
"index_writer_memory": "86.7mb",
"index_writer_memory_in_bytes": 90975452,
"version_map_memory": "123.5kb",
"version_map_memory_in_bytes": 126534,
"fixed_bit_set": "1.5gb",
"fixed_bit_set_memory_in_bytes": 1616717456,
"max_unsafe_auto_id_timestamp": 1676096426797,
"file_sizes": {}
},
"mappings": {
"total_field_count": 278869,
"total_deduplicated_field_count": 128808,
"total_deduplicated_mapping_size": "614.4kb",
"total_deduplicated_mapping_size_in_bytes": 629220,
"field_types": [
{
"name": "alias",
"count": 2423,
"index_count": 27,
"script_count": 0
},
{
"name": "binary",
"count": 1,
"index_count": 1,
"script_count": 0
},
{
"name": "boolean",
"count": 3882,
"index_count": 303,
"script_count": 0
},
{
"name": "byte",
"count": 3,
"index_count": 3,
"script_count": 0
},
{
"name": "constant_keyword",
"count": 1135,
"index_count": 298,
"script_count": 0
},
{
"name": "date",
"count": 8926,
"index_count": 337,
"script_count": 0
},
{
"name": "date_nanos",
"count": 1,
"index_count": 1,
"script_count": 0
},
{
"name": "date_range",
"count": 21,
"index_count": 21,
"script_count": 0
},
{
"name": "double",
"count": 818,
"index_count": 22,
"script_count": 0
},
{
"name": "double_range",
"count": 1,
"index_count": 1,
"script_count": 0
},
{
"name": "flattened",
"count": 1487,
"index_count": 149,
"script_count": 0
},
{
"name": "float",
"count": 1915,
"index_count": 172,
"script_count": 0
},
{
"name": "float_range",
"count": 1,
"index_count": 1,
"script_count": 0
},
{
"name": "geo_point",
"count": 1206,
"index_count": 223,
"script_count": 0
},
{
"name": "geo_shape",
"count": 1,
"index_count": 1,
"script_count": 0
},
{
"name": "half_float",
"count": 33,
"index_count": 12,
"script_count": 0
},
{
"name": "histogram",
"count": 3,
"index_count": 3,
"script_count": 0
},
{
"name": "integer",
"count": 15,
"index_count": 13,
"script_count": 0
},
{
"name": "integer_range",
"count": 1,
"index_count": 1,
"script_count": 0
},
{
"name": "ip",
"count": 2827,
"index_count": 321,
"script_count": 0
},
{
"name": "ip_range",
"count": 1,
"index_count": 1,
"script_count": 0
},
{
"name": "keyword",
"count": 163665,
"index_count": 338,
"script_count": 0
},
{
"name": "long",
"count": 29241,
"index_count": 290,
"script_count": 0
},
{
"name": "long_range",
"count": 1,
"index_count": 1,
"script_count": 0
},
{
"name": "match_only_text",
"count": 7311,
"index_count": 238,
"script_count": 0
},
{
"name": "nested",
"count": 1698,
"index_count": 154,
"script_count": 0
},
{
"name": "object",
"count": 48130,
"index_count": 334,
"script_count": 0
},
{
"name": "scaled_float",
"count": 617,
"index_count": 155,
"script_count": 0
},
{
"name": "shape",
"count": 1,
"index_count": 1,
"script_count": 0
},
{
"name": "short",
"count": 425,
"index_count": 10,
"script_count": 0
},
{
"name": "text",
"count": 616,
"index_count": 160,
"script_count": 0
},
{
"name": "unsigned_long",
"count": 31,
"index_count": 7,
"script_count": 0
},
{
"name": "version",
"count": 22,
"index_count": 22,
"script_count": 0
},
{
"name": "wildcard",
"count": 2410,
"index_count": 191,
"script_count": 0
}
],
"runtime_field_types": []
},
"analysis": {
"char_filter_types": [],
"tokenizer_types": [],
"filter_types": [],
"analyzer_types": [],
"built_in_char_filters": [],
"built_in_tokenizers": [],
"built_in_filters": [],
"built_in_analyzers": []
},
"versions": [
{
"version": "8.0.0",
"index_count": 22,
"primary_shard_count": 22,
"total_primary_size": "18.2mb",
"total_primary_bytes": 19104002
},
{
"version": "8.1.2",
"index_count": 192,
"primary_shard_count": 192,
"total_primary_size": "2.2tb",
"total_primary_bytes": 2496656326799
},
{
"version": "8.6.0",
"index_count": 175,
"primary_shard_count": 175,
"total_primary_size": "5.2tb",
"total_primary_bytes": 5783902462356
}
],
"search": {
"total": 807643,
"queries": {
"regexp": 4254,
"bool": 687574,
"prefix": 3599,
"match": 300864,
"range": 380379,
"nested": 33,
"wildcard": 3,
"multi_match": 200,
"match_phrase": 146783,
"terms": 161705,
"constant_score": 859,
"match_phrase_prefix": 45,
"ids": 4028,
"match_all": 109621,
"exists": 357960,
"term": 515557,
"simple_query_string": 76982,
"query_string": 8713
},
"sections": {
"highlight": 270,
"search_after": 7330,
"stored_fields": 608,
"runtime_mappings": 144929,
"query": 718033,
"script_fields": 608,
"_source": 13261,
"pit": 19416,
"terminate_after": 156,
"fields": 116887,
"collapse": 50607,
"aggs": 191626
}
}
},
"nodes": {
"count": {
"total": 4,
"coordinating_only": 0,
"data": 0,
"data_cold": 1,
"data_content": 2,
"data_frozen": 0,
"data_hot": 2,
"data_warm": 0,
"index": 0,
"ingest": 2,
"master": 3,
"ml": 0,
"remote_cluster_client": 0,
"search": 0,
"transform": 2,
"voting_only": 1
},
"versions": [
"8.6.0"
],
"os": {
"available_processors": 48,
"allocated_processors": 48,
"names": [
{
"name": "Linux",
"count": 4
}
],
"pretty_names": [
{
"pretty_name": "Ubuntu 18.04.6 LTS",
"count": 4
}
],
"architectures": [
{
"arch": "amd64",
"count": 4
}
],
"mem": {
"total": "92gb",
"total_in_bytes": 98881466368,
"adjusted_total": "92gb",
"adjusted_total_in_bytes": 98881466368,
"free": "4.7gb",
"free_in_bytes": 5093277696,
"used": "87.3gb",
"used_in_bytes": 93788188672,
"free_percent": 5,
"used_percent": 95
}
},
"process": {
"cpu": {
"percent": 46
},
"open_file_descriptors": {
"min": 560,
"max": 4248,
"avg": 2143
}
},
"jvm": {
"max_uptime": "3d",
"max_uptime_in_millis": 261641669,
"versions": [
{
"version": "19.0.1",
"vm_name": "OpenJDK 64-Bit Server VM",
"vm_version": "19.0.1+10-21",
"vm_vendor": "Oracle Corporation",
"bundled_jdk": true,
"using_bundled_jdk": true,
"count": 4
}
],
"mem": {
"heap_used": "13.8gb",
"heap_used_in_bytes": 14855281888,
"heap_max": "44.1gb",
"heap_max_in_bytes": 47404023808
},
"threads": 502
},
"fs": {
"total": "16.7tb",
"total_in_bytes": 18373235757056,
"free": "8.7tb",
"free_in_bytes": 9591671062528,
"available": "8tb",
"available_in_bytes": 8810034135040
},
"plugins": [],
"network_types": {
"transport_types": {
"security4": 4
},
"http_types": {
"security4": 4
}
},
"discovery_types": {
"multi-node": 4
},
"packaging_types": [
{
"flavor": "default",
"type": "deb",
"count": 4
}
],
"ingest": {
"number_of_pipelines": 319,
"processor_stats": {
"append": {
"count": 140343861,
"failed": 0,
"current": 0,
"time": "2.1m",
"time_in_millis": 126061
},
"community_id": {
"count": 410689266,
"failed": 518684,
"current": 0,
"time": "54.9m",
"time_in_millis": 3294763
},
"conditional": {
"count": 8518573873,
"failed": 108781,
"current": 0,
"time": "1.1d",
"time_in_millis": 95615210
},
"convert": {
"count": 3536505778,
"failed": 252735416,
"current": 0,
"time": "1.2h",
"time_in_millis": 4529251
},
"csv": {
"count": 4851,
"failed": 0,
"current": 0,
"time": "126ms",
"time_in_millis": 126
},
"date": {
"count": 1970806100,
"failed": 66603464,
"current": 0,
"time": "4.5h",
"time_in_millis": 16390025
},
"dissect": {
"count": 365017597,
"failed": 334171895,
"current": 0,
"time": "59.2m",
"time_in_millis": 3552113
},
"dot_expander": {
"count": 331475023,
"failed": 0,
"current": 0,
"time": "17.7m",
"time_in_millis": 1066733
},
"enrich": {
"count": 3138570,
"failed": 0,
"current": 0,
"time": "2m",
"time_in_millis": 120119
},
"fingerprint": {
"count": 10065915,
"failed": 0,
"current": 0,
"time": "2.5m",
"time_in_millis": 154302
},
"foreach": {
"count": 214567,
"failed": 0,
"current": 0,
"time": "626ms",
"time_in_millis": 626
},
"geoip": {
"count": 2094619950,
"failed": 380,
"current": 0,
"time": "6.9h",
"time_in_millis": 25072611
},
"grok": {
"count": 1277350232,
"failed": 265091616,
"current": 1,
"time": "14.9d",
"time_in_millis": 1289944840
},
"gsub": {
"count": 17972404,
"failed": 0,
"current": 0,
"time": "6.2m",
"time_in_millis": 372964
},
"json": {
"count": 32745164,
"failed": 8009,
"current": 0,
"time": "22.9m",
"time_in_millis": 1375130
},
"kv": {
"count": 491418,
"failed": 52107,
"current": 0,
"time": "18.3s",
"time_in_millis": 18318
},
"lowercase": {
"count": 589830566,
"failed": 371402793,
"current": 0,
"time": "29.9m",
"time_in_millis": 1799212
},
"pipeline": {
"count": 647692291,
"failed": 0,
"current": 0,
"time": "4.6m",
"time_in_millis": 276820
},
"registered_domain": {
"count": 4911936,
"failed": 0,
"current": 0,
"time": "43.9s",
"time_in_millis": 43929
},
"remove": {
"count": 2663944271,
"failed": 0,
"current": 0,
"time": "1h",
"time_in_millis": 3928697
},
"rename": {
"count": 4958622563,
"failed": 59068390,
"current": 0,
"time": "2.3h",
"time_in_millis": 8527009
},
"script": {
"count": 1190769152,
"failed": 5163,
"current": 0,
"time": "2.2h",
"time_in_millis": 8114448
},
"set": {
"count": 4968648433,
"failed": 0,
"current": 0,
"time": "4.4h",
"time_in_millis": 16039357
},
"set_security_user": {
"count": 638794053,
"failed": 0,
"current": 0,
"time": "1h",
"time_in_millis": 3939516
},
"split": {
"count": 585292767,
"failed": 0,
"current": 0,
"time": "15m",
"time_in_millis": 902501
},
"trim": {
"count": 421192,
"failed": 1126,
"current": 0,
"time": "506ms",
"time_in_millis": 506
},
"uppercase": {
"count": 69231,
"failed": 0,
"current": 0,
"time": "64ms",
"time_in_millis": 64
},
"uri_parts": {
"count": 42710844,
"failed": 186081,
"current": 0,
"time": "2.3m",
"time_in_millis": 140765
},
"urldecode": {
"count": 37123722,
"failed": 0,
"current": 0,
"time": "1.4m",
"time_in_millis": 86386
},
"user_agent": {
"count": 72010796,
"failed": 5192,
"current": 0,
"time": "36.4m",
"time_in_millis": 2186592
}
}
},
"indexing_pressure": {
"memory": {
"current": {
"combined_coordinating_and_primary": "0b",
"combined_coordinating_and_primary_in_bytes": 0,
"coordinating": "0b",
"coordinating_in_bytes": 0,
"primary": "0b",
"primary_in_bytes": 0,
"replica": "0b",
"replica_in_bytes": 0,
"all": "0b",
"all_in_bytes": 0
},
"total": {
"combined_coordinating_and_primary": "0b",
"combined_coordinating_and_primary_in_bytes": 0,
"coordinating": "0b",
"coordinating_in_bytes": 0,
"primary": "0b",
"primary_in_bytes": 0,
"replica": "0b",
"replica_in_bytes": 0,
"all": "0b",
"all_in_bytes": 0,
"coordinating_rejections": 0,
"primary_rejections": 0,
"replica_rejections": 0
},
"limit": "0b",
"limit_in_bytes": 0
}
}
}
}
CPU, disk, and heap usage are all at normal levels, and the cluster does not have many shards.
Can you help me find out the reason why my cluster is so slow?
Thanks.