Hey Mark,
Version is 7.12.0, though this has happened consistently for the last several versions (starting at 7.10.0, which is what I first installed this cluster on). Another thing I probably should've mentioned is that each node is running in a Docker container.
Scrolling through the logs at the time of the event yesterday, pretty much everything I'm seeing looks like this:
[elasticsearch.server][WARN] failed to retrieve shard stats from node [GPhxT1vaSJe28koqMC1lrA]: [es-data1][***][indices:monitor/stats[n]] request_id [51668292] timed out after [15005ms]
These come along with corresponding messages showing the eventual responses, arriving between 25s and 15m later.
Here's the stats output now (no issue at the moment; I'll try to capture another one later today for comparison if the problem happens again):
{
"_nodes" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"cluster_name" : "my-cluster",
"cluster_uuid" : "my-uuid",
"timestamp" : 1622034173125,
"status" : "green",
"indices" : {
"count" : 206,
"shards" : {
"total" : 206,
"primaries" : 206,
"replication" : 0.0,
"index" : {
"shards" : {
"min" : 1,
"max" : 1,
"avg" : 1.0
},
"primaries" : {
"min" : 1,
"max" : 1,
"avg" : 1.0
},
"replication" : {
"min" : 0.0,
"max" : 0.0,
"avg" : 0.0
}
}
},
"docs" : {
"count" : 1992111813,
"deleted" : 22353
},
"store" : {
"size_in_bytes" : 507784642462,
"reserved_in_bytes" : 0
},
"fielddata" : {
"memory_size_in_bytes" : 182552,
"evictions" : 0
},
"query_cache" : {
"memory_size_in_bytes" : 648062,
"total_count" : 29206871,
"hit_count" : 2047801,
"miss_count" : 27159070,
"cache_size" : 5416,
"cache_count" : 45240,
"evictions" : 39824
},
"completion" : {
"size_in_bytes" : 0
},
"segments" : {
"count" : 5273,
"memory_in_bytes" : 76208396,
"terms_memory_in_bytes" : 55658840,
"stored_fields_memory_in_bytes" : 2879816,
"term_vectors_memory_in_bytes" : 0,
"norms_memory_in_bytes" : 7688448,
"points_memory_in_bytes" : 0,
"doc_values_memory_in_bytes" : 9981292,
"index_writer_memory_in_bytes" : 3178399772,
"version_map_memory_in_bytes" : 0,
"fixed_bit_set_memory_in_bytes" : 250832,
"max_unsafe_auto_id_timestamp" : 1621858669266,
"file_sizes" : { }
},
"mappings" : {
"field_types" : [
{
"name" : "boolean",
"count" : 275,
"index_count" : 130
},
{
"name" : "constant_keyword",
"count" : 2,
"index_count" : 1
},
{
"name" : "date",
"count" : 299,
"index_count" : 197
},
{
"name" : "date_nanos",
"count" : 182,
"index_count" : 182
},
{
"name" : "double",
"count" : 182,
"index_count" : 182
},
{
"name" : "float",
"count" : 851,
"index_count" : 182
},
{
"name" : "ip",
"count" : 1,
"index_count" : 1
},
{
"name" : "keyword",
"count" : 6019,
"index_count" : 197
},
{
"name" : "long",
"count" : 1407,
"index_count" : 189
},
{
"name" : "nested",
"count" : 8,
"index_count" : 8
},
{
"name" : "object",
"count" : 368,
"index_count" : 196
},
{
"name" : "text",
"count" : 6599,
"index_count" : 197
}
]
},
"analysis" : {
"char_filter_types" : [ ],
"tokenizer_types" : [ ],
"filter_types" : [ ],
"analyzer_types" : [ ],
"built_in_char_filters" : [ ],
"built_in_tokenizers" : [ ],
"built_in_filters" : [ ],
"built_in_analyzers" : [ ]
},
"versions" : [
{
"version" : "7.10.2",
"index_count" : 9,
"primary_shard_count" : 9,
"total_primary_bytes" : 6599500
},
{
"version" : "7.11.0",
"index_count" : 9,
"primary_shard_count" : 9,
"total_primary_bytes" : 76385336
},
{
"version" : "7.12.0",
"index_count" : 188,
"primary_shard_count" : 188,
"total_primary_bytes" : 507701657626
}
]
},
"nodes" : {
"count" : {
"total" : 5,
"coordinating_only" : 0,
"data" : 1,
"data_cold" : 0,
"data_content" : 0,
"data_frozen" : 0,
"data_hot" : 0,
"data_warm" : 0,
"ingest" : 1,
"master" : 3,
"ml" : 0,
"remote_cluster_client" : 0,
"transform" : 0,
"voting_only" : 0
},
"versions" : [
"7.12.0"
],
"os" : {
"available_processors" : 640,
"allocated_processors" : 640,
"names" : [
{
"name" : "Linux",
"count" : 5
}
],
"pretty_names" : [
{
"pretty_name" : "CentOS Linux 8",
"count" : 5
}
],
"architectures" : [
{
"arch" : "amd64",
"count" : 5
}
],
"mem" : {
"total_in_bytes" : 1349931417600,
"free_in_bytes" : 21592543232,
"used_in_bytes" : 1328338874368,
"free_percent" : 2,
"used_percent" : 98
}
},
"process" : {
"cpu" : {
"percent" : 1
},
"open_file_descriptors" : {
"min" : 754,
"max" : 8621,
"avg" : 2348
}
},
"jvm" : {
"max_uptime_in_millis" : 4987556072,
"versions" : [
{
"version" : "15.0.1",
"vm_name" : "OpenJDK 64-Bit Server VM",
"vm_version" : "15.0.1+9",
"vm_vendor" : "AdoptOpenJDK",
"bundled_jdk" : true,
"using_bundled_jdk" : true,
"count" : 5
}
],
"mem" : {
"heap_used_in_bytes" : 38936636224,
"heap_max_in_bytes" : 94489280512
},
"threads" : 1367
},
"fs" : {
"total_in_bytes" : 4148140335104,
"free_in_bytes" : 3294954549248,
"available_in_bytes" : 3084122091520
},
"plugins" : [ ],
"network_types" : {
"transport_types" : {
"netty4" : 5
},
"http_types" : {
"netty4" : 5
}
},
"discovery_types" : {
"zen" : 5
},
"packaging_types" : [
{
"flavor" : "default",
"type" : "docker",
"count" : 5
}
],
"ingest" : {
"number_of_pipelines" : 68,
"processor_stats" : {
"append" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time_in_millis" : 0
},
"conditional" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time_in_millis" : 0
},
"convert" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time_in_millis" : 0
},
"date" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time_in_millis" : 0
},
"dot_expander" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time_in_millis" : 0
},
"geoip" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time_in_millis" : 0
},
"grok" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time_in_millis" : 0
},
"gsub" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time_in_millis" : 0
},
"json" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time_in_millis" : 0
},
"pipeline" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time_in_millis" : 0
},
"remove" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time_in_millis" : 0
},
"rename" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time_in_millis" : 0
},
"script" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time_in_millis" : 0
},
"set" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time_in_millis" : 0
},
"split" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time_in_millis" : 0
},
"user_agent" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time_in_millis" : 0
}
}
}
}
}