Reported CPU usage from Elasticsearch node gets stuck/frozen

I have a randomly occurring issue where, after being up for some time, nodes stop reporting CPU usage correctly. An affected node seems to keep repeating the last value it reported right before it stopped working.

Visible in the Elastic Monitoring dashboard as seen in the image

But also seen when checking stats directly

GET _cat/nodes?v

For example, it always returns "14" in the CPU column for one of the nodes

Other stats seem to work as expected.

Details about the cluster:

  • Elasticsearch 7.10.0
  • Running on Windows servers
  • JVM: AdoptOpenJDK 11.0.10+9

Current workaround is restarting nodes when this happens.

I have not found anything relevant in the logs. I do not know how to reproduce it directly, but it happens consistently on several nodes in the cluster.

Is there any known bug that might explain this? Any suggestions for a better workaround than restarting nodes?

Elasticsearch 7.10 is EOL and no longer supported. Please upgrade ASAP.

(This is an automated response from your friendly Elastic bot. Please report this post if you have any suggestions or concerns :elasticheart: )

Here's a capture of when it stops working, at 00:20:

What is the full output of the cluster stats API?

Here it is!

Output from _cluster/stats:
{
  "_nodes" : {
    "total" : 8,
    "successful" : 8,
    "failed" : 0
  },
  "cluster_name" : "anonymized",
  "cluster_uuid" : "anonymized",
  "timestamp" : 1669645645867,
  "status" : "green",
  "indices" : {
    "count" : 42,
    "shards" : {
      "total" : 230,
      "primaries" : 84,
      "replication" : 1.7380952380952381,
      "index" : {
        "shards" : {
          "min" : 2,
          "max" : 48,
          "avg" : 5.476190476190476
        },
        "primaries" : {
          "min" : 1,
          "max" : 16,
          "avg" : 2.0
        },
        "replication" : {
          "min" : 1.0,
          "max" : 2.0,
          "avg" : 1.4761904761904763
        }
      }
    },
    "docs" : {
      "count" : 19006719,
      "deleted" : 8742387
    },
    "store" : {
      "size_in_bytes" : 68984597029,
      "reserved_in_bytes" : 0
    },
    "fielddata" : {
      "memory_size_in_bytes" : 762298144,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size_in_bytes" : 150091842,
      "total_count" : 225572496269,
      "hit_count" : 15275357598,
      "miss_count" : 210297138671,
      "cache_size" : 47077,
      "cache_count" : 39137914,
      "evictions" : 39090837
    },
    "completion" : {
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 2169,
      "memory_in_bytes" : 133452460,
      "terms_memory_in_bytes" : 91241480,
      "stored_fields_memory_in_bytes" : 1109160,
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory_in_bytes" : 14232512,
      "points_memory_in_bytes" : 0,
      "doc_values_memory_in_bytes" : 26869308,
      "index_writer_memory_in_bytes" : 193859920,
      "version_map_memory_in_bytes" : 10572714,
      "fixed_bit_set_memory_in_bytes" : 9870536,
      "max_unsafe_auto_id_timestamp" : 1669593602923,
      "file_sizes" : { }
    },
    "mappings" : {
      "field_types" : [
        {
          "name" : "binary",
          "count" : 9,
          "index_count" : 1
        },
        {
          "name" : "boolean",
          "count" : 120,
          "index_count" : 28
        },
        {
          "name" : "byte",
          "count" : 2,
          "index_count" : 2
        },
        {
          "name" : "date",
          "count" : 192,
          "index_count" : 35
        },
        {
          "name" : "flattened",
          "count" : 9,
          "index_count" : 1
        },
        {
          "name" : "float",
          "count" : 146,
          "index_count" : 10
        },
        {
          "name" : "geo_point",
          "count" : 1,
          "index_count" : 1
        },
        {
          "name" : "half_float",
          "count" : 56,
          "index_count" : 14
        },
        {
          "name" : "integer",
          "count" : 203,
          "index_count" : 14
        },
        {
          "name" : "join",
          "count" : 2,
          "index_count" : 2
        },
        {
          "name" : "keyword",
          "count" : 1265,
          "index_count" : 38
        },
        {
          "name" : "long",
          "count" : 1273,
          "index_count" : 27
        },
        {
          "name" : "nested",
          "count" : 37,
          "index_count" : 14
        },
        {
          "name" : "object",
          "count" : 1066,
          "index_count" : 38
        },
        {
          "name" : "scaled_float",
          "count" : 8,
          "index_count" : 2
        },
        {
          "name" : "short",
          "count" : 4,
          "index_count" : 2
        },
        {
          "name" : "text",
          "count" : 765,
          "index_count" : 30
        }
      ]
    },
    "analysis" : {
      "char_filter_types" : [
        {
          "name" : "pattern_replace",
          "count" : 2,
          "index_count" : 2
        }
      ],
      "tokenizer_types" : [
        {
          "name" : "path_hierarchy",
          "count" : 4,
          "index_count" : 4
        }
      ],
      "filter_types" : [
        {
          "name" : "edge_ngram",
          "count" : 6,
          "index_count" : 6
        },
        {
          "name" : "shingle",
          "count" : 2,
          "index_count" : 2
        },
        {
          "name" : "stemmer",
          "count" : 6,
          "index_count" : 6
        },
        {
          "name" : "stop",
          "count" : 6,
          "index_count" : 6
        },
        {
          "name" : "synonym",
          "count" : 12,
          "index_count" : 6
        }
      ],
      "analyzer_types" : [
        {
          "name" : "custom",
          "count" : 30,
          "index_count" : 6
        }
      ],
      "built_in_char_filters" : [ ],
      "built_in_tokenizers" : [
        {
          "name" : "keyword",
          "count" : 4,
          "index_count" : 4
        },
        {
          "name" : "standard",
          "count" : 22,
          "index_count" : 6
        }
      ],
      "built_in_filters" : [
        {
          "name" : "lowercase",
          "count" : 24,
          "index_count" : 6
        },
        {
          "name" : "unique",
          "count" : 2,
          "index_count" : 2
        }
      ],
      "built_in_analyzers" : [
        {
          "name" : "keyword",
          "count" : 21,
          "index_count" : 3
        },
        {
          "name" : "standard",
          "count" : 97,
          "index_count" : 6
        },
        {
          "name" : "swedish",
          "count" : 37,
          "index_count" : 6
        }
      ]
    }
  },
  "nodes" : {
    "count" : {
      "total" : 8,
      "coordinating_only" : 0,
      "data" : 8,
      "data_cold" : 8,
      "data_content" : 8,
      "data_hot" : 8,
      "data_warm" : 8,
      "ingest" : 8,
      "master" : 8,
      "ml" : 0,
      "remote_cluster_client" : 8,
      "transform" : 8,
      "voting_only" : 0
    },
    "versions" : [
      "7.10.0"
    ],
    "os" : {
      "available_processors" : 64,
      "allocated_processors" : 64,
      "names" : [
        {
          "name" : "Windows Server 2016",
          "count" : 8
        }
      ],
      "pretty_names" : [
        {
          "pretty_name" : "Windows Server 2016",
          "count" : 8
        }
      ],
      "mem" : {
        "total_in_bytes" : 137430433792,
        "free_in_bytes" : 39297904640,
        "used_in_bytes" : 98132529152,
        "free_percent" : 29,
        "used_percent" : 71
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 144
      },
      "open_file_descriptors" : {
        "min" : -1,
        "max" : -1,
        "avg" : 0
      }
    },
    "jvm" : {
      "max_uptime_in_millis" : 29984030790,
      "versions" : [
        {
          "version" : "11.0.10",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "11.0.10+9",
          "vm_vendor" : "AdoptOpenJDK",
          "bundled_jdk" : true,
          "using_bundled_jdk" : false,
          "count" : 8
        }
      ],
      "mem" : {
        "heap_used_in_bytes" : 28380411328,
        "heap_max_in_bytes" : 68161634304
      },
      "threads" : 665
    },
    "fs" : {
      "total_in_bytes" : 1716896366592,
      "free_in_bytes" : 1564103098368,
      "available_in_bytes" : 1564103098368
    },
    "plugins" : [ ],
    "network_types" : {
      "transport_types" : {
        "netty4" : 8
      },
      "http_types" : {
        "netty4" : 8
      }
    },
    "discovery_types" : {
      "zen" : 8
    },
    "packaging_types" : [
      {
        "flavor" : "default",
        "type" : "zip",
        "count" : 8
      }
    ],
    "ingest" : {
      "number_of_pipelines" : 3,
      "processor_stats" : {
        "gsub" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 0
        },
        "rename" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 0
        },
        "script" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 0
        },
        "set" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 0
        }
      }
    }
  }
}

I do not see anything specific in the stats. I do not know if there are any issues around this that may have been identified/fixed since the version you are using, but would nevertheless recommend upgrading to Elasticsearch 7.17.