Elasticsearch suddenly at 100% CPU usage and high memory usage

Hello

I haven't touched any configuration on the Elastic Stack in about a month, and all of a sudden we are seeing the CPU at 100% and very high memory usage.

I'm surprised that this has been going on for about 8 hours, so I'd like to know why.

Here is the output of the cluster stats API (`GET _cluster/stats?human&pretty`):


{
  "_nodes" : {
    "total" : 1,
    "successful" : 1,
    "failed" : 0
  },
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "X-f567g34-urd34567d-x",
  "timestamp" : 1629358136738,
  "status" : "yellow",
  "indices" : {
    "count" : 4028,
    "shards" : {
      "total" : 4028,
      "primaries" : 4028,
      "replication" : 0.0,
      "index" : {
        "shards" : {
          "min" : 1,
          "max" : 1,
          "avg" : 1.0
        },
        "primaries" : {
          "min" : 1,
          "max" : 1,
          "avg" : 1.0
        },
        "replication" : {
          "min" : 0.0,
          "max" : 0.0,
          "avg" : 0.0
        }
      }
    },
    "docs" : {
      "count" : 239472341,
      "deleted" : 2812
    },
    "store" : {
      "size" : "170.3gb",
      "size_in_bytes" : 182964144349,
      "reserved" : "0b",
      "reserved_in_bytes" : 0
    },
    "fielddata" : {
      "memory_size" : "0b",
      "memory_size_in_bytes" : 0,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size" : "0b",
      "memory_size_in_bytes" : 0,
      "total_count" : 0,
      "hit_count" : 0,
      "miss_count" : 0,
      "cache_size" : 0,
      "cache_count" : 0,
      "evictions" : 0
    },
    "completion" : {
      "size" : "0b",
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 27325,
      "memory" : "883.1mb",
      "memory_in_bytes" : 926019254,
      "terms_memory" : "735.5mb",
      "terms_memory_in_bytes" : 771241896,
      "stored_fields_memory" : "13.1mb",
      "stored_fields_memory_in_bytes" : 13770856,
      "term_vectors_memory" : "0b",
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory" : "102.8mb",
      "norms_memory_in_bytes" : 107853184,
      "points_memory" : "0b",
      "points_memory_in_bytes" : 0,
      "doc_values_memory" : "31.6mb",
      "doc_values_memory_in_bytes" : 33153318,
      "index_writer_memory" : "563.1mb",
      "index_writer_memory_in_bytes" : 590503336,
      "version_map_memory" : "10.9mb",
      "version_map_memory_in_bytes" : 11435800,
      "fixed_bit_set" : "4.4mb",
      "fixed_bit_set_memory_in_bytes" : 4690952,
      "max_unsafe_auto_id_timestamp" : 1629332500632,
      "file_sizes" : { }
    },
    "mappings" : {
      "field_types" : [
        {
          "name" : "alias",
          "count" : 5817,
          "index_count" : 172
        },
        {
          "name" : "binary",
          "count" : 14,
          "index_count" : 3
        },
        {
          "name" : "boolean",
          "count" : 19602,
          "index_count" : 186
        },
        {
          "name" : "byte",
          "count" : 1,
          "index_count" : 1
        },
        {
          "name" : "constant_keyword",
          "count" : 3,
          "index_count" : 1
        },
        {
          "name" : "date",
          "count" : 34452,
          "index_count" : 4009
        },
        {
          "name" : "double",
          "count" : 5749,
          "index_count" : 173
        },
        {
          "name" : "flattened",
          "count" : 1377,
          "index_count" : 172
        },
        {
          "name" : "float",
          "count" : 6163,
          "index_count" : 194
        },
        {
          "name" : "geo_point",
          "count" : 1946,
          "index_count" : 572
        },
        {
          "name" : "half_float",
          "count" : 1119,
          "index_count" : 555
        },
        {
          "name" : "integer",
          "count" : 95,
          "index_count" : 6
        },
        {
          "name" : "ip",
          "count" : 21038,
          "index_count" : 328
        },
        {
          "name" : "keyword",
          "count" : 865536,
          "index_count" : 4010
        },
        {
          "name" : "long",
          "count" : 196018,
          "index_count" : 4005
        },
        {
          "name" : "nested",
          "count" : 533,
          "index_count" : 180
        },
        {
          "name" : "object",
          "count" : 174394,
          "index_count" : 3933
        },
        {
          "name" : "scaled_float",
          "count" : 121,
          "index_count" : 1
        },
        {
          "name" : "short",
          "count" : 17271,
          "index_count" : 171
        },
        {
          "name" : "text",
          "count" : 254912,
          "index_count" : 4008
        }
      ]
    },
    "analysis" : {
      "char_filter_types" : [ ],
      "tokenizer_types" : [ ],
      "filter_types" : [
        {
          "name" : "pattern_capture",
          "count" : 1,
          "index_count" : 1
        }
      ],
      "analyzer_types" : [
        {
          "name" : "custom",
          "count" : 1,
          "index_count" : 1
        }
      ],
      "built_in_char_filters" : [ ],
      "built_in_tokenizers" : [
        {
          "name" : "uax_url_email",
          "count" : 1,
          "index_count" : 1
        }
      ],
      "built_in_filters" : [
        {
          "name" : "lowercase",
          "count" : 1,
          "index_count" : 1
        },
        {
          "name" : "unique",
          "count" : 1,
          "index_count" : 1
        }
      ],
      "built_in_analyzers" : [ ]
    }
  },
  "nodes" : {
    "count" : {
      "total" : 1,
      "coordinating_only" : 0,
      "data" : 1,
      "data_cold" : 1,
      "data_content" : 1,
      "data_hot" : 1,
      "data_warm" : 1,
      "ingest" : 1,
      "master" : 1,
      "ml" : 1,
      "remote_cluster_client" : 1,
      "transform" : 1,
      "voting_only" : 0
    },
    "versions" : [
      "7.10.1"
    ],
    "os" : {
      "available_processors" : 4,
      "allocated_processors" : 4,
      "names" : [
        {
          "name" : "Linux",
          "count" : 1
        }
      ],
      "pretty_names" : [
        {
          "pretty_name" : "CentOS Linux 7 (Core)",
          "count" : 1
        }
      ],
      "mem" : {
        "total" : "31.2gb",
        "total_in_bytes" : 33530023936,
        "free" : "239.5mb",
        "free_in_bytes" : 251133952,
        "used" : "30.9gb",
        "used_in_bytes" : 33278889984,
        "free_percent" : 1,
        "used_percent" : 99
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 80
      },
      "open_file_descriptors" : {
        "min" : 22644,
        "max" : 22644,
        "avg" : 22644
      }
    },
    "jvm" : {
      "max_uptime" : "15.3h",
      "max_uptime_in_millis" : 55150606,
      "versions" : [
        {
          "version" : "15.0.1",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "15.0.1+9",
          "vm_vendor" : "AdoptOpenJDK",
          "bundled_jdk" : true,
          "using_bundled_jdk" : true,
          "count" : 1
        }
      ],
      "mem" : {
        "heap_used" : "11.3gb",
        "heap_used_in_bytes" : 12217795064,
        "heap_max" : "16gb",
        "heap_max_in_bytes" : 17179869184
      },
      "threads" : 184
    },
    "fs" : {
      "total" : "299.9gb",
      "total_in_bytes" : 322065928192,
      "free" : "126gb",
      "free_in_bytes" : 135381614592,
      "available" : "126gb",
      "available_in_bytes" : 135381614592
    },
    "plugins" : [ ],
    "network_types" : {
      "transport_types" : {
        "security4" : 1
      },
      "http_types" : {
        "security4" : 1
      }
    },
    "discovery_types" : {
      "single-node" : 1
    },
    "packaging_types" : [
      {
        "flavor" : "default",
        "type" : "rpm",
        "count" : 1
      }
    ],
    "ingest" : {
      "number_of_pipelines" : 21,
      "processor_stats" : {
        "conditional" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "convert" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "date" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "foreach" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "geoip" : {
          "count" : 305288,
          "failed" : 0,
          "current" : 0,
          "time" : "4.9s",
          "time_in_millis" : 4932
        },
        "grok" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "gsub" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "json" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "lowercase" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "pipeline" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "remove" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "rename" : {
          "count" : 305288,
          "failed" : 0,
          "current" : 0,
          "time" : "899ms",
          "time_in_millis" : 899
        },
        "script" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "set" : {
          "count" : 152644,
          "failed" : 0,
          "current" : 0,
          "time" : "801ms",
          "time_in_millis" : 801
        },
        "user_agent" : {
          "count" : 152644,
          "failed" : 0,
          "current" : 0,
          "time" : "1.2s",
          "time_in_millis" : 1291
        }
      }
    }
  }
}

You can use the hot threads API (`GET _nodes/hot_threads`) to figure out which part of the Elasticsearch code is consuming the most CPU.

That's one of the things that helped me figure out that it was Metricbeat and its collection of metrics on localhost.

I increased the collection period from the default 5s to 60s, and the problem seems to have gone away.