Heap memory overflow on master nodes (continuous GC)

Hi everyone,

Recently, I encountered an increase in heap memory usage on the master nodes (heap memory overflow on the master nodes, with continuous garbage collection). I tried to debug the root cause using the heap dumps saved in storage (sample file name for reference: java_pid1.hprof), but those files are encrypted and I was unable to find anything in them.

Is this the correct way to debug the heap memory issue?
If yes, how can I get a decrypted heap dump so that I can extract proper information from it?
If not, how should I debug the heap memory issue on the master nodes?

Elasticsearch Info:

Running in Kubernetes
Dedicated 3 master nodes
3 data nodes (which are also the ingest nodes)

3 data nodes - each node spec (RAM 64GB, memory limit 32GB) - heap size 28GB - disk size 1TB
3 master nodes - each node spec (RAM 16GB, memory limit 4GB) - heap size 4GB - disk size 10GB

What is the output from the _cluster/stats?pretty&human API?

This looks strange. As Elasticsearch uses off-heap storage and also relies on the operating system page cache for performance, it is recommended to allocate at most 50% of the available memory to heap. You seem to be much higher than that, which I would expect to be a problem.

It would also help if you provided information about the version you are using and the full output of the cluster stats API.

Sorry for the late response.

{
  "_nodes" : {
    "total" : 6,
    "successful" : 6,
    "failed" : 0
  },
  "cluster_name" : "name",
  "cluster_uuid" : "xxyyyxx",
  "timestamp" : 1657527823553,
  "status" : "green",
  "indices" : {
    "count" : 753,
    "shards" : {
      "total" : 2964,
      "primaries" : 1482,
      "replication" : 1.0,
      "index" : {
        "shards" : {
          "min" : 2,
          "max" : 20,
          "avg" : 3.9362549800796813
        },
        "primaries" : {
          "min" : 1,
          "max" : 10,
          "avg" : 1.9681274900398407
        },
        "replication" : {
          "min" : 1.0,
          "max" : 1.0,
          "avg" : 1.0
        }
      }
    },
    "docs" : {
      "count" : 1128170433,
      "deleted" : 143901474
    },
    "store" : {
      "size" : "388.3gb",
      "size_in_bytes" : 416966891016,
      "total_data_set_size" : "388.3gb",
      "total_data_set_size_in_bytes" : 416966891016,
      "reserved" : "0b",
      "reserved_in_bytes" : 0
    },
    "fielddata" : {
      "memory_size" : "7.1mb",
      "memory_size_in_bytes" : 7448800,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size" : "454.6mb",
      "memory_size_in_bytes" : 476761942,
      "total_count" : 191282104,
      "hit_count" : 12653992,
      "miss_count" : 178628112,
      "cache_size" : 83228,
      "cache_count" : 300557,
      "evictions" : 217329
    },
    "completion" : {
      "size" : "0b",
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 16317,
      "memory" : "181.9mb",
      "memory_in_bytes" : 190840312,
      "terms_memory" : "134.8mb",
      "terms_memory_in_bytes" : 141379496,
      "stored_fields_memory" : "7.9mb",
      "stored_fields_memory_in_bytes" : 8329272,
      "term_vectors_memory" : "0b",
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory" : "5.1mb",
      "norms_memory_in_bytes" : 5351488,
      "points_memory" : "0b",
      "points_memory_in_bytes" : 0,
      "doc_values_memory" : "34.1mb",
      "doc_values_memory_in_bytes" : 35780056,
      "index_writer_memory" : "240.8mb",
      "index_writer_memory_in_bytes" : 252516396,
      "version_map_memory" : "8.3mb",
      "version_map_memory_in_bytes" : 8800358,
      "fixed_bit_set" : "1.5gb",
      "fixed_bit_set_memory_in_bytes" : 1703423200,
      "max_unsafe_auto_id_timestamp" : 1657497604894,
      "file_sizes" : { }
    },
    "mappings" : {
      "field_types" : [
        {
          "name" : "binary",
          "count" : 1,
          "index_count" : 1,
          "script_count" : 0
        },
        {
          "name" : "boolean",
          "count" : 4828,
          "index_count" : 742,
          "script_count" : 0
        },
        {
          "name" : "byte",
          "count" : 1,
          "index_count" : 1,
          "script_count" : 0
        },
        {
          "name" : "date",
          "count" : 2786,
          "index_count" : 745,
          "script_count" : 0
        },
        {
          "name" : "date_nanos",
          "count" : 1,
          "index_count" : 1,
          "script_count" : 0
        },
        {
          "name" : "date_range",
          "count" : 1,
          "index_count" : 1,
          "script_count" : 0
        },
        {
          "name" : "double",
          "count" : 8646,
          "index_count" : 311,
          "script_count" : 0
        },
        {
          "name" : "double_range",
          "count" : 1,
          "index_count" : 1,
          "script_count" : 0
        },
        {
          "name" : "float",
          "count" : 9690,
          "index_count" : 325,
          "script_count" : 0
        },
        {
          "name" : "float_range",
          "count" : 1,
          "index_count" : 1,
          "script_count" : 0
        },
        {
          "name" : "geo_point",
          "count" : 9,
          "index_count" : 2,
          "script_count" : 0
        },
        {
          "name" : "geo_shape",
          "count" : 1,
          "index_count" : 1,
          "script_count" : 0
        },
        {
          "name" : "half_float",
          "count" : 57,
          "index_count" : 15,
          "script_count" : 0
        },
        {
          "name" : "integer",
          "count" : 172,
          "index_count" : 16,
          "script_count" : 0
        },
        {
          "name" : "integer_range",
          "count" : 1,
          "index_count" : 1,
          "script_count" : 0
        },
        {
          "name" : "ip",
          "count" : 15,
          "index_count" : 2,
          "script_count" : 0
        },
        {
          "name" : "ip_range",
          "count" : 1,
          "index_count" : 1,
          "script_count" : 0
        },
        {
          "name" : "join",
          "count" : 310,
          "index_count" : 310,
          "script_count" : 0
        },
        {
          "name" : "keyword",
          "count" : 53096,
          "index_count" : 746,
          "script_count" : 0
        },
        {
          "name" : "long",
          "count" : 1430,
          "index_count" : 75,
          "script_count" : 0
        },
        {
          "name" : "long_range",
          "count" : 1,
          "index_count" : 1,
          "script_count" : 0
        },
        {
          "name" : "nested",
          "count" : 6635,
          "index_count" : 363,
          "script_count" : 0
        },
        {
          "name" : "object",
          "count" : 2681,
          "index_count" : 336,
          "script_count" : 0
        },
        {
          "name" : "scaled_float",
          "count" : 1,
          "index_count" : 1,
          "script_count" : 0
        },
        {
          "name" : "shape",
          "count" : 1,
          "index_count" : 1,
          "script_count" : 0
        },
        {
          "name" : "short",
          "count" : 1,
          "index_count" : 1,
          "script_count" : 0
        },
        {
          "name" : "text",
          "count" : 13806,
          "index_count" : 738,
          "script_count" : 0
        },
        {
          "name" : "wildcard",
          "count" : 1,
          "index_count" : 1,
          "script_count" : 0
        }
      ],
      "runtime_field_types" : [ ]
    },
    "analysis" : {
      "char_filter_types" : [ ],
      "tokenizer_types" : [ ],
      "filter_types" : [ ],
      "analyzer_types" : [ ],
      "built_in_char_filters" : [ ],
      "built_in_tokenizers" : [ ],
      "built_in_filters" : [ ],
      "built_in_analyzers" : [ ]
    },
    "versions" : [
      {
        "version" : "7.14.1",
        "index_count" : 753,
        "primary_shard_count" : 1482,
        "total_primary_size" : "192.9gb",
        "total_primary_bytes" : 207156458431
      }
    ]
  },
  "nodes" : {
    "count" : {
      "total" : 6,
      "coordinating_only" : 0,
      "data" : 3,
      "data_cold" : 0,
      "data_content" : 0,
      "data_frozen" : 0,
      "data_hot" : 0,
      "data_warm" : 0,
      "ingest" : 3,
      "master" : 3,
      "ml" : 0,
      "remote_cluster_client" : 0,
      "transform" : 0,
      "voting_only" : 0
    },
    "versions" : [
      "7.14.1"
    ],
    "os" : {
      "available_processors" : 6,
      "allocated_processors" : 6,
      "names" : [
        {
          "name" : "Linux",
          "count" : 6
        }
      ],
      "pretty_names" : [
        {
          "pretty_name" : "CentOS Linux 8",
          "count" : 6
        }
      ],
      "architectures" : [
        {
          "arch" : "amd64",
          "count" : 6
        }
      ],
      "mem" : {
        "total" : "111gb",
        "total_in_bytes" : 119185342464,
        "free" : "2.2gb",
        "free_in_bytes" : 2431660032,
        "used" : "108.7gb",
        "used_in_bytes" : 116753682432,
        "free_percent" : 2,
        "used_percent" : 98
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 6
      },
      "open_file_descriptors" : {
        "min" : 464,
        "max" : 6518,
        "avg" : 3480
      }
    },
    "jvm" : {
      "max_uptime" : "248d",
      "max_uptime_in_millis" : 21427790587,
      "versions" : [
        {
          "version" : "16.0.2",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "16.0.2+7",
          "vm_vendor" : "Eclipse Foundation",
          "bundled_jdk" : true,
          "using_bundled_jdk" : true,
          "count" : 6
        }
      ],
      "mem" : {
        "heap_used" : "59.1gb",
        "heap_used_in_bytes" : 63552823240,
        "heap_max" : "96gb",
        "heap_max_in_bytes" : 103079215104
      },
      "threads" : 361
    },
    "fs" : {
      "total" : "2.9tb",
      "total_in_bytes" : 3198599565312,
      "free" : "2.5tb",
      "free_in_bytes" : 2765769703424,
      "available" : "2.5tb",
      "available_in_bytes" : 2765669040128
    },
    "plugins" : [
      {
        "name" : "repository-azure",
        "version" : "7.14.1",
        "elasticsearch_version" : "7.14.1",
        "java_version" : "1.8",
        "description" : "The Azure Repository plugin adds support for Azure storage repositories.",
        "classname" : "org.elasticsearch.repositories.azure.AzureRepositoryPlugin",
        "extended_plugins" : [ ],
        "has_native_controller" : false,
        "licensed" : false,
        "type" : "isolated"
      }
    ],
    "network_types" : {
      "transport_types" : {
        "security4" : 6
      },
      "http_types" : {
        "security4" : 6
      }
    },
    "discovery_types" : {
      "zen" : 6
    },
    "packaging_types" : [
      {
        "flavor" : "default",
        "type" : "docker",
        "count" : 6
      }
    ],
    "ingest" : {
      "number_of_pipelines" : 13,
      "processor_stats" : {
        "conditional" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "geoip" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "gsub" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "script" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "set" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        }
      }
    }
  }
}

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.