HEAP Memory usage over 90%

Hi everyone,

I'm facing an issue with the memory usage of ES.
We have a 5 nodes cluster to store our logs sent from graylog.
I've been searching and reading around for a week now. I made some changes to my configuration, but nothing seems to be relevant or helping me to solve this issue.

Here is my elasticsearch.yml

    bootstrap.memory_lock: true
    network.host: 0.0.0.0
    http.port: 9200
    discovery.zen.ping.unicast.hosts: ["log01", "log02", "log03", "log04", "log05"]
    discovery.zen.minimum_master_nodes: 3
    cluster.routing.allocation.disk.watermark.low: 90%
    cluster.routing.allocation.disk.watermark.high: 95%
    indices.fielddata.cache.size: 20%
    indices.memory.index_buffer_size: 20%

My jvm.options:

    -Dcom.sun.management.jmxremote
    -Dcom.sun.management.jmxremote.port=9999
    -Dcom.sun.management.jmxremote.authenticate=false
    -Dcom.sun.management.jmxremote.ssl=false
    -Dcom.sun.management.jmxremote.local.only=false
    -Xms31g
    -Xmx31g

    -XX:NewRatio=2
    -XX:ParallelGCThreads=20
    -XX:+UseConcMarkSweepGC
    -XX:CMSInitiatingOccupancyFraction=75
    -XX:+UseCMSInitiatingOccupancyOnly

    10-:-XX:-UseConcMarkSweepGC
    10-:-XX:-UseCMSInitiatingOccupancyOnly
    10-:-XX:+UseG1GC
    10-:-XX:InitiatingHeapOccupancyPercent=75

    -Des.networkaddress.cache.ttl=60
    -Des.networkaddress.cache.negative.ttl=10

    -XX:+AlwaysPreTouch
    -Xss1m

    -Djava.awt.headless=true
    -Dfile.encoding=UTF-8
    -Djna.nosys=true

    -XX:-OmitStackTraceInFastThrow

    -Dio.netty.noUnsafe=true
    -Dio.netty.noKeySetOptimization=true
    -Dio.netty.recycler.maxCapacityPerThread=0

    -Dlog4j.shutdownHookEnabled=false
    -Dlog4j2.disable.jmx=true

    -Djava.io.tmpdir=${ES_TMPDIR}

    -XX:HeapDumpPath=/var/lib/elasticsearch
    -XX:ErrorFile=/var/log/elasticsearch/hs_err_pid%p.log

Some more information:

    {
      "name" : "log01.sib.fr",
      "cluster_name" : "elasticC01",
      "cluster_uuid" : "xxxxxxxxxxxxxxxx",
      "version" : {
        "number" : "6.8.13",
        "build_flavor" : "default",
        "build_type" : "rpm",
        "build_hash" : "be13c69",
        "build_date" : "2020-10-16T09:09:46.555371Z",
        "build_snapshot" : false,
        "lucene_version" : "7.7.3",
        "minimum_wire_compatibility_version" : "5.6.0",
        "minimum_index_compatibility_version" : "5.0.0"
      },
      "tagline" : "You Know, for Search"
    }

Every few days I have to restart my whole cluster because it's going down due to memory usage.
I am planning to make special nodes only to do the "master" role but I guess it won't be enough.
I am certainly missing something here (I'm quite new to ES, at least the recent versions).

Any help would be appreciated.

Regards,

I forgot my stats, my bad.

Regards,

[root@log01 ~]# curl -XGET 'http://localhost:9200/_cluster/stats?human&pretty'
{
  "_nodes" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "cluster_name" : "elasticC01",
  "cluster_uuid" : "xxxxxxxxxxxxxxxxxx",
  "timestamp" : 1613466489724,
  "status" : "green",
  "indices" : {
    "count" : 321,
    "shards" : {
      "total" : 2519,
      "primaries" : 1400,
      "replication" : 0.7992857142857143,
      "index" : {
        "shards" : {
          "min" : 3,
          "max" : 12,
          "avg" : 7.8473520249221185
        },
        "primaries" : {
          "min" : 3,
          "max" : 6,
          "avg" : 4.361370716510903
        },
        "replication" : {
          "min" : 0.0,
          "max" : 1.0,
          "avg" : 0.8286604361370716
        }
      }
    },
    "docs" : {
      "count" : 63602436209,
      "deleted" : 856
    },
    "store" : {
      "size" : "34.6tb",
      "size_in_bytes" : 38061661250625
    },
    "fielddata" : {
      "memory_size" : "11.7kb",
      "memory_size_in_bytes" : 12000,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size" : "12.1gb",
      "memory_size_in_bytes" : 13063084075,
      "total_count" : 223375738,
      "hit_count" : 609769,
      "miss_count" : 222765969,
      "cache_size" : 36354,
      "cache_count" : 42704,
      "evictions" : 6350
    },
    "completion" : {
      "size" : "0b",
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 7092,
      "memory" : "102.9gb",
      "memory_in_bytes" : 110521518138,
      "terms_memory" : "85.8gb",
      "terms_memory_in_bytes" : 92194416956,
      "stored_fields_memory" : "14.8gb",
      "stored_fields_memory_in_bytes" : 15897791288,
      "term_vectors_memory" : "0b",
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory" : "576.7kb",
      "norms_memory_in_bytes" : 590592,
      "points_memory" : "2.2gb",
      "points_memory_in_bytes" : 2380445350,
      "doc_values_memory" : "46mb",
      "doc_values_memory_in_bytes" : 48273952,
      "index_writer_memory" : "163.2mb",
      "index_writer_memory_in_bytes" : 171206564,
      "version_map_memory" : "2mb",
      "version_map_memory_in_bytes" : 2148176,
      "fixed_bit_set" : "0b",
      "fixed_bit_set_memory_in_bytes" : 0,
      "max_unsafe_auto_id_timestamp" : 1592863202743,
      "file_sizes" : { }
    }
  },
  "nodes" : {
    "count" : {
      "total" : 5,
      "data" : 5,
      "coordinating_only" : 0,
      "master" : 5,
      "ingest" : 5
    },
    "versions" : [
      "6.8.13"
    ],
    "os" : {
      "available_processors" : 108,
      "allocated_processors" : 108,
      "names" : [
        {
          "name" : "Linux",
          "count" : 5
        }
      ],
      "pretty_names" : [
        {
          "pretty_name" : "CentOS Linux 7 (Core)",
          "count" : 5
        }
      ],
      "mem" : {
        "total" : "312.2gb",
        "total_in_bytes" : 335281053696,
        "free" : "24.6gb",
        "free_in_bytes" : 26496303104,
        "used" : "287.5gb",
        "used_in_bytes" : 308784750592,
        "free_percent" : 8,
        "used_percent" : 92
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 77
      },
      "open_file_descriptors" : {
        "min" : 5048,
        "max" : 12589,
        "avg" : 7633
      }
    },
    "jvm" : {
      "max_uptime" : "3.7d",
      "max_uptime_in_millis" : 326154804,
      "versions" : [
        {
          "version" : "11.0.9",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "11.0.9+11-LTS",
          "vm_vendor" : "Red Hat, Inc.",
          "count" : 5
        }
      ],
      "mem" : {
        "heap_used" : "146.4gb",
        "heap_used_in_bytes" : 157256546952,
        "heap_max" : "155gb",
        "heap_max_in_bytes" : 166429982720
      },
      "threads" : 1100
    },
    "fs" : {
      "total" : "52.3tb",
      "total_in_bytes" : 57583371755520,
      "free" : "17.7tb",
      "free_in_bytes" : 19496562700288,
      "available" : "17.7tb",
      "available_in_bytes" : 19496562700288
    },
    "plugins" : [ ],
    "network_types" : {
      "transport_types" : {
        "security4" : 5
      },
      "http_types" : {
        "security4" : 5
      }
    }
  }
}

And a screenshot of Memory Heap :

What do your Elasticsearch logs show?

Verify that this is set low enough to ensure you are using compressed pointers. This should be printed in the logs on startup.

You are using a quite old version so do not benefit from the heap saving improvements that were added in Elasticsearch 7.7. I would recommend upgrading if you can.

It looks like you have almost 7TB of data per node. That is a lot of data if the system and mappings are not tuned well. I guess it is a bit late for optimizing mappings, but given the version you are on, this webinar will provide you with some good tips on how to reduce heap usage. The main way to do this is to forcemerge indices that are no longer written to down to a single segment. This can take a long time and be IO intensive, but it should help bring down the terms memory usage, which at 85.8GB is quite large.

Hello warkolm,

ATM I have not faced an error close enough to have logs about those failures. (They put me on this ES problem recently, and they are doing a rolling restart of the ES cluster once or twice a week to avoid this failure.)

Hi Christian,

This is the only logs I have regarding those parameters on my log when ES start/restart :

[2021-02-17T12:38:14,817][INFO ][o.e.e.NodeEnvironment    ] [log01] heap size [31gb], compressed ordinary object pointers [true]
[2021-02-17T12:38:17,790][INFO ][o.e.n.Node               ] [log01] node name [log01], node ID [3d750CvpRkaMfL7KfJD5kA]
[2021-02-17T12:38:17,791][INFO ][o.e.n.Node               ] [log01] version[6.8.13], pid[89975], build[default/rpm/be13c69/2020-10-16T09:09:46.555371Z], OS[Linux/3.10.0-1160.6.1.el7.x86_64/amd64], JVM[Red Hat, Inc./OpenJDK 64-Bit Server VM/11.0.9/11.0.9+11-LTS]
[2021-02-17T12:38:17,791][INFO ][o.e.n.Node               ] [log01] JVM arguments [-Dcom.sun.management.jmxremote, -Dcom.sun.management.jmxremote.port=9999, -Dcom.sun.management.jmxremote.authenticate=false, -Dcom.sun.management.jmxremote.ssl=false, -Dcom.sun.management.jmxremote.local.only=false, -Xms31g, -Xmx31g, -XX:NewRatio=2, -XX:ParallelGCThreads=20, -XX:+UseConcMarkSweepGC, -XX:CMSInitiatingOccupancyFraction=75, -XX:+UseCMSInitiatingOccupancyOnly, -XX:-UseConcMarkSweepGC, -XX:-UseCMSInitiatingOccupancyOnly, -XX:+UseG1GC, -XX:InitiatingHeapOccupancyPercent=75, -Des.networkaddress.cache.ttl=60, -Des.networkaddress.cache.negative.ttl=10, -XX:+AlwaysPreTouch, -Xss1m, -Djava.awt.headless=true, -Dfile.encoding=UTF-8, -Djna.nosys=true, -XX:-OmitStackTraceInFastThrow, -Dio.netty.noUnsafe=true, -Dio.netty.noKeySetOptimization=true, -Dio.netty.recycler.maxCapacityPerThread=0, -Dlog4j.shutdownHookEnabled=false, -Dlog4j2.disable.jmx=true, -Djava.io.tmpdir=/tmp/elasticsearch-8404039662045141775, -XX:HeapDumpPath=/var/lib/elasticsearch, -XX:ErrorFile=/var/log/elasticsearch/hs_err_pid%p.log, -Xlog:gc*,gc+age=trace,safepoint:file=/var/log/elasticsearch/gc.log:utctime,pid,tags:filecount=32,filesize=64m, -Djava.locale.providers=COMPAT, -XX:UseAVX=2, -Des.path.home=/usr/share/elasticsearch, -Des.path.conf=/etc/elasticsearch, -Des.distribution.flavor=default, -Des.distribution.type=rpm]

I know that this version is quite OLD but Update is not possible at this point since our graylog version is not compatible with ES 7.* (Graylog 3, and upgrade is planned but since Graylog 4 is not "free" anymore we must figure this out before upgrading).

I'll take a look at this webinar.

Thx both of you.

Regards,