ES node out of memory: Java heap space

I often see "out of memory: Java heap space" errors on some nodes in our ES cluster. I have already allocated half of each node's total memory to the heap, but I still get this error. Apart from the heap size issue, I never see a node under high CPU or memory load; on the contrary, CPU usage is lower than 10% and memory usage is lower than 70% most of the time. Is this normal? If not, what is the possible cause of running out of heap space?

A similar issue is happening to me too. I'm really hoping for some good answers.

It'd be helpful if you could please post the logs from the node that OOM'd as well as the output from the _cluster/stats?pretty&human API.

1 Like

Thank you for your reply.
Whenever the out-of-heap-space error occurs, it causes the ES service to stop and the cluster to collapse, so I have had to disable some of the log collection to stabilize the cluster.

The output of the command :

{
"_nodes": {
"total": 7,
"successful": 7,
"failed": 0
},
"cluster_name": "Cluster1",
"cluster_uuid": "xxxxxxxxxxxxxxxxxxxxxx",
"timestamp": 1612241232165,
"status": "yellow",
"indices": {
"count": 94,
"shards": {
"total": 349,
"primaries": 171,
"replication": 1.04093567251462,
"index": {
"shards": {
"min": 1,
"max": 9,
"avg": 3.7127659574468086
},
"primaries": {
"min": 1,
"max": 3,
"avg": 1.8191489361702127
},
"replication": {
"min": 0.0,
"max": 2.0,
"avg": 0.875886524822695
}
}
},
"docs": {
"count": 268302902,
"deleted": 4608438
},
"store": {
"size": "425.1gb",
"size_in_bytes": 456494734685
},
"fielddata": {
"memory_size": "167.8kb",
"memory_size_in_bytes": 171864,
"evictions": 0
},
"query_cache": {
"memory_size": "454.3mb",
"memory_size_in_bytes": 476429169,
"total_count": 2383939,
"hit_count": 194135,
"miss_count": 2189804,
"cache_size": 30555,
"cache_count": 74490,
"evictions": 43935
},
"completion": {
"size": "0b",
"size_in_bytes": 0
},
"segments": {
"count": 2555,
"memory": "40mb",
"memory_in_bytes": 41988432,
"terms_memory": "20.9mb",
"terms_memory_in_bytes": 21915712,
"stored_fields_memory": "3.7mb",
"stored_fields_memory_in_bytes": 3969480,
"term_vectors_memory": "0b",
"term_vectors_memory_in_bytes": 0,
"norms_memory": "2.3mb",
"norms_memory_in_bytes": 2467328,
"points_memory": "0b",
"points_memory_in_bytes": 0,
"doc_values_memory": "13mb",
"doc_values_memory_in_bytes": 13635912,
"index_writer_memory": "3.8gb",
"index_writer_memory_in_bytes": 4101395580,
"version_map_memory": "214mb",
"version_map_memory_in_bytes": 224405274,
"fixed_bit_set": "13.4mb",
"fixed_bit_set_memory_in_bytes": 14150592,
"max_unsafe_auto_id_timestamp": 1612240341731,
"file_sizes": {}
},
"mappings": {
"field_types": [
{
"name": "alias",
"count": 3,
"index_count": 1
},
{
"name": "binary",
"count": 3,
"index_count": 1
},
{
"name": "boolean",
"count": 150,
"index_count": 31
},
{
"name": "byte",
"count": 1,
"index_count": 1
},
{
"name": "date",
"count": 260,
"index_count": 61
},
{
"name": "double",
"count": 191,
"index_count": 22
},
{
"name": "flattened",
"count": 1,
"index_count": 1
},
{
"name": "float",
"count": 236,
"index_count": 22
},
{
"name": "geo_point",
"count": 7,
"index_count": 1
},
{
"name": "geo_shape",
"count": 1,
"index_count": 1
},
{
"name": "half_float",
"count": 77,
"index_count": 21
},
{
"name": "integer",
"count": 195,
"index_count": 16
},
{
"name": "ip",
"count": 21,
"index_count": 1
},
{
"name": "keyword",
"count": 2686,
"index_count": 62
},
{
"name": "long",
"count": 5265,
"index_count": 60
},
{
"name": "nested",
"count": 61,
"index_count": 20
},
{
"name": "object",
"count": 4651,
"index_count": 63
},
{
"name": "scaled_float",
"count": 112,
"index_count": 1
},
{
"name": "short",
"count": 21,
"index_count": 7
},
{
"name": "text",
"count": 482,
"index_count": 33
}
]
},
"analysis": {
"char_filter_types": [],
"tokenizer_types": [],
"filter_types": [],
"analyzer_types": [],
"built_in_char_filters": [],
"built_in_tokenizers": [],
"built_in_filters": [],
"built_in_analyzers": []
}
},
"nodes": {
"count": {
"total": 7,
"coordinating_only": 0,
"data": 6,
"ingest": 5,
"master": 4,
"ml": 4,
"remote_cluster_client": 4,
"transform": 4,
"voting_only": 0
},
"versions": [
"7.9.3",
"7.9.2",
"7.8.0",
"7.10.0",
"7.9.0"
],
"os": {
"available_processors": 56,
"allocated_processors": 56,
"names": [
{
"name": "Linux",
"count": 7
}
],
"pretty_names": [
{
"pretty_name": "Ubuntu 18.04.2 LTS",
"count": 4
},
{
"pretty_name": "Ubuntu 20.10",
"count": 1
},
{
"pretty_name": "Ubuntu 18.10",
"count": 1
},
{
"pretty_name": "Ubuntu 18.04.1 LTS",
"count": 1
}
],
"mem": {
"total": "234.9gb",
"total_in_bytes": 252248031232,
"free": "32.6gb",
"free_in_bytes": 35094163456,
"used": "202.2gb",
"used_in_bytes": 217153867776,
"free_percent": 14,
"used_percent": 86
}
},

   "process": {
        "cpu": {
            "percent": 101
        },
        "open_file_descriptors": {
            "min": 438,
            "max": 1202,
            "avg": 809
        }
    },
    "jvm": {
        "max_uptime": "21d",
        "max_uptime_in_millis": 1822181612,
        "versions": [
            {
                "version": "14.0.1",
                "vm_name": "OpenJDK 64-Bit Server VM",
                "vm_version": "14.0.1+7",
                "vm_vendor": "AdoptOpenJDK",
                "bundled_jdk": true,
                "using_bundled_jdk": true,
                "count": 3
            },
            {
                "version": "15",
                "vm_name": "OpenJDK 64-Bit Server VM",
                "vm_version": "15+36",
                "vm_vendor": "AdoptOpenJDK",
                "bundled_jdk": true,
                "using_bundled_jdk": true,
                "count": 1
            },
            {
                "version": "15",
                "vm_name": "OpenJDK 64-Bit Server VM",
                "vm_version": "15+36-1562",
                "vm_vendor": "Oracle Corporation",
                "bundled_jdk": true,
                "using_bundled_jdk": true,
                "count": 1
            },
            {
                "version": "15.0.1",
                "vm_name": "OpenJDK 64-Bit Server VM",
                "vm_version": "15.0.1+9",
                "vm_vendor": "AdoptOpenJDK",
                "bundled_jdk": true,
                "using_bundled_jdk": true,
                "count": 2
            }
        ],
        "mem": {
            "heap_used": "64.5gb",
            "heap_used_in_bytes": 69355816272,
            "heap_max": "116gb",
            "heap_max_in_bytes": 124554051584
        },
        "threads": 613
    },
    "fs": {
        "total": "6.1tb",
        "total_in_bytes": 6784419385344,
        "free": "3.9tb",
        "free_in_bytes": 4315239600128,
        "available": "3.6tb",
        "available_in_bytes": 4009719656448
    },
    "plugins": [],
    "network_types": {
        "transport_types": {
            "netty4": 7
        },
        "http_types": {
            "netty4": 7
        }
    },
    "discovery_types": {
        "zen": 7
    },
    "packaging_types": [
        {
            "flavor": "default",
            "type": "deb",
            "count": 7
        }
    ],
    "ingest": {
        "number_of_pipelines": 19,
        "processor_stats": {
            "conditional": {
                "count": 0,
                "failed": 0,
                "current": 0,
                "time": "0s",
                "time_in_millis": 0
            },
            "convert": {
                "count": 0,
                "failed": 0,
                "current": 0,
                "time": "0s",
                "time_in_millis": 0
            },
            "date": {
                "count": 0,
                "failed": 0,
                "current": 0,
                "time": "0s",
                "time_in_millis": 0
            },
            "dot_expander": {
                "count": 0,
                "failed": 0,
                "current": 0,
                "time": "0s",
                "time_in_millis": 0
            },
            "geoip": {
                "count": 0,
                "failed": 0,
                "current": 0,
                "time": "0s",
                "time_in_millis": 0
            },
            "grok": {
                "count": 0,
                "failed": 0,
                "current": 0,
                "time": "0s",
                "time_in_millis": 0
            },
            "gsub": {
                "count": 0,
                "failed": 0,
                "current": 0,
                "time": "0s",
                "time_in_millis": 0
            },
            "json": {
                "count": 0,
                "failed": 0,
                "current": 0,
                "time": "0s",
                "time_in_millis": 0
            },
            "remove": {
                "count": 0,
                "failed": 0,
                "current": 0,
                "time": "0s",
                "time_in_millis": 0
            },
            "rename": {
                "count": 0,
                "failed": 0,
                "current": 0,
                "time": "0s",
                "time_in_millis": 0
            },
            "script": {
                "count": 0,
                "failed": 0,
                "current": 0,
                "time": "0s",
                "time_in_millis": 0
            },
            "split": {
                "count": 0,
                "failed": 0,
                "current": 0,
                "time": "0s",
                "time_in_millis": 0
            }
        }
    }
}

}

This is potentially a big problem. All nodes in the cluster should always run exactly the same version unless you are in the middle of an upgrade. Shards located on newer nodes generally cannot be allocated to older nodes, so you are likely to see imbalances until this is fixed.

2 Likes

I cannot reiterate Christian's comments strongly enough here.

1 Like

Thank you for all of your replies. One more question: we are using Logstash to process logs and push them to the ES cluster. In the cluster there are digest nodes, data nodes and master nodes. Should I configure Logstash to push the logs to all nodes regardless of their type, or only to the digest nodes?

Ingest nodes?
For your cluster size, and having 5 of 7 nodes as ingest, I would send to those.

OK... some nodes have more than one role. For better performance, I should dedicate each node to a single role only, right?

I am not sure I understand how you have assigned roles in your cluster. Could you please elaborate?

There is a range of node types in ES, if I understand correctly.

At the moment, some nodes in my cluster are master, data and ingest nodes at the same time. I am wondering whether I should configure each node for a single role only, even though I have just 7 nodes.

Do all nodes have the same hardware specification?

No, some of them are i7 with 32GB RAM while others are i5 with 16GB RAM.

How many do you have of each? How much storage do they have? What type of storage are they using?

Elasticsearch by default assumes all nodes are equal so unless you tier your nodes you may end up with imbalances.

Here are the details of those nodes:
ES1: i7 32GB
ES2: i5 48GB
ES3: i7 32GB
ES4: i5 16GB
ES5: i9 48GB
ES6: i5 32GB
ES7: i5 32GB

All of them have at least 1TB SSD storage.
I have assigned half of the memory of each machine to the Java heap.