Running out of heap memory in Elasticsearch

We are using a single Elasticsearch node with 24 GB of heap memory. The rollover size is 75 GB with 2 shards per index, so a single shard is roughly 37.5 GB. We need to archive 8 TB of data, of which 2 TB has been indexed into 37 indices and 74 shards. _cat/nodes now reports 90% heap in use.
Any suggestions on how to reduce heap usage? Is there a limit on the amount of data that can be indexed on a single Elasticsearch node?
Regards,
Yasir

What version are you running?

v 7.9.1

What is the full output of the cluster stats API?

{
  "_nodes" : {
    "total" : 1,
    "successful" : 1,
    "failed" : 0
  },
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "ujl4Pr6vRLGuGd-J-Uytog",
  "timestamp" : 1605017517580,
  "status" : "yellow",
  "indices" : {
    "count" : 79,
    "shards" : {
      "total" : 118,
      "primaries" : 118,
      "replication" : 0.0,
      "index" : {
        "shards" : {
          "min" : 1,
          "max" : 2,
          "avg" : 1.4936708860759493
        },
        "primaries" : {
          "min" : 1,
          "max" : 2,
          "avg" : 1.4936708860759493
        },
        "replication" : {
          "min" : 0.0,
          "max" : 0.0,
          "avg" : 0.0
        }
      }
    },
    "docs" : {
      "count" : 35471693,
      "deleted" : 233720
    },
    "store" : {
      "size_in_bytes" : 2929111831895,
      "reserved_in_bytes" : 0
    },
    "fielddata" : {
      "memory_size_in_bytes" : 0,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size_in_bytes" : 0,
      "total_count" : 0,
      "hit_count" : 0,
      "miss_count" : 0,
      "cache_size" : 0,
      "cache_count" : 0,
      "evictions" : 0
    },
    "completion" : {
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 2749,
      "memory_in_bytes" : 1605478624,
      "terms_memory_in_bytes" : 1317449376,
      "stored_fields_memory_in_bytes" : 2497816,
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory_in_bytes" : 193225280,
      "points_memory_in_bytes" : 0,
      "doc_values_memory_in_bytes" : 92306152,
      "index_writer_memory_in_bytes" : 1285255556,
      "version_map_memory_in_bytes" : 778034,
      "fixed_bit_set_memory_in_bytes" : 1568,
      "max_unsafe_auto_id_timestamp" : 1604355408208,
      "file_sizes" : { }
    },
    "mappings" : {
      "field_types" : [
        {
          "name" : "binary",
          "count" : 25,
          "index_count" : 4
        },
        {
          "name" : "boolean",
          "count" : 651,
          "index_count" : 71
        },
        {
          "name" : "byte",
          "count" : 22,
          "index_count" : 22
        },
        {
          "name" : "date",
          "count" : 2794,
          "index_count" : 76
        },
        {
          "name" : "flattened",
          "count" : 19,
          "index_count" : 3
        },
        {
          "name" : "float",
          "count" : 131,
          "index_count" : 23
        },
        {
          "name" : "geo_point",
          "count" : 154,
          "index_count" : 22
        },
        {
          "name" : "geo_shape",
          "count" : 2,
          "index_count" : 2
        },
        {
          "name" : "histogram",
          "count" : 22,
          "index_count" : 22
        },
        {
          "name" : "integer",
          "count" : 82,
          "index_count" : 7
        },
        {
          "name" : "ip",
          "count" : 286,
          "index_count" : 22
        },
        {
          "name" : "keyword",
          "count" : 313823,
          "index_count" : 78
        },
        {
          "name" : "long",
          "count" : 2081,
          "index_count" : 72
        },
        {
          "name" : "nested",
          "count" : 32,
          "index_count" : 8
        },
        {
          "name" : "object",
          "count" : 11810,
          "index_count" : 76
        },
        {
          "name" : "scaled_float",
          "count" : 45,
          "index_count" : 23
        },
        {
          "name" : "short",
          "count" : 1,
          "index_count" : 1
        },
        {
          "name" : "text",
          "count" : 303634,
          "index_count" : 77
        }
      ]
    },
    "analysis" : {
      "char_filter_types" : [ ],
      "tokenizer_types" : [ ],
      "filter_types" : [
        {
          "name" : "pattern_capture",
          "count" : 1,
          "index_count" : 1
        },
        {
          "name" : "stemmer",
          "count" : 39,
          "index_count" : 39
        }
      ],
      "analyzer_types" : [
        {
          "name" : "custom",
          "count" : 118,
          "index_count" : 40
        }
      ],
      "built_in_char_filters" : [
        {
          "name" : "html_strip",
          "count" : 39,
          "index_count" : 39
        }
      ],
      "built_in_tokenizers" : [
        {
          "name" : "standard",
          "count" : 78,
          "index_count" : 39
        },
        {
          "name" : "uax_url_email",
          "count" : 40,
          "index_count" : 40
        }
      ],
      "built_in_filters" : [
        {
          "name" : "lowercase",
          "count" : 118,
          "index_count" : 40
        },
        {
          "name" : "unique",
          "count" : 1,
          "index_count" : 1
        }
      ],
      "built_in_analyzers" : [ ]
    }
  },
  "nodes" : {
    "count" : {
      "total" : 1,
      "coordinating_only" : 0,
      "data" : 1,
      "ingest" : 1,
      "master" : 1,
      "ml" : 1,
      "remote_cluster_client" : 1,
      "transform" : 1,
      "voting_only" : 0
    },
    "versions" : [
      "7.9.1"
    ],
    "os" : {
      "available_processors" : 8,
      "allocated_processors" : 8,
      "names" : [
        {
          "name" : "Windows Server 2019",
          "count" : 1
        }
      ],
      "pretty_names" : [
        {
          "pretty_name" : "Windows Server 2019",
          "count" : 1
        }
      ],
      "mem" : {
        "total_in_bytes" : 51538526208,
        "free_in_bytes" : 17143250944,
        "used_in_bytes" : 34395275264,
        "free_percent" : 33,
        "used_percent" : 67
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 29
      },
      "open_file_descriptors" : {
        "min" : -1,
        "max" : -1,
        "avg" : 0
      }
    },
    "jvm" : {
      "max_uptime_in_millis" : 20958392,
      "versions" : [
        {
          "version" : "14.0.1",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "14.0.1+7",
          "vm_vendor" : "AdoptOpenJDK",
          "bundled_jdk" : true,
          "using_bundled_jdk" : true,
          "count" : 1
        }
      ],
      "mem" : {
        "heap_used_in_bytes" : 20218729480,
        "heap_max_in_bytes" : 21474836480
      },
      "threads" : 194
    },
    "fs" : {
      "total_in_bytes" : 6597013139456,
      "free_in_bytes" : 3664968577024,
      "available_in_bytes" : 3664968577024
    },
    "plugins" : [ ],
    "network_types" : {
      "transport_types" : {
        "security4" : 1
      },
      "http_types" : {
        "security4" : 1
      }
    },
    "discovery_types" : {
      "zen" : 1
    },
    "packaging_types" : [
      {
        "flavor" : "default",
        "type" : "zip",
        "count" : 1
      }
    ],
    "ingest" : {
      "number_of_pipelines" : 7,
      "processor_stats" : {
        "conditional" : {
          "count" : 24167,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 3
        },
        "geoip" : {
          "count" : 24167,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 178
        },
        "gsub" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 0
        },
        "pipeline" : {
          "count" : 96668,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 207
        },
        "script" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 0
        },
        "unknown" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 0
        },
        "user_agent" : {
          "count" : 24167,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 7
        }
      }
    }
  }
}


Based on that it looks like the heap is currently only 76% full. Are you monitoring heap usage so you can show how it varies over time?

Do you have time-based indices where older indices basically become read-only? If so, you may be able to reduce heap usage by force-merging these down to a single segment per shard.

1 Like

Thank you for responding.
I had to stop the indexing process because Elasticsearch was throwing circuit breaker exceptions.
Secondly, as I mentioned, I need to index 8 TB of data, of which 2 TB has been indexed so far, and heap usage has already reached 87 percent, so I'm concerned about whether all 8 TB can be indexed.
This is the result of the cluster stats API (_cluster/stats) while indexing is running.

{
  "_nodes" : {
    "total" : 1,
    "successful" : 1,
    "failed" : 0
  },
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "ujl4Pr6vRLGuGd-J-Uytog",
  "timestamp" : 1605016732338,
  "status" : "yellow",
  "indices" : {
    "count" : 79,
    "shards" : {
      "total" : 118,
      "primaries" : 118,
      "replication" : 0.0,
      "index" : {
        "shards" : {
          "min" : 1,
          "max" : 2,
          "avg" : 1.4936708860759493
        },
        "primaries" : {
          "min" : 1,
          "max" : 2,
          "avg" : 1.4936708860759493
        },
        "replication" : {
          "min" : 0.0,
          "max" : 0.0,
          "avg" : 0.0
        }
      }
    },
    "docs" : {
      "count" : 35462448,
      "deleted" : 234215
    },
    "store" : {
      "size_in_bytes" : 2927978413605,
      "reserved_in_bytes" : 0
    },
    "fielddata" : {
      "memory_size_in_bytes" : 0,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size_in_bytes" : 0,
      "total_count" : 0,
      "hit_count" : 0,
      "miss_count" : 0,
      "cache_size" : 0,
      "cache_count" : 0,
      "evictions" : 0
    },
    "completion" : {
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 2752,
      "memory_in_bytes" : 1603695164,
      "terms_memory_in_bytes" : 1316032320,
      "stored_fields_memory_in_bytes" : 2499296,
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory_in_bytes" : 193009728,
      "points_memory_in_bytes" : 0,
      "doc_values_memory_in_bytes" : 92153820,
      "index_writer_memory_in_bytes" : 280591928,
      "version_map_memory_in_bytes" : 124873,
      "fixed_bit_set_memory_in_bytes" : 1528,
      "max_unsafe_auto_id_timestamp" : 1604355408208,
      "file_sizes" : { }
    },
    "mappings" : {
      "field_types" : [
        {
          "name" : "binary",
          "count" : 25,
          "index_count" : 4
        },
        {
          "name" : "boolean",
          "count" : 651,
          "index_count" : 71
        },
        {
          "name" : "byte",
          "count" : 22,
          "index_count" : 22
        },
        {
          "name" : "date",
          "count" : 2794,
          "index_count" : 76
        },
        {
          "name" : "flattened",
          "count" : 19,
          "index_count" : 3
        },
        {
          "name" : "float",
          "count" : 131,
          "index_count" : 23
        },
        {
          "name" : "geo_point",
          "count" : 154,
          "index_count" : 22
        },
        {
          "name" : "geo_shape",
          "count" : 2,
          "index_count" : 2
        },
        {
          "name" : "histogram",
          "count" : 22,
          "index_count" : 22
        },
        {
          "name" : "integer",
          "count" : 82,
          "index_count" : 7
        },
        {
          "name" : "ip",
          "count" : 286,
          "index_count" : 22
        },
        {
          "name" : "keyword",
          "count" : 313423,
          "index_count" : 78
        },
        {
          "name" : "long",
          "count" : 2081,
          "index_count" : 72
        },
        {
          "name" : "nested",
          "count" : 32,
          "index_count" : 8
        },
        {
          "name" : "object",
          "count" : 11809,
          "index_count" : 76
        },
        {
          "name" : "scaled_float",
          "count" : 45,
          "index_count" : 23
        },
        {
          "name" : "short",
          "count" : 1,
          "index_count" : 1
        },
        {
          "name" : "text",
          "count" : 303234,
          "index_count" : 77
        }
      ]
    },
    "analysis" : {
      "char_filter_types" : [ ],
      "tokenizer_types" : [ ],
      "filter_types" : [
        {
          "name" : "pattern_capture",
          "count" : 1,
          "index_count" : 1
        },
        {
          "name" : "stemmer",
          "count" : 39,
          "index_count" : 39
        }
      ],
      "analyzer_types" : [
        {
          "name" : "custom",
          "count" : 118,
          "index_count" : 40
        }
      ],
      "built_in_char_filters" : [
        {
          "name" : "html_strip",
          "count" : 39,
          "index_count" : 39
        }
      ],
      "built_in_tokenizers" : [
        {
          "name" : "standard",
          "count" : 78,
          "index_count" : 39
        },
        {
          "name" : "uax_url_email",
          "count" : 40,
          "index_count" : 40
        }
      ],
      "built_in_filters" : [
        {
          "name" : "lowercase",
          "count" : 118,
          "index_count" : 40
        },
        {
          "name" : "unique",
          "count" : 1,
          "index_count" : 1
        }
      ],
      "built_in_analyzers" : [ ]
    }
  },
  "nodes" : {
    "count" : {
      "total" : 1,
      "coordinating_only" : 0,
      "data" : 1,
      "ingest" : 1,
      "master" : 1,
      "ml" : 1,
      "remote_cluster_client" : 1,
      "transform" : 1,
      "voting_only" : 0
    },
    "versions" : [
      "7.9.1"
    ],
    "os" : {
      "available_processors" : 8,
      "allocated_processors" : 8,
      "names" : [
        {
          "name" : "Windows Server 2019",
          "count" : 1
        }
      ],
      "pretty_names" : [
        {
          "pretty_name" : "Windows Server 2019",
          "count" : 1
        }
      ],
      "mem" : {
        "total_in_bytes" : 51538526208,
        "free_in_bytes" : 17155682304,
        "used_in_bytes" : 34382843904,
        "free_percent" : 33,
        "used_percent" : 67
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 11
      },
      "open_file_descriptors" : {
        "min" : -1,
        "max" : -1,
        "avg" : 0
      }
    },
    "jvm" : {
      "max_uptime_in_millis" : 20173111,
      "versions" : [
        {
          "version" : "14.0.1",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "14.0.1+7",
          "vm_vendor" : "AdoptOpenJDK",
          "bundled_jdk" : true,
          "using_bundled_jdk" : true,
          "count" : 1
        }
      ],
      "mem" : {
        "heap_used_in_bytes" : 18695740656,
        "heap_max_in_bytes" : 21474836480
      },
      "threads" : 193
    },
    "fs" : {
      "total_in_bytes" : 6597013139456,
      "free_in_bytes" : 3666669338624,
      "available_in_bytes" : 3666669338624
    },
    "plugins" : [ ],
    "network_types" : {
      "transport_types" : {
        "security4" : 1
      },
      "http_types" : {
        "security4" : 1
      }
    },
    "discovery_types" : {
      "zen" : 1
    },
    "packaging_types" : [
      {
        "flavor" : "default",
        "type" : "zip",
        "count" : 1
      }
    ],
    "ingest" : {
      "number_of_pipelines" : 7,
      "processor_stats" : {
        "conditional" : {
          "count" : 23268,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 3
        },
        "geoip" : {
          "count" : 23268,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 178
        },
        "gsub" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 0
        },
        "pipeline" : {
          "count" : 93072,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 207
        },
        "script" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 0
        },
        "unknown" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 0
        },
        "user_agent" : {
          "count" : 23268,
          "failed" : 0,
          "current" : 0,
          "time_in_millis" : 7
        }
      }
    }
  }
}

This is the response of the _cat/nodes API:

ip        heap.percent ram.percent cpu load_1m load_5m load_15m node.role master name
127.0.0.1           89          67  34                          dilmrt    *      ---

It looks like your documents are quite large. Can you describe the data and how you are indexing it, e.g. bulk size, in a bit more detail?

We are indexing .eml files (emails). The bulk size is set to 150 documents, with 4 processes sending the data independently. After heap usage reached 80%, we had to reduce the bulk size from 250 to 150 documents while keeping the number of processes unchanged.
We don't have any fields with fielddata set to true.

Thanks!

Are you sending the .eml file as BASE64-encoded content to Elasticsearch, or are you first extracting the text and "just" indexing the text?

1 Like

I am parsing the .eml files, converting them to JSON, and then sending the data to Elasticsearch.
So everything present in the email, including the headers, message body, and attachments, is getting indexed into Elasticsearch.

How do you index the attachment? Is it only stored?

No, we are parsing the attachments using tika and storing the text data within attachments.

Great. So I believe you are all good with your implementation.
Indeed the text content per document might be too much.

If you are using the Java BulkProcessor class, maybe you should consider flushing the bulk based not only on the number of documents but also on the size of the bulk request in MB, and maybe flush every 20 MB or so?

Thanks for responding.
I will try the Java BulkProcessor class and see if that solves the circuit breaker exception.
In the meantime, are there any suggestions for lowering heap utilization besides using force merge as Christian suggested? Do I need to increase the heap size in order to index the entire 8 TB of data? So far only 2 TB has been indexed, and the allocated 20 GB heap is approximately 80 percent used.

Thanks,

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.