Memory problems / Circuit breaking exception with small data

I am experiencing an odd situation. I have a 3-node cluster (two data nodes, one master node) with the following resources:

  • node1: 24GB RAM, 12 CPUs
  • node2: 16GB RAM, 8 CPUs
  • master: 8GB RAM, 4 CPUs

I am ingesting firewall logs with Logstash.
I use only two indices (which roll over after 40GB or 30 days), each with one primary shard and one replica.
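
For reference, the rollover is driven by an ILM policy along these lines (a sketch; the policy name firewall-logs is illustrative, the thresholds match the ones above):

  PUT _ilm/policy/firewall-logs
  {
    "policy": {
      "phases": {
        "hot": {
          "actions": {
            "rollover": {
              "max_size": "40gb",
              "max_age": "30d"
            }
          }
        }
      }
    }
  }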

I've left the JVM settings to be determined automatically (the recommended approach), but I get these two errors continually:

In ES:

  • java.lang.OutOfMemoryError: Java heap space

In Logstash:

  • A circuit_breaking_exception when trying to index a bulk request, because a (ridiculously low) limit was reached:

[2021-04-14T10:14:46,467][INFO ][logstash.outputs.elasticsearch][output][1d65d5c564b6fb6eb72558301103541954c97a03464eeb468a96d235eb224a66] retrying failed action with response code: 429 ({"type"=>"circuit_breaking_exception", "reason"=>"[parent] Data too large, data for [indices:data/write/bulk[s]] would be [1038945886/990.8mb], which is larger than the limit of [1020054732/972.7mb], real usage: [1038874592/990.7mb], new bytes reserved: [71294/69.6kb], usages [request=32880/32.1kb, fielddata=0/0b, in_flight_requests=45567374/43.4mb, model_inference=0/0b, accounting=728556/711.4kb]", "bytes_wanted"=>1038945886, "bytes_limit"=>1020054732, "durability"=>"TRANSIENT"})
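
Doing the math on that error: the limit of 1020054732 bytes is exactly 95% of a 1GB heap (1073741824 × 0.95), and 95% is the default parent breaker limit (indices.breaker.total.limit). So this node seems to be running with only a 1GB heap despite the automatic sizing. If that turns out to be the cause, the heap can be pinned explicitly with a drop-in file, for example (a sketch; the file name and the 4g value are assumptions, to be sized to each node's RAM, at most about half of it):

  # /etc/elasticsearch/jvm.options.d/heap.options
  # Pin min and max heap to the same value so the JVM never resizes it.
  -Xms4g
  -Xmx4g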

In another installation I am running a smaller cluster with fewer resources and more logs to ingest, and I have never experienced a problem like this one.

I have also observed that at the moment of the crash the heap size was 2GB.
Any help would be appreciated.

Can you please post your Elasticsearch log, as well as the output from the _cluster/stats?pretty&human API?
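
For example, with curl (assuming the default port of 9200; add credentials and https if security is enabled on the HTTP layer):

  curl -s 'http://localhost:9200/_cluster/stats?pretty&human'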

Here it is. Thank you!

{
  "_nodes" : {
    "total" : 3,
    "successful" : 3,
    "failed" : 0
  },
  "cluster_name" : "cimera",
  "cluster_uuid" : "Kbr9-jVBSdKLtWYsOHG90g",
  "timestamp" : 1618389462209,
  "status" : "green",
  "indices" : {
    "count" : 18,
    "shards" : {
      "total" : 36,
      "primaries" : 18,
      "replication" : 1.0,
      "index" : {
        "shards" : {
          "min" : 2,
          "max" : 2,
          "avg" : 2.0
        },
        "primaries" : {
          "min" : 1,
          "max" : 1,
          "avg" : 1.0
        },
        "replication" : {
          "min" : 1.0,
          "max" : 1.0,
          "avg" : 1.0
        }
      }
    },
    "docs" : {
      "count" : 40073,
      "deleted" : 5582
    },
    "store" : {
      "size" : "242.7mb",
      "size_in_bytes" : 254587538,
      "reserved" : "0b",
      "reserved_in_bytes" : 0
    },
    "fielddata" : {
      "memory_size" : "0b",
      "memory_size_in_bytes" : 0,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size" : "54.5kb",
      "memory_size_in_bytes" : 55816,
      "total_count" : 2760,
      "hit_count" : 720,
      "miss_count" : 2040,
      "cache_size" : 28,
      "cache_count" : 28,
      "evictions" : 0
    },
    "completion" : {
      "size" : "0b",
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 134,
      "memory" : "1.5mb",
      "memory_in_bytes" : 1629992,
      "terms_memory" : "999.7kb",
      "terms_memory_in_bytes" : 1023744,
      "stored_fields_memory" : "64.9kb",
      "stored_fields_memory_in_bytes" : 66480,
      "term_vectors_memory" : "0b",
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory" : "32.1kb",
      "norms_memory_in_bytes" : 32960,
      "points_memory" : "0b",
      "points_memory_in_bytes" : 0,
      "doc_values_memory" : "494.9kb",
      "doc_values_memory_in_bytes" : 506808,
      "index_writer_memory" : "8.8mb",
      "index_writer_memory_in_bytes" : 9276064,
      "version_map_memory" : "2.5kb",
      "version_map_memory_in_bytes" : 2564,
      "fixed_bit_set" : "8.6kb",
      "fixed_bit_set_memory_in_bytes" : 8832,
      "max_unsafe_auto_id_timestamp" : 1618388644413,
      "file_sizes" : { }
    },
    "mappings" : {
      "field_types" : [
        {
          "name" : "boolean",
          "count" : 19,
          "index_count" : 5
        },
        {
          "name" : "date",
          "count" : 38,
          "index_count" : 7
        },
        {
          "name" : "double",
          "count" : 10,
          "index_count" : 2
        },
        {
          "name" : "float",
          "count" : 12,
          "index_count" : 2
        },
        {
          "name" : "geo_point",
          "count" : 12,
          "index_count" : 2
        },
        {
          "name" : "half_float",
          "count" : 5,
          "index_count" : 1
        },
        {
          "name" : "integer",
          "count" : 25,
          "index_count" : 4
        },
        {
          "name" : "ip",
          "count" : 24,
          "index_count" : 2
        },
        {
          "name" : "keyword",
          "count" : 851,
          "index_count" : 7
        },
        {
          "name" : "long",
          "count" : 243,
          "index_count" : 6
        },
        {
          "name" : "nested",
          "count" : 9,
          "index_count" : 3
        },
        {
          "name" : "object",
          "count" : 318,
          "index_count" : 7
        },
        {
          "name" : "short",
          "count" : 7,
          "index_count" : 3
        },
        {
          "name" : "text",
          "count" : 111,
          "index_count" : 7
        }
      ]
    },
    "analysis" : {
      "char_filter_types" : [ ],
      "tokenizer_types" : [ ],
      "filter_types" : [ ],
      "analyzer_types" : [ ],
      "built_in_char_filters" : [ ],
      "built_in_tokenizers" : [ ],
      "built_in_filters" : [ ],
      "built_in_analyzers" : [ ]
    },
    "versions" : [
      {
        "version" : "7.12.0",
        "index_count" : 18,
        "primary_shard_count" : 18,
        "total_primary_size" : "120.5mb",
        "total_primary_bytes" : 126453751
      }
    ]
  },
  "nodes" : {
    "count" : {
      "total" : 3,
      "coordinating_only" : 0,
      "data" : 2,
      "data_cold" : 2,
      "data_content" : 2,
      "data_frozen" : 2,
      "data_hot" : 2,
      "data_warm" : 2,
      "ingest" : 2,
      "master" : 3,
      "ml" : 3,
      "remote_cluster_client" : 3,
      "transform" : 2,
      "voting_only" : 0
    },
    "versions" : [
      "7.12.0"
    ],
    "os" : {
      "available_processors" : 24,
      "allocated_processors" : 24,
      "names" : [
        {
          "name" : "Linux",
          "count" : 3
        }
      ],
      "pretty_names" : [
        {
          "pretty_name" : "Red Hat Cloud Infrastructure",
          "count" : 3
        }
      ],
      "architectures" : [
        {
          "arch" : "amd64",
          "count" : 3
        }
      ],
      "mem" : {
        "total" : "46.5gb",
        "total_in_bytes" : 49967927296,
        "free" : "23gb",
        "free_in_bytes" : 24699981824,
        "used" : "23.5gb",
        "used_in_bytes" : 25267945472,
        "free_percent" : 49,
        "used_percent" : 51
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 11
      },
      "open_file_descriptors" : {
        "min" : 373,
        "max" : 734,
        "avg" : 532
      }
    },
    "jvm" : {
      "max_uptime" : "28m",
      "max_uptime_in_millis" : 1680933,
      "versions" : [
        {
          "version" : "15.0.1",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "15.0.1+9",
          "vm_vendor" : "AdoptOpenJDK",
          "bundled_jdk" : true,
          "using_bundled_jdk" : true,
          "count" : 3
        }
      ],
      "mem" : {
        "heap_used" : "2.1gb",
        "heap_used_in_bytes" : 2354161944,
        "heap_max" : "3gb",
        "heap_max_in_bytes" : 3221225472
      },
      "threads" : 222
    },
    "fs" : {
      "total" : "1tb",
      "total_in_bytes" : 1162163445760,
      "free" : "1tb",
      "free_in_bytes" : 1139122548736,
      "available" : "1005.8gb",
      "available_in_bytes" : 1080049152000
    },
    "plugins" : [ ],
    "network_types" : {
      "transport_types" : {
        "security4" : 3
      },
      "http_types" : {
        "security4" : 3
      }
    },
    "discovery_types" : {
      "zen" : 3
    },
    "packaging_types" : [
      {
        "flavor" : "default",
        "type" : "rpm",
        "count" : 3
      }
    ],
    "ingest" : {
      "number_of_pipelines" : 2,
      "processor_stats" : {
        "gsub" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "script" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        }
      }
    }
  }
}
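
One thing that stands out in those stats: jvm.mem.heap_max is only 3gb across all three nodes, i.e. roughly 1GB per node, even though the machines have 46.5gb of RAM between them. The per-node heap can be confirmed with something like:

  curl -s 'http://localhost:9200/_cat/nodes?v&h=name,heap.current,heap.percent,heap.max'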

This too, please. It'd also be good to see the error you are referring to in the logs.

[2021-04-13T20:27:44,268][INFO ][o.e.i.b.HierarchyCircuitBreakerService] [node1] GC did bring memory usage down, before [1022944040], after [855367976], allocations [1], duration [60]
[2021-04-13T20:27:44,934][INFO ][o.e.m.j.JvmGcMonitorService] [node1] [gc][1238858] overhead, spent [449ms] collecting in the last [1s]
[2021-04-13T20:27:55,636][INFO ][o.e.i.b.HierarchyCircuitBreakerService] [node1] attempting to trigger G1GC due to high heap usage [1028402960]
[2021-04-13T20:27:55,741][INFO ][o.e.i.b.HierarchyCircuitBreakerService] [node1] GC did bring memory usage down, before [1028402960], after [943331088], allocations [8], duration [105]
[2021-04-13T20:27:56,074][INFO ][o.e.m.j.JvmGcMonitorService] [node1] [gc][1238869] overhead, spent [291ms] collecting in the last [1.1s]
[2021-04-13T20:27:57,141][WARN ][o.e.m.j.JvmGcMonitorService] [node1] [gc][1238870] overhead, spent [588ms] collecting in the last [1s]
[2021-04-13T20:27:58,142][INFO ][o.e.m.j.JvmGcMonitorService] [node1] [gc][1238871] overhead, spent [323ms] collecting in the last [1s]
[2021-04-13T20:28:08,204][INFO ][o.e.i.b.HierarchyCircuitBreakerService] [node1] attempting to trigger G1GC due to high heap usage [1022373544]
[2021-04-13T20:28:08,245][INFO ][o.e.i.b.HierarchyCircuitBreakerService] [node1] GC did bring memory usage down, before [1022373544], after [1018856104], allocations [8], duration [42]
[2021-04-13T20:28:12,074][ERROR][o.e.ExceptionsHelper     ] [node1] fatal error
        at org.elasticsearch.ExceptionsHelper.lambda$maybeDieOnAnotherThread$4(ExceptionsHelper.java:283)
        at java.base/java.util.Optional.ifPresent(Optional.java:176)
        at org.elasticsearch.ExceptionsHelper.maybeDieOnAnotherThread(ExceptionsHelper.java:273)
        at org.elasticsearch.transport.netty4.Netty4MessageChannelHandler.exceptionCaught(Netty4MessageChannelHandler.java:70)
        at io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:302)
        at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:381)
        at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
        at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
        at io.netty.handler.logging.LoggingHandler.channelRead(LoggingHandler.java:271)
        at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
        at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
        at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
        at io.netty.handler.ssl.SslHandler.unwrap(SslHandler.java:1518)
        at io.netty.handler.ssl.SslHandler.decodeJdkCompatible(SslHandler.java:1267)
        at io.netty.handler.ssl.SslHandler.decode(SslHandler.java:1314)
        at io.netty.handler.codec.ByteToMessageDecoder.decodeRemovalReentryProtection(ByteToMessageDecoder.java:501)
        at io.netty.handler.codec.ByteToMessageDecoder.callDecode(ByteToMessageDecoder.java:440)
        at io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:276)
        at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
        at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
        at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
        at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1410)
        at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
        at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
        at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:919)
        at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:163)
        at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:714)
        at io.netty.channel.nio.NioEventLoop.processSelectedKeysPlain(NioEventLoop.java:615)
        at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:578)
        at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:493)
        at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:989)
        at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
        at java.base/java.lang.Thread.run(Thread.java:832)
[2021-04-13T20:28:12,090][ERROR][o.e.b.ElasticsearchUncaughtExceptionHandler] [node1] fatal error in thread [Thread-28578], exiting
java.lang.OutOfMemoryError: Java heap space
        at java.util.Base64$Decoder.decode(Base64.java:566) ~[?:?]
        at java.util.Base64$Decoder.decode(Base64.java:590) ~[?:?]
        at org.elasticsearch.xpack.core.async.AsyncTaskIndexService.decodeResponse(AsyncTaskIndexService.java:496) ~[?:?]
        at org.elasticsearch.xpack.core.async.AsyncTaskIndexService.lambda$getStatusResponseFromIndex$8(AsyncTaskIndexService.java:415) ~[?:?]
        at org.elasticsearch.xpack.core.async.AsyncTaskIndexService$$Lambda$7154/0x0000000801b24530.accept(Unknown Source) ~[?:?]
        at org.elasticsearch.action.ActionListener$1.onResponse(ActionListener.java:117) ~[elasticsearch-7.12.0.jar:7.12.0]
        at org.elasticsearch.action.support.ContextPreservingActionListener.onResponse(ContextPreservingActionListener.java:32) ~[elasticsearch-7.12.0.jar:7.12.0]
        at org.elasticsearch.action.support.TransportAction$1.onResponse(TransportAction.java:83) ~[elasticsearch-7.12.0.jar:7.12.0]
        at org.elasticsearch.action.support.TransportAction$1.onResponse(TransportAction.java:77) ~[elasticsearch-7.12.0.jar:7.12.0]
        at org.elasticsearch.action.support.ContextPreservingActionListener.onResponse(ContextPreservingActionListener.java:32) ~[elasticsearch-7.12.0.jar:7.12.0]
        at org.elasticsearch.xpack.security.action.filter.SecurityActionFilter.lambda$applyInternal$2(SecurityActionFilter.java:165) ~[?:?]
        at org.elasticsearch.xpack.security.action.filter.SecurityActionFilter$$Lambda$5843/0x0000000801897020.accept(Unknown Source) ~[?:?]
        at org.elasticsearch.action.ActionListener$3.onResponse(ActionListener.java:167) ~[elasticsearch-7.12.0.jar:7.12.0]
        at org.elasticsearch.action.support.single.shard.TransportSingleShardAction$AsyncSingleAction$2.handleResponse(TransportSingleShardAction.java:240) ~[elasticsearch-7.12.0.jar:7.12.0]
        at org.elasticsearch.action.support.single.shard.TransportSingleShardAction$AsyncSingleAction$2.handleResponse(TransportSingleShardAction.java:231) ~[elasticsearch-7.12.0.jar:7.12.0]
        at org.elasticsearch.transport.TransportService$ContextRestoreResponseHandler.handleResponse(TransportService.java:1280) ~[elasticsearch-7.12.0.jar:7.12.0]
        at org.elasticsearch.transport.InboundHandler.doHandleResponse(InboundHandler.java:291) ~[elasticsearch-7.12.0.jar:7.12.0]
        at org.elasticsearch.transport.InboundHandler.handleResponse(InboundHandler.java:275) ~[elasticsearch-7.12.0.jar:7.12.0]
        at org.elasticsearch.transport.InboundHandler.messageReceived(InboundHandler.java:128) ~[elasticsearch-7.12.0.jar:7.12.0]
        at org.elasticsearch.transport.InboundHandler.inboundMessage(InboundHandler.java:84) ~[elasticsearch-7.12.0.jar:7.12.0]
        at org.elasticsearch.transport.TcpTransport.inboundMessage(TcpTransport.java:690) ~[elasticsearch-7.12.0.jar:7.12.0]
        at org.elasticsearch.transport.netty4.Netty4MessageChannelHandler$$Lambda$5101/0x00000008017a8920.accept(Unknown Source) ~[?:?]
        at org.elasticsearch.transport.InboundPipeline.forwardFragments(InboundPipeline.java:131) ~[elasticsearch-7.12.0.jar:7.12.0]
        at org.elasticsearch.transport.InboundPipeline.doHandleBytes(InboundPipeline.java:106) ~[elasticsearch-7.12.0.jar:7.12.0]
        at org.elasticsearch.transport.InboundPipeline.handleBytes(InboundPipeline.java:71) ~[elasticsearch-7.12.0.jar:7.12.0]
        at org.elasticsearch.transport.netty4.Netty4MessageChannelHandler.channelRead(Netty4MessageChannelHandler.java:63) ~[?:?]
        at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) ~[?:?]
        at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) ~[?:?]
        at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357) ~[?:?]
        at io.netty.handler.logging.LoggingHandler.channelRead(LoggingHandler.java:271) ~[?:?]
        at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) ~[?:?]
        at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) ~[?:?]

Thanks. What does your Logstash config look like?

Which part? I have multiple pipelines...

  - pipeline.id: input
    path.config: "/etc/logstash/conf.d/00.Input.conf"
    pipeline.workers: 3

  - pipeline.id: kv_pipeline
    path.config: "/etc/logstash/conf.d/20.kv_pipeline.conf"
    pipeline.workers: 3

  - pipeline.id: clean_drop
    path.config: "/etc/logstash/conf.d/25.clean_drop.conf"
    pipeline.workers: 3

  - pipeline.id: cimera
    path.config: "/etc/logstash/conf.d/21.cimera.conf"
    pipeline.workers: 3

  - pipeline.id: fortigate_2_ecs
    path.config: "/etc/logstash/conf.d/30.fortigate_2_ecs.conf"
    pipeline.workers: 3

  - pipeline.id: enrichments
    path.config: "/etc/logstash/conf.d/35.enrichments.conf"
    pipeline.workers: 3

  - pipeline.id: output
    path.config: "/etc/logstash/conf.d/90.output.conf"
    pipeline.workers: 3
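
As an aside, each pipeline batches events independently; if bulk request size were the issue, the batch size of the output pipeline could be lowered in pipelines.yml, e.g. (125 is the Logstash default, 64 is just an assumed lower value):

  - pipeline.id: output
    path.config: "/etc/logstash/conf.d/90.output.conf"
    pipeline.workers: 3
    pipeline.batch.size: 64

Though given the error above (only 69.6kb of new bytes rejected against a nearly full heap), the heap size itself looks like the real constraint.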

Ah, OK.
Let's see the output from _cat/nodes?v=true.

ip         heap.percent ram.percent cpu load_1m load_5m load_15m node.role   master name
10.31.5.19           61          38   0    1.16    1.15     1.14 cdfhilmrstw -      node2
10.31.5.20           35          48   1    0.62    0.53     0.33 lmr         *      node3

node1 (with Logstash stopped) went down.
After starting node1 again:

ip         heap.percent ram.percent cpu load_1m load_5m load_15m node.role   master name
10.31.5.47           50          66  44    2.54    1.05     1.12 cdfhilmrstw -      node1
10.31.5.20           19          48   2    0.36    0.50     0.36 lmr         *      node3
10.31.5.19           42          38  15    0.90    1.16     1.15 cdfhilmrstw -      node2
