All data nodes go down with OOM errors during heavy indexing periods

ES version: 7.5.2
JDK version: bundled openjdk 13

GC settings:
-XX:+UseG1GC
-XX:MaxGCPauseMillis=20
-XX:+UnlockExperimentalVMOptions
-XX:G1NewSizePercent=2

We do heavy indexing (sniffing is enabled, and all data nodes can use 100% of their CPU) in the middle of the night every day, and it has gone well except for yesterday.
Yesterday all data nodes went down.
I thought ES 7 had a parent circuit breaker to avoid OOM errors, but I still found OutOfMemoryError in the log file.

I don't want to slow down the indexing speed. What can I do to prevent this from happening again?

[2021-01-27T03:30:13,846][INFO ][o.e.m.j.JvmGcMonitorService] [node-data-0001] [gc][361018] overhead, spent [493ms] collecting in the last [1s]
[2021-01-27T03:30:15,967][INFO ][o.e.m.j.JvmGcMonitorService] [node-data-0001] [gc][361020] overhead, spent [439ms] collecting in the last [1s]
[2021-01-27T03:30:22,141][WARN ][o.e.m.j.JvmGcMonitorService] [node-data-0001] [gc][361021] overhead, spent [6.1s] collecting in the last [6.1s]
[2021-01-27T03:30:22,143][ERROR][o.e.ExceptionsHelper     ] [node-data-0001] fatal error
	at org.elasticsearch.ExceptionsHelper.lambda$maybeDieOnAnotherThread$4(ExceptionsHelper.java:300)
	at java.base/java.util.Optional.ifPresent(Optional.java:176)
	at org.elasticsearch.ExceptionsHelper.maybeDieOnAnotherThread(ExceptionsHelper.java:290)
	at org.elasticsearch.http.netty4.Netty4HttpRequestHandler.exceptionCaught(Netty4HttpRequestHandler.java:75)
	at io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:297)
	at io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:276)
	at io.netty.channel.AbstractChannelHandlerContext.fireExceptionCaught(AbstractChannelHandlerContext.java:268)
	at io.netty.channel.DefaultChannelPipeline$HeadContext.exceptionCaught(DefaultChannelPipeline.java:1389)
	at io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:297)
	at io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:276)
	at io.netty.channel.DefaultChannelPipeline.fireExceptionCaught(DefaultChannelPipeline.java:919)
	at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.handleReadException(AbstractNioByteChannel.java:125)
	at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:174)
	at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:700)
	at io.netty.channel.nio.NioEventLoop.processSelectedKeysPlain(NioEventLoop.java:600)
	at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:554)
	at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:514)
	at io.netty.util.concurrent.SingleThreadEventExecutor$6.run(SingleThreadEventExecutor.java:1050)
	at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
	at java.base/java.lang.Thread.run(Thread.java:830)
[2021-01-27T03:30:29,644][ERROR][o.e.b.ElasticsearchUncaughtExceptionHandler] [node-data-0001] fatal error in thread [Thread-3549], exiting
java.lang.OutOfMemoryError: Java heap space
[2021-01-27T03:30:29,644][ERROR][o.e.b.ElasticsearchUncaughtExceptionHandler] [node-data-0001] fatal error in thread [Thread-3542], exiting
java.lang.OutOfMemoryError: Java heap space
[2021-01-27T03:30:29,645][ERROR][o.e.b.ElasticsearchUncaughtExceptionHandler] [node-data-0001] fatal error in thread [Thread-3548], exiting
java.lang.OutOfMemoryError: Java heap space
[2021-01-27T03:30:29,645][WARN ][o.e.h.AbstractHttpServerTransport] [node-data-0001] caught exception while handling client http traffic, closing connection Netty4HttpChannel{localAddress=/10.111.55.172:9200, remoteAddress=/10.65.1.20:45719}
java.lang.Exception: java.lang.OutOfMemoryError: Java heap space
	at org.elasticsearch.http.netty4.Netty4HttpRequestHandler.exceptionCaught(Netty4HttpRequestHandler.java:78) [transport-netty4-client-7.5.2.jar:7.5.2]
	at io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:297) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.AbstractChannelHandlerContext.notifyHandlerException(AbstractChannelHandlerContext.java:831) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:376) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:360) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:352) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at org.elasticsearch.http.netty4.Netty4HttpPipeliningHandler.channelRead(Netty4HttpPipeliningHandler.java:58) [transport-netty4-client-7.5.2.jar:7.5.2]
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:374) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:360) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:352) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:102) [netty-codec-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.handler.codec.MessageToMessageCodec.channelRead(MessageToMessageCodec.java:111) [netty-codec-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:374) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:360) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:352) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:102) [netty-codec-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:374) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:360) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:352) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:102) [netty-codec-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:374) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:360) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:352) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:326) [netty-codec-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:300) [netty-codec-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:374) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:360) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:352) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.handler.timeout.IdleStateHandler.channelRead(IdleStateHandler.java:287) [netty-handler-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:374) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:360) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:352) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1422) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:374) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:360) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:931) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:163) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:700) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.nio.NioEventLoop.processSelectedKeysPlain(NioEventLoop.java:600) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:554) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:514) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.util.concurrent.SingleThreadEventExecutor$6.run(SingleThreadEventExecutor.java:1050) [netty-common-4.1.43.Final.jar:4.1.43.Final]
	at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) [netty-common-4.1.43.Final.jar:4.1.43.Final]
	at java.lang.Thread.run(Thread.java:830) [?:?]
Caused by: java.lang.OutOfMemoryError: Java heap space

What is the full output of the cluster stats API?

I did not capture the stats yesterday.
These are the stats that I captured just now:

{
  "_nodes" : {
    "total" : 83,
    "successful" : 83,
    "failed" : 0
  },
  "cluster_name" : "MyES",
  "cluster_uuid" : "bfJTwho2RQ6LNxXLphc7xA",
  "timestamp" : 1611803773538,
  "status" : "green",
  "indices" : {
    "count" : 41,
    "shards" : {
      "total" : 1804,
      "primaries" : 1011,
      "replication" : 0.7843719090009891,
      "index" : {
        "shards" : {
          "min" : 2,
          "max" : 96,
          "avg" : 44.0
        },
        "primaries" : {
          "min" : 1,
          "max" : 48,
          "avg" : 24.658536585365855
        },
        "replication" : {
          "min" : 0.0,
          "max" : 47.0,
          "avg" : 1.853658536585366
        }
      }
    },
    "docs" : {
      "count" : 5393051118,
      "deleted" : 166228065
    },
    "store" : {
      "size_in_bytes" : 23069434957371
    },
    "fielddata" : {
      "memory_size_in_bytes" : 1264,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size_in_bytes" : 658258645,
      "total_count" : 3737533597,
      "hit_count" : 1473121732,
      "miss_count" : 2264411865,
      "cache_size" : 502760,
      "cache_count" : 75918531,
      "evictions" : 75415771
    },
    "completion" : {
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 26424,
      "memory_in_bytes" : 15631854462,
      "terms_memory_in_bytes" : 2411989545,
      "stored_fields_memory_in_bytes" : 6088231872,
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory_in_bytes" : 1216512,
      "points_memory_in_bytes" : 6303929693,
      "doc_values_memory_in_bytes" : 826486840,
      "index_writer_memory_in_bytes" : 74816694,
      "version_map_memory_in_bytes" : 80565,
      "fixed_bit_set_memory_in_bytes" : 192,
      "max_unsafe_auto_id_timestamp" : -1,
      "file_sizes" : { }
    }
  },
  "nodes" : {
    "count" : {
      "total" : 83,
      "coordinating_only" : 32,
      "data" : 48,
      "ingest" : 48,
      "master" : 3,
      "ml" : 0,
      "voting_only" : 0
    },
    "versions" : [
      "7.5.2"
    ],
    "os" : {
      "available_processors" : 1286,
      "allocated_processors" : 1286,
      "names" : [
        {
          "name" : "Linux",
          "count" : 83
        }
      ],
      "pretty_names" : [
        {
          "pretty_name" : "CentOS Linux 7 (Core)",
          "count" : 83
        }
      ],
      "mem" : {
        "total_in_bytes" : 4333211127808,
        "free_in_bytes" : 337708449792,
        "used_in_bytes" : 3995502678016,
        "free_percent" : 8,
        "used_percent" : 92
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 604
      },
      "open_file_descriptors" : {
        "min" : 2038,
        "max" : 3099,
        "avg" : 2641
      }
    },
    "jvm" : {
      "max_uptime_in_millis" : 2720622232,
      "versions" : [
        {
          "version" : "13.0.1",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "13.0.1+9",
          "vm_vendor" : "AdoptOpenJDK",
          "bundled_jdk" : true,
          "using_bundled_jdk" : true,
          "count" : 83
        }
      ],
      "mem" : {
        "heap_used_in_bytes" : 990124482944,
        "heap_max_in_bytes" : 2160368549888
      },
      "threads" : 11607
    },
    "fs" : {
      "total_in_bytes" : 93592137940992,
      "free_in_bytes" : 70327794900992,
      "available_in_bytes" : 65651127644160
    },
    "plugins" : [ ],
    "network_types" : {
      "transport_types" : {
        "security4" : 83
      },
      "http_types" : {
        "security4" : 83
      }
    },
    "discovery_types" : {
      "zen" : 83
    },
    "packaging_types" : [
      {
        "flavor" : "default",
        "type" : "rpm",
        "count" : 83
      }
    ]
  }
}

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.