Sure, @Ignacio_Vera.
Under load, the Elasticsearch logs quite frequently start to contain these two log events:
```json
{
  "@timestamp": "2023-06-06T16:29:03.294Z",
  "log.level": "INFO",
  "message": "attempting to trigger G1GC due to high heap usage [7859992840]",
  "ecs.version": "1.2.0",
  "service.name": "ES_ECS",
  "event.dataset": "elasticsearch.server",
  "process.thread.name": "elasticsearch[x-elastic2-es8.xx][transport_worker][T#2]",
  "log.logger": "org.elasticsearch.indices.breaker.HierarchyCircuitBreakerService",
  "elasticsearch.cluster.uuid": "B5_WKX7-R8OvOAZwTy3DEg",
  "elasticsearch.node.id": "BAqS8hb9RaqifRMuKYVoKw",
  "elasticsearch.node.name": "x-elastic2-es8.xx",
  "elasticsearch.cluster.name": "x-cluster0-es8"
}
{
  "@timestamp": "2023-06-06T16:29:03.425Z",
  "log.level": "INFO",
  "message": "GC did bring memory usage down, before [7859992840], after [4110292480], allocations [87], duration [131]",
  "ecs.version": "1.2.0",
  "service.name": "ES_ECS",
  "event.dataset": "elasticsearch.server",
  "process.thread.name": "elasticsearch[x-elastic2-es8.xx][transport_worker][T#2]",
  "log.logger": "org.elasticsearch.indices.breaker.HierarchyCircuitBreakerService",
  "elasticsearch.cluster.uuid": "B5_WKX7-R8OvOAZwTy3DEg",
  "elasticsearch.node.id": "BAqS8hb9RaqifRMuKYVoKw",
  "elasticsearch.node.name": "x-elastic2-es8.xx",
  "elasticsearch.cluster.name": "x-cluster0-es8"
}
```
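For context on this pair of events: they come from the real-memory parent circuit breaker (`HierarchyCircuitBreakerService`), which reacts to heap usage approaching `indices.breaker.total.limit` (by default 95% of the heap when `indices.breaker.total.use_real_memory` is true) and, on G1, first attempts to trigger a GC before tripping. A minimal way to check the effective breaker settings on a node; the host and credentials below are placeholders for our setup, adjust as needed:

```sh
# Show the effective circuit-breaker settings, defaults included.
# filter_path trims the response to the breaker-related keys only.
# Host, user and TLS are placeholders for our environment.
curl -s -u "elastic:$ES_PASS" \
  "https://x-elastic2-es8.xx:9200/_cluster/settings?include_defaults=true&filter_path=*.indices.breaker*&pretty"
```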
And sometimes also this log event (roughly once per 10 minutes under workload):
```json
{
  "@timestamp": "2023-06-06T16:29:09.324Z",
  "log.level": "WARN",
  "message": "failed to retrieve stats for node [zMZlF4RNT9eFWD0x7fQgaA]",
  "ecs.version": "1.2.0",
  "service.name": "ES_ECS",
  "event.dataset": "elasticsearch.server",
  "process.thread.name": "elasticsearch[x-elastic2-es8.xx][generic][T#3]",
  "log.logger": "org.elasticsearch.cluster.InternalClusterInfoService",
  "elasticsearch.cluster.uuid": "B5_WKX7-R8OvOAZwTy3DEg",
  "elasticsearch.node.id": "BAqS8hb9RaqifRMuKYVoKw",
  "elasticsearch.node.name": "x-elastic2-es8.xx",
  "elasticsearch.cluster.name": "x-cluster0-es8",
  "error.type": "org.elasticsearch.transport.RemoteTransportException",
  "error.message": "[x-elastic1-es8.xxx][10.0.57.14:9300][cluster:monitor/nodes/stats[n]]",
  "error.stack_trace": "org.elasticsearch.transport.RemoteTransportException: [x-elastic1-es8.xxx][10.0.57.14:9300][cluster:monitor/nodes/stats[n]]\nCaused by: org.elasticsearch.common.breaker.CircuitBreakingException: [parent] Data too large, data for [cluster:monitor/nodes/stats[n]] would be [7859247260/7.3gb], which is larger than the limit of [7857609113/7.3gb], real usage: [7859246184/7.3gb], new bytes reserved: [1076/1kb], usages [eql_sequence=0/0b, model_inference=0/0b, inflight_requests=1076/1kb, request=0/0b, fielddata=0/0b]\n\tat org.elasticsearch.indices.breaker.HierarchyCircuitBreakerService.checkParentLimit(HierarchyCircuitBreakerService.java:414)\n\tat org.elasticsearch.common.breaker.ChildMemoryCircuitBreaker.addEstimateBytesAndMaybeBreak(ChildMemoryCircuitBreaker.java:109)\n\tat org.elasticsearch.transport.InboundAggregator.checkBreaker(InboundAggregator.java:215)\n\tat org.elasticsearch.transport.InboundAggregator.finishAggregation(InboundAggregator.java:119)\n\tat org.elasticsearch.transport.InboundPipeline.forwardFragments(InboundPipeline.java:147)\n\tat org.elasticsearch.transport.InboundPipeline.doHandleBytes(InboundPipeline.java:121)\n\tat org.elasticsearch.transport.InboundPipeline.handleBytes(InboundPipeline.java:86)\n\tat org.elasticsearch.transport.netty4.Netty4MessageInboundHandler.channelRead(Netty4MessageInboundHandler.java:63)\n\tat io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:444)\n\tat io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420)\n\tat io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:412)\n\tat io.netty.handler.logging.LoggingHandler.channelRead(LoggingHandler.java:280)\n\tat io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:442)\n\tat io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420)\n\tat io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:412)\n\tat io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:103)\n\tat io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:444)\n\tat io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420)\n\tat io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:412)\n\tat io.netty.handler.ssl.SslHandler.unwrap(SslHandler.java:1373)\n\tat io.netty.handler.ssl.SslHandler.decodeJdkCompatible(SslHandler.java:1236)\n\tat io.netty.handler.ssl.SslHandler.decode(SslHandler.java:1285)\n\tat io.netty.handler.codec.ByteToMessageDecoder.decodeRemovalReentryProtection(ByteToMessageDecoder.java:529)\n\tat io.netty.handler.codec.ByteToMessageDecoder.callDecode(ByteToMessageDecoder.java:468)\n\tat io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:290)\n\tat io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:444)\n\tat io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420)\n\tat io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:412)\n\tat io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1410)\n\tat io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:440)\n\tat io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420)\n\tat io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:919)\n\tat io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:166)\n\tat io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:788)\n\tat io.netty.channel.nio.NioEventLoop.processSelectedKeysPlan(NioEventLoop.java:689)\n\tat io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:652)\n\tat io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:562)\n\tat io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:997)\n\tat io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)\n\tat java.lang.Thread.run(Thread.java:1623)\n"
}
```
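The `CircuitBreakingException` above shows the parent breaker rejecting even a tiny (~1kb) inflight request, because real heap usage [7859246184] already sits right at the limit [7857609113]. The per-breaker counters behind this are exposed by the nodes stats API, presumably the same source our Prometheus exporter scrapes; a quick way to inspect them directly (same placeholder host/credentials as above):

```sh
# Per-node parent-breaker stats: configured limit, current estimate, trip count.
# Host, user and TLS are placeholders for our environment.
curl -s -u "elastic:$ES_PASS" \
  "https://x-elastic2-es8.xx:9200/_nodes/stats/breaker?filter_path=nodes.*.name,nodes.*.breakers.parent&pretty"
```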
This correlates with these circuit breaker Prometheus metrics: