7.6.0 High CPU on a data node

Found the need to restart the elasticsearch service on one of our 7.6.0 data nodes today as it was in high CPU load. Maybe a general bug that bit us?

Top of the only stack dump/issue seen in the log:

[2020-02-21T09:15:34,251][DEBUG][o.e.a.s.TransportSearchScrollAction] [d1r2n14] [1416773] Failed to execute query phase
org.elasticsearch.transport.RemoteTransportException: [d1r1n12][62.243.41.176:9300][indices:data/read/search[phase/query+fetch/scroll]]
Caused by: org.elasticsearch.search.SearchContextMissingException: No search context found for id [1416773]
        at org.elasticsearch.search.SearchService.getExecutor(SearchService.java:462) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.search.SearchService.runAsync(SearchService.java:347) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.search.SearchService.executeFetchPhase(SearchService.java:474) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.action.search.SearchTransportService.lambda$registerRequestHandler$10(SearchTransportService.java:356) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.xpack.security.transport.SecurityServerTransportInterceptor$ProfileSecuredRequestHandler$1.doRun(SecurityServerTransportInterceptor.java:257) ~[?:?]
        at org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.common.util.concurrent.EsExecutors$DirectExecutorService.execute(EsExecutors.java:225) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.xpack.security.transport.SecurityServerTransportInterceptor$ProfileSecuredRequestHandler.lambda$messageReceived$0(SecurityServerTransportInterceptor.java:306) ~[?:?]
        at org.elasticsearch.action.ActionListener$1.onResponse(ActionListener.java:63) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.xpack.security.authz.AuthorizationService.lambda$runRequestInterceptors$15(AuthorizationService.java:344) ~[?:?]
        at org.elasticsearch.action.ActionListener$1.onResponse(ActionListener.java:63) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.common.util.concurrent.ListenableFuture$1.doRun(ListenableFuture.java:112) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.common.util.concurrent.EsExecutors$DirectExecutorService.execute(EsExecutors.java:225) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.common.util.concurrent.ListenableFuture.notifyListener(ListenableFuture.java:106) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.common.util.concurrent.ListenableFuture.lambda$done$0(ListenableFuture.java:98) ~[elasticsearch-7.6.0.jar:7.6.0]
        at java.util.ArrayList.forEach(ArrayList.java:1507) ~[?:?]
        at org.elasticsearch.common.util.concurrent.ListenableFuture.done(ListenableFuture.java:98) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.common.util.concurrent.BaseFuture.set(BaseFuture.java:144) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.common.util.concurrent.ListenableFuture.onResponse(ListenableFuture.java:127) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.action.StepListener.innerOnResponse(StepListener.java:62) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.action.NotifyOnceListener.onResponse(NotifyOnceListener.java:40) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.xpack.security.authz.interceptor.FieldAndDocumentLevelSecurityRequestInterceptor.intercept(FieldAndDocumentLevelSecurityRequestInterceptor.java:61) ~[?:?]
        at org.elasticsearch.xpack.security.authz.interceptor.UpdateRequestInterceptor.intercept(UpdateRequestInterceptor.java:23) ~[?:?]
        at org.elasticsearch.xpack.security.authz.AuthorizationService.lambda$runRequestInterceptors$14(AuthorizationService.java:339) ~[?:?]
        at org.elasticsearch.action.ActionListener$1.onResponse(ActionListener.java:63) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.common.util.concurrent.ListenableFuture$1.doRun(ListenableFuture.java:112) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.common.util.concurrent.EsExecutors$DirectExecutorService.execute(EsExecutors.java:225) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.common.util.concurrent.ListenableFuture.notifyListener(ListenableFuture.java:106) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.common.util.concurrent.ListenableFuture.lambda$done$0(ListenableFuture.java:98) ~[elasticsearch-7.6.0.jar:7.6.0]
        at java.util.ArrayList.forEach(ArrayList.java:1507) ~[?:?]
        at org.elasticsearch.common.util.concurrent.ListenableFuture.done(ListenableFuture.java:98) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.common.util.concurrent.BaseFuture.set(BaseFuture.java:144) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.common.util.concurrent.ListenableFuture.onResponse(ListenableFuture.java:127) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.action.StepListener.innerOnResponse(StepListener.java:62) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.action.NotifyOnceListener.onResponse(NotifyOnceListener.java:40) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.xpack.security.authz.interceptor.FieldAndDocumentLevelSecurityRequestInterceptor.intercept(FieldAndDocumentLevelSecurityRequestInterceptor.java:61) ~[?:?]
        at org.elasticsearch.xpack.security.authz.interceptor.SearchRequestInterceptor.intercept(SearchRequestInterceptor.java:19) ~[?:?]
        at org.elasticsearch.xpack.security.authz.AuthorizationService.lambda$runRequestInterceptors$14(AuthorizationService.java:339) ~[?:?]
        at org.elasticsearch.action.ActionListener$1.onResponse(ActionListener.java:63) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.common.util.concurrent.ListenableFuture$1.doRun(ListenableFuture.java:112) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.common.util.concurrent.EsExecutors$DirectExecutorService.execute(EsExecutors.java:225) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.common.util.concurrent.ListenableFuture.notifyListener(ListenableFuture.java:106) ~[elasticsearch-7.6.0.jar:7.6.0]
        at org.elasticsearch.common.util.concurrent.ListenableFuture.lambda$done$0(ListenableFuture.java:98) ~[elasticsearch-7.6.0.jar:7.6.0]
        at java.util.ArrayList.forEach(ArrayList.java:1507) ~[?:?]
...
    at io.netty.handler.ssl.SslHandler.decode(SslHandler.java:1274) [netty-handler-4.1.43.Final.jar:4.1.43.Final]
    at io.netty.handler.codec.ByteToMessageDecoder.decodeRemovalReentryProtection(ByteToMessageDecoder.java:503) [netty-codec-4.1.43.Final.jar:4.1.43.Final]
    at io.netty.handler.codec.ByteToMessageDecoder.callDecode(ByteToMessageDecoder.java:442) [netty-codec-4.1.43.Final.jar:4.1.43.Final]
    at io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:281) [netty-codec-4.1.43.Final.jar:4.1.43.Final]
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:374) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:360) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
    at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:352) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
    at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1422) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:374) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:360) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
    at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:931) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
    at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:163) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
    at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:700) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
    at io.netty.channel.nio.NioEventLoop.processSelectedKeysPlain(NioEventLoop.java:600) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
    at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:554) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
    at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:514) [netty-transport-4.1.43.Final.jar:4.1.43.Final]
    at io.netty.util.concurrent.SingleThreadEventExecutor$6.run(SingleThreadEventExecutor.java:1050) [netty-common-4.1.43.Final.jar:4.1.43.Final]
    at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) [netty-common-4.1.43.Final.jar:4.1.43.Final]
    at java.lang.Thread.run(Thread.java:830) [?:?]
[2020-02-21T09:41:15,987][INFO ][o.e.n.Node               ] [d1r2n14] stopping ...
[2020-02-21T09:41:16,096][INFO ][o.e.x.w.WatcherService   ] [d1r2n14] stopping watch service, reason [shutdown initiated]
[2020-02-21T09:41:16,097][INFO ][o.e.x.w.WatcherLifeCycleService] [d1r2n14] watcher has stopped and shutdown
[2020-02-21T09:41:16,241][INFO ][o.e.c.c.Coordinator      ] [d1r2n14] master node [{es-mst1}{90yeCMLeTb66cCVzAsz3aA}{KWa-QIHXQEmogR5ff_dNVw}{es-mst1}{62.243.41.230:9300}{lm}{ml.machine_memory=6251069440, rack=OPA3.3.16, ml.max_open_jobs=20, xpack.installed=true}] failed, restarting discovery
org.elasticsearch.transport.NodeDisconnectedException: [es-mst1][62.243.41.230:9300][disconnected] disconnected
[2020-02-21T09:41:16,357][INFO ][o.e.x.m.p.l.CppLogMessageHandler] [d1r2n14] [controller/22056] [Main.cc@150] Ml controller exiting
[2020-02-21T09:41:16,357][INFO ][o.e.x.m.p.NativeController] [d1r2n14] Native controller process has stopped - no new native processes can be started
[2020-02-21T09:41:21,805][INFO ][o.e.n.Node               ] [d1r2n14] stopped
[2020-02-21T09:41:21,805][INFO ][o.e.n.Node               ] [d1r2n14] closing ...
[2020-02-21T09:41:21,840][INFO ][o.e.n.Node               ] [d1r2n14] closed

Attaching a few load-metric screen dumps of the cluster:

[image attachment]

[image attachment]

[image attachment]

Screenshot 2020-02-21 at 12.15.13 [image attachment]

Sorry, I haven't got pretty GC stats, but there doesn't seem to be any GC activity going on at the time:

[2020-02-21T07:52:14.895+0000][22009][gc           ] GC(24545) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14161M->4584M(16384M) 30.060ms
[2020-02-21T07:53:00.471+0000][22009][gc           ] GC(24548) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14172M->4563M(16384M) 26.712ms
[2020-02-21T07:54:29.023+0000][22009][gc           ] GC(24554) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14184M->4497M(16384M) 18.694ms
[2020-02-21T07:55:13.616+0000][22009][gc           ] GC(24557) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14183M->4471M(16384M) 17.285ms
[2020-02-21T07:55:59.780+0000][22009][gc           ] GC(24560) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14192M->4496M(16384M) 18.964ms
[2020-02-21T07:56:48.802+0000][22009][gc           ] GC(24563) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14191M->4525M(16384M) 21.815ms
[2020-02-21T07:57:31.788+0000][22009][gc           ] GC(24566) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14191M->4533M(16384M) 22.506ms
[2020-02-21T07:58:19.684+0000][22009][gc           ] GC(24569) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14193M->4550M(16384M) 28.618ms
[2020-02-21T07:59:05.830+0000][22009][gc           ] GC(24572) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14204M->4554M(16384M) 25.608ms
[2020-02-21T07:59:56.404+0000][22009][gc           ] GC(24575) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14212M->4566M(16384M) 29.546ms
[2020-02-21T08:00:43.902+0000][22009][gc           ] GC(24578) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14226M->4579M(16384M) 26.064ms
[2020-02-21T08:01:28.579+0000][22009][gc           ] GC(24581) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14234M->4592M(16384M) 36.580ms
[2020-02-21T08:02:13.290+0000][22009][gc           ] GC(24584) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14245M->4600M(16384M) 25.749ms
[2020-02-21T08:02:51.621+0000][22009][gc           ] GC(24587) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14249M->4580M(16384M) 28.279ms
[2020-02-21T08:03:38.794+0000][22009][gc           ] GC(24590) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14264M->4597M(16384M) 22.215ms
[2020-02-21T08:04:28.946+0000][22009][gc           ] GC(24593) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14282M->4596M(16384M) 21.563ms
[2020-02-21T08:05:14.430+0000][22009][gc           ] GC(24596) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14293M->4629M(16384M) 22.994ms
[2020-02-21T08:06:06.607+0000][22009][gc           ] GC(24599) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14304M->4625M(16384M) 20.077ms
[2020-02-21T08:06:53.225+0000][22009][gc           ] GC(24602) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14309M->4608M(16384M) 20.584ms
[2020-02-21T08:07:39.729+0000][22009][gc           ] GC(24605) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14323M->4590M(16384M) 18.288ms
[2020-02-21T08:08:31.131+0000][22009][gc           ] GC(24608) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14326M->4607M(16384M) 23.847ms
[2020-02-21T08:09:22.545+0000][22009][gc           ] GC(24611) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14333M->4621M(16384M) 17.531ms
[2020-02-21T08:10:13.861+0000][22009][gc           ] GC(24614) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14337M->4615M(16384M) 18.267ms
[2020-02-21T08:11:00.239+0000][22009][gc           ] GC(24617) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14328M->4599M(16384M) 16.572ms
[2020-02-21T08:11:42.173+0000][22009][gc           ] GC(24620) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14338M->4588M(16384M) 16.805ms
[2020-02-21T08:12:32.919+0000][22009][gc           ] GC(24623) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14338M->4616M(16384M) 16.449ms
[2020-02-21T08:13:21.523+0000][22009][gc           ] GC(24626) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14340M->4596M(16384M) 14.188ms
[2020-02-21T08:14:13.504+0000][22009][gc           ] GC(24629) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14344M->4607M(16384M) 15.639ms
[2020-02-21T08:15:00.353+0000][22009][gc           ] GC(24632) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14341M->4614M(16384M) 15.823ms
[2020-02-21T08:15:35.437+0000][22009][gc           ] GC(24635) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14347M->4629M(16384M) 19.054ms
[2020-02-21T08:16:19.361+0000][22009][gc           ] GC(24638) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14345M->4642M(16384M) 24.722ms
[2020-02-21T08:17:06.080+0000][22009][gc           ] GC(24641) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14343M->4629M(16384M) 17.499ms
[2020-02-21T08:17:46.941+0000][22009][gc           ] GC(24644) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14349M->4650M(16384M) 19.312ms
[2020-02-21T08:18:26.945+0000][22009][gc           ] GC(24647) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14349M->4664M(16384M) 29.667ms
[2020-02-21T08:21:58.580+0000][22009][gc           ] GC(24662) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14077M->4675M(16384M) 17.530ms
[2020-02-21T08:22:45.711+0000][22009][gc           ] GC(24665) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14217M->4661M(16384M) 15.689ms
[2020-02-21T08:23:32.454+0000][22009][gc           ] GC(24668) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14305M->4654M(16384M) 14.649ms
[2020-02-21T08:24:20.076+0000][22009][gc           ] GC(24671) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14374M->4653M(16384M) 15.209ms
[2020-02-21T08:25:09.847+0000][22009][gc           ] GC(24674) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14406M->4658M(16384M) 13.765ms
[2020-02-21T08:26:02.768+0000][22009][gc           ] GC(24677) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14405M->4670M(16384M) 14.875ms
[2020-02-21T08:26:47.873+0000][22009][gc           ] GC(24680) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14405M->4660M(16384M) 14.288ms
[2020-02-21T08:27:34.075+0000][22009][gc           ] GC(24683) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14405M->4707M(16384M) 18.154ms
[2020-02-21T08:28:18.667+0000][22009][gc           ] GC(24686) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14406M->4688M(16384M) 16.671ms
[2020-02-21T08:29:06.627+0000][22009][gc           ] GC(24689) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14409M->4696M(16384M) 16.983ms
[2020-02-21T08:29:58.055+0000][22009][gc           ] GC(24692) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14407M->4693M(16384M) 16.375ms
[2020-02-21T08:30:44.772+0000][22009][gc           ] GC(24695) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14414M->4689M(16384M) 15.564ms
[2020-02-21T08:31:31.893+0000][22009][gc           ] GC(24698) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14418M->4670M(16384M) 13.906ms
[2020-02-21T08:32:17.522+0000][22009][gc           ] GC(24701) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14414M->4681M(16384M) 15.449ms
[2020-02-21T08:33:07.077+0000][22009][gc           ] GC(24704) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14420M->4677M(16384M) 14.410ms
[2020-02-21T08:33:55.954+0000][22009][gc           ] GC(24707) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14415M->4672M(16384M) 14.394ms
[2020-02-21T08:34:47.024+0000][22009][gc           ] GC(24710) Pause Young (Concurrent Start) (G1 Evacuation Pause) 14418M->4663M(16384M) 12.823ms

Just a bit of info from restart:

[2020-02-21T09:41:54,838][INFO ][o.e.e.NodeEnvironment    ] [d1r2n14] using [1] data paths, mounts [[/mxes_data/1 (/dev/mapper/vg--mxes-lv--mxes)]], net usable_space [11.6tb], net total_space [13tb], types [xfs]
[2020-02-21T09:41:54,840][INFO ][o.e.e.NodeEnvironment    ] [d1r2n14] heap size [16gb], compressed ordinary object pointers [true]
[2020-02-21T09:42:03,125][INFO ][o.e.n.Node               ] [d1r2n14] node name [d1r2n14], node ID [13aM0HqSTSGRSj50j6VT6g], cluster name [mxes2data]
[2020-02-21T09:42:03,126][INFO ][o.e.n.Node               ] [d1r2n14] version[7.6.0], pid[21549], build[default/rpm/7f634e9f44834fbc12724506cc1da681b0c3b1e3/2020-02-06T00:09:00.449973Z], OS[Linux/4.5.4-1.el6.elrepo.x86_64/amd64], JVM[AdoptOpenJDK/OpenJDK 64-Bit Server VM/13.0.2/13.0.2+8]
[2020-02-21T09:42:03,126][INFO ][o.e.n.Node               ] [d1r2n14] JVM home [/usr/share/elasticsearch/jdk]

Nodes hot threads is normally a good way to investigate high CPU, especially if it's not GC. Sounds like you're no longer seeing this problem, but if it recurs then GET _nodes/hot_threads?threads=99999 will show you what Elasticsearch is busy doing.

Thanks, will try to remember!

1 Like

Seems to elude me :confused:

curl -ku <redacted> -s -H 'Content-Type: application/json' -XGET "https://localhost:9200/_nodes/d1r2n14/hot_threads" | jq .
parse error: Expected string key before ':' at line 1, column 1 

where as this works:

curl -ku <redacted> -s -H 'Content-Type: application/json' -XGET "https://localhost:9200/_nodes/d1r2n14" | jq . | head
{
  "_nodes": {
    "total": 1,
    "successful": 1,
    "failed": 0
  },
  "cluster_name": "mxes2data",
  "nodes": {
    "<redacted>": {
      "name": "d1r2n14",

The result of the hot threads output is plain text, not JSON, so jq will be of no help to you.

(-H 'Content-Type: application/json' tells Elasticsearch that you are sending JSON in your request, but does not impose any restrictions on the format of the response)

:slight_smile: okay, managed to trigger the issue again, and this time I grabbed a few hot-thread samples — not surprisingly search threads, as it's triggered by attempting to filter a search in the SIEM App:

# grep 'cpu usage' /tmp/es_hot_threads*
/tmp/es_hot_threads_d1r1n11.txt:    4.3% (21.7ms out of 500ms) cpu usage by thread 'elasticsearch[d1r1n11][write][T#16]'
/tmp/es_hot_threads_d1r1n11.txt:    4.3% (21.5ms out of 500ms) cpu usage by thread 'elasticsearch[d1r1n11][write][T#14]'
/tmp/es_hot_threads_d1r1n12.txt:   100.7% (503.3ms out of 500ms) cpu usage by thread 'elasticsearch[d1r1n12][search][T#25]'
/tmp/es_hot_threads_d1r1n12.txt:   100.7% (503.3ms out of 500ms) cpu usage by thread 'elasticsearch[d1r1n12][search][T#5]'
/tmp/es_hot_threads_d1r1n12.txt:   100.6% (502.8ms out of 500ms) cpu usage by thread 'elasticsearch[d1r1n12][search][T#1]'
/tmp/es_hot_threads_d1r1n14.txt:    5.7% (28.5ms out of 500ms) cpu usage by thread 'elasticsearch[d1r1n14][transport_worker][T#26]'
/tmp/es_hot_threads.txt:   100.4% (501.9ms out of 500ms) cpu usage by thread 'elasticsearch[d1r2n14][search][T#16]'
/tmp/es_hot_threads.txt:   25.9% (129.6ms out of 500ms) cpu usage by thread 'elasticsearch[d1r2n14][[winlogbeat-7.6.0-2020.02.24][2]: Lucene Merge Thread #2468]'

Good. The corresponding stack traces will show us what exactly the searches are getting hung up on - can you share them too?

Sure, probably from the coordinating node:

::: {d1r2n14}{13aM0HqSTSGRSj50j6VT6g}{h7hwyX_hRHSklLzXkBnZrQ}{d1r2n14}{<redacted>.214:9300}{dil}{ml.machine_memory=33673216000, rack=OPA3.3.16, xpack.installed=true, ml.max_open_jobs=20}
   Hot threads at 2020-02-24T11:01:33.739Z, interval=500ms, busiestThreads=3, ignoreIdleThreads=true:
   
   100.4% (501.9ms out of 500ms) cpu usage by thread 'elasticsearch[d1r2n14][search][T#16]'
     5/10 snapshots sharing following 42 elements
       app//org.apache.lucene.codecs.lucene80.Lucene80DocValuesProducer$TermsDict.seekExact(Lucene80DocValuesProducer.java:987)
       app//org.apache.lucene.codecs.lucene80.Lucene80DocValuesProducer$BaseSortedDocValues.lookupOrd(Lucene80DocValuesProducer.java:869)
       app//org.apache.lucene.index.SingletonSortedSetDocValues.lookupOrd(SingletonSortedSetDocValues.java:92)
       app//org.elasticsearch.index.fielddata.FieldData$10.nextValue(FieldData.java:367)
       app//org.elasticsearch.search.aggregations.bucket.terms.StringTermsAggregator$1.collect(StringTermsAggregator.java:92)
       app//org.elasticsearch.search.aggregations.LeafBucketCollector.collect(LeafBucketCollector.java:82)
       app//org.apache.lucene.search.MatchAllDocsQuery$1$1.score(MatchAllDocsQuery.java:64)
       app//org.elasticsearch.search.internal.CancellableBulkScorer.score(CancellableBulkScorer.java:56)
       app//org.apache.lucene.search.BulkScorer.score(BulkScorer.java:39)
       app//org.elasticsearch.search.internal.ContextIndexSearcher.searchLeaf(ContextIndexSearcher.java:198)
       app//org.elasticsearch.search.internal.ContextIndexSearcher.search(ContextIndexSearcher.java:171)
       app//org.apache.lucene.search.IndexSearcher.search(IndexSearcher.java:445)
       app//org.elasticsearch.search.query.QueryPhase.searchWithCollector(QueryPhase.java:333)
       app//org.elasticsearch.search.query.QueryPhase.executeInternal(QueryPhase.java:295)
       app//org.elasticsearch.search.query.QueryPhase.execute(QueryPhase.java:134)
       app//org.elasticsearch.indices.IndicesService.lambda$loadIntoContext$19(IndicesService.java:1335)
       app//org.elasticsearch.indices.IndicesService$$Lambda$5269/0x0000000801a57c40.accept(Unknown Source)
       app//org.elasticsearch.indices.IndicesService.lambda$cacheShardLevelResult$20(IndicesService.java:1392)
       app//org.elasticsearch.indices.IndicesService$$Lambda$5271/0x0000000801a57440.get(Unknown Source)
       app//org.elasticsearch.indices.IndicesRequestCache$Loader.load(IndicesRequestCache.java:174)
       app//org.elasticsearch.indices.IndicesRequestCache$Loader.load(IndicesRequestCache.java:157)
       app//org.elasticsearch.common.cache.Cache.computeIfAbsent(Cache.java:433)
       app//org.elasticsearch.indices.IndicesRequestCache.getOrCompute(IndicesRequestCache.java:123)
       app//org.elasticsearch.indices.IndicesService.cacheShardLevelResult(IndicesService.java:1398)
       app//org.elasticsearch.indices.IndicesService.loadIntoContext(IndicesService.java:1332)
       app//org.elasticsearch.search.SearchService.loadOrExecuteQueryPhase(SearchService.java:336)
       app//org.elasticsearch.search.SearchService.executeQueryPhase(SearchService.java:358)
       app//org.elasticsearch.search.SearchService.lambda$executeQueryPhase$1(SearchService.java:343)
       app//org.elasticsearch.search.SearchService$$Lambda$5106/0x0000000801a07440.apply(Unknown Source)
       app//org.elasticsearch.action.ActionListener.lambda$map$2(ActionListener.java:146)
       app//org.elasticsearch.action.ActionListener$$Lambda$4026/0x000000080173bc40.accept(Unknown Source)
       app//org.elasticsearch.action.ActionListener$1.onResponse(ActionListener.java:63)
       app//org.elasticsearch.action.ActionRunnable.lambda$supply$0(ActionRunnable.java:58)
       app//org.elasticsearch.action.ActionRunnable$$Lambda$5116/0x0000000801a04c40.accept(Unknown Source)
       app//org.elasticsearch.action.ActionRunnable$2.doRun(ActionRunnable.java:73)
       app//org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37)
       app//org.elasticsearch.common.util.concurrent.TimedRunnable.doRun(TimedRunnable.java:44)
       app//org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:692)
       app//org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37)
       java.base@13.0.2/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
       java.base@13.0.2/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
       java.base@13.0.2/java.lang.Thread.run(Thread.java:830)
     3/10 snapshots sharing following 41 elements
       app//org.apache.lucene.codecs.lucene80.Lucene80DocValuesProducer$20.advanceExact(Lucene80DocValuesProducer.java:839)
       app//org.apache.lucene.index.SingletonSortedSetDocValues.advanceExact(SingletonSortedSetDocValues.java:82)
       app//org.elasticsearch.index.fielddata.FieldData$10.advanceExact(FieldData.java:345)
       app//org.elasticsearch.search.aggregations.bucket.terms.StringTermsAggregator$1.collect(StringTermsAggregator.java:85)
       app//org.elasticsearch.search.aggregations.LeafBucketCollector.collect(LeafBucketCollector.java:82)
       app//org.apache.lucene.search.MatchAllDocsQuery$1$1.score(MatchAllDocsQuery.java:64)
       app//org.elasticsearch.search.internal.CancellableBulkScorer.score(CancellableBulkScorer.java:56)
       app//org.apache.lucene.search.BulkScorer.score(BulkScorer.java:39)
       app//org.elasticsearch.search.internal.ContextIndexSearcher.searchLeaf(ContextIndexSearcher.java:198)
       app//org.elasticsearch.search.internal.ContextIndexSearcher.search(ContextIndexSearcher.java:171)
       app//org.apache.lucene.search.IndexSearcher.search(IndexSearcher.java:445)
       app//org.elasticsearch.search.query.QueryPhase.searchWithCollector(QueryPhase.java:333)
       app//org.elasticsearch.search.query.QueryPhase.executeInternal(QueryPhase.java:295)
       app//org.elasticsearch.search.query.QueryPhase.execute(QueryPhase.java:134)
       app//org.elasticsearch.indices.IndicesService.lambda$loadIntoContext$19(IndicesService.java:1335)
       app//org.elasticsearch.indices.IndicesService$$Lambda$5269/0x0000000801a57c40.accept(Unknown Source)
       app//org.elasticsearch.indices.IndicesService.lambda$cacheShardLevelResult$20(IndicesService.java:1392)
       app//org.elasticsearch.indices.IndicesService$$Lambda$5271/0x0000000801a57440.get(Unknown Source)
       app//org.elasticsearch.indices.IndicesRequestCache$Loader.load(IndicesRequestCache.java:174)
       app//org.elasticsearch.indices.IndicesRequestCache$Loader.load(IndicesRequestCache.java:157)
       app//org.elasticsearch.common.cache.Cache.computeIfAbsent(Cache.java:433)
       app//org.elasticsearch.indices.IndicesRequestCache.getOrCompute(IndicesRequestCache.java:123)
       app//org.elasticsearch.indices.IndicesService.cacheShardLevelResult(IndicesService.java:1398)
       app//org.elasticsearch.indices.IndicesService.loadIntoContext(IndicesService.java:1332)
       app//org.elasticsearch.search.SearchService.loadOrExecuteQueryPhase(SearchService.java:336)
       app//org.elasticsearch.search.SearchService.executeQueryPhase(SearchService.java:358)
       app//org.elasticsearch.search.SearchService.lambda$executeQueryPhase$1(SearchService.java:343)
       app//org.elasticsearch.search.SearchService$$Lambda$5106/0x0000000801a07440.apply(Unknown Source)
       app//org.elasticsearch.action.ActionListener.lambda$map$2(ActionListener.java:146)
       app//org.elasticsearch.action.ActionListener$$Lambda$4026/0x000000080173bc40.accept(Unknown Source)
       app//org.elasticsearch.action.ActionListener$1.onResponse(ActionListener.java:63)
       app//org.elasticsearch.action.ActionRunnable.lambda$supply$0(ActionRunnable.java:58)
       app//org.elasticsearch.action.ActionRunnable$$Lambda$5116/0x0000000801a04c40.accept(Unknown Source)
       app//org.elasticsearch.action.ActionRunnable$2.doRun(ActionRunnable.java:73)
       app//org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37)
       app//org.elasticsearch.common.util.concurrent.TimedRunnable.doRun(TimedRunnable.java:44)
       app//org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:692)
       app//org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37)
       java.base@13.0.2/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
       java.base@13.0.2/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
       java.base@13.0.2/java.lang.Thread.run(Thread.java:830)
     2/10 snapshots sharing following 38 elements
       app//org.elasticsearch.search.aggregations.bucket.terms.StringTermsAggregator$1.collect(StringTermsAggregator.java:85)
       app//org.elasticsearch.search.aggregations.LeafBucketCollector.collect(LeafBucketCollector.java:82)
       app//org.apache.lucene.search.MatchAllDocsQuery$1$1.score(MatchAllDocsQuery.java:64)
       app//org.elasticsearch.search.internal.CancellableBulkScorer.score(CancellableBulkScorer.java:56)
       app//org.apache.lucene.search.BulkScorer.score(BulkScorer.java:39)
       app//org.elasticsearch.search.internal.ContextIndexSearcher.searchLeaf(ContextIndexSearcher.java:198)
       app//org.elasticsearch.search.internal.ContextIndexSearcher.search(ContextIndexSearcher.java:171)
       app//org.apache.lucene.search.IndexSearcher.search(IndexSearcher.java:445)
       app//org.elasticsearch.search.query.QueryPhase.searchWithCollector(QueryPhase.java:333)
       app//org.elasticsearch.search.query.QueryPhase.executeInternal(QueryPhase.java:295)
       app//org.elasticsearch.search.query.QueryPhase.execute(QueryPhase.java:134)
       app//org.elasticsearch.indices.IndicesService.lambda$loadIntoContext$19(IndicesService.java:1335)
       app//org.elasticsearch.indices.IndicesService$$Lambda$5269/0x0000000801a57c40.accept(Unknown Source)
       app//org.elasticsearch.indices.IndicesService.lambda$cacheShardLevelResult$20(IndicesService.java:1392)
       app//org.elasticsearch.indices.IndicesService$$Lambda$5271/0x0000000801a57440.get(Unknown Source)
       app//org.elasticsearch.indices.IndicesRequestCache$Loader.load(IndicesRequestCache.java:174)
       app//org.elasticsearch.indices.IndicesRequestCache$Loader.load(IndicesRequestCache.java:157)
       app//org.elasticsearch.common.cache.Cache.computeIfAbsent(Cache.java:433)
       app//org.elasticsearch.indices.IndicesRequestCache.getOrCompute(IndicesRequestCache.java:123)
       app//org.elasticsearch.indices.IndicesService.cacheShardLevelResult(IndicesService.java:1398)
       app//org.elasticsearch.indices.IndicesService.loadIntoContext(IndicesService.java:1332)
       app//org.elasticsearch.search.SearchService.loadOrExecuteQueryPhase(SearchService.java:336)
       app//org.elasticsearch.search.SearchService.executeQueryPhase(SearchService.java:358)
       app//org.elasticsearch.search.SearchService.lambda$executeQueryPhase$1(SearchService.java:343)
       app//org.elasticsearch.search.SearchService$$Lambda$5106/0x0000000801a07440.apply(Unknown Source)
       app//org.elasticsearch.action.ActionListener.lambda$map$2(ActionListener.java:146)
       app//org.elasticsearch.action.ActionListener$$Lambda$4026/0x000000080173bc40.accept(Unknown Source)
       app//org.elasticsearch.action.ActionListener$1.onResponse(ActionListener.java:63)
       app//org.elasticsearch.action.ActionRunnable.lambda$supply$0(ActionRunnable.java:58)
       app//org.elasticsearch.action.ActionRunnable$$Lambda$5116/0x0000000801a04c40.accept(Unknown Source)
       app//org.elasticsearch.action.ActionRunnable$2.doRun(ActionRunnable.java:73)
       app//org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37)
       app//org.elasticsearch.common.util.concurrent.TimedRunnable.doRun(TimedRunnable.java:44)
           ...
       java.base@13.0.2/java.lang.Thread.run(Thread.java:830)

This isn't exactly my area of expertise, but I see mention of FieldData in that stack trace which suggests you're trying to sort or aggregate on a text field which can be very expensive and is often the wrong thing to do. Do you know what the offending query is and where it's coming from?

Hi guys!

I've been troubleshooting a similar issue myself.
Previously I was running a 7.5.1 cluster with no issues, however after upgrading to 7.6.0 cpu usage seems to have increased dramatically on the worker nodes.

Day before the upgrade ( es 7.5.1 )

Day of the upgrade (the blank spot is during the upgrade)

Day after the upgrade

You can see immediately after the upgrade CPU usage averaged higher.

I didn't know about the /_nodes/hot_threads api until just now, next time I see a spike I'll capture the output from it.

Cody

Sure thing @cpmoore. When you have more details it's probably best to open your own thread on the topic rather than adding to this one, since it might be something completely different.

I don't seem to notice anything similar across my upgrade to 7.6.0.

Not quite sure, but I believe that it was a Lucene search on a keyword field process.name with a regexp:

NOT process.name: /AM_Delta_Patch.+/

but just retried this without issues, so...

Found a way to trigger the high cpu, if I enter a regexp search in SIEM before turning off KQL (like: host.name: /some-prefix.+/) then the cluster data nodes cpus spins in userland and usage reaches for the sky for quite some time :frowning:

Nice work. Any chance you can work out the exact query being sent to Elasticsearch?

Hm or maybe it's not...
Maybe it's just a question of SOC users' impatience — clicking too quickly through different SIEM views and thus generating too many outstanding queries running at the same time.

Just grabbed a random SIEM view's request:

{
  "aggregations": {
    "eventActionGroup": {
      "terms": {
        "field": "event.type",
        "include": [
          "authentication_success",
          "authentication_failure"
        ],
        "order": {
          "_count": "desc"
        },
        "size": 2
      },
      "aggs": {
        "events": {
          "date_histogram": {
            "field": "@timestamp",
            "fixed_interval": "2700000ms",
            "min_doc_count": 0,
            "extended_bounds": {
              "min": 1583398959591,
              "max": 1583485359592
            }
          }
        }
      }
    }
  },
  "query": {
    "bool": {
      "filter": [
        {
          "bool": {
            "must": [],
            "filter": [
              {
                "bool": {
                  "filter": [
                    {
                      "bool": {
                        "should": [
                          {
                            "match": {
                              "host.name": "/abcdefg.+/"
                            }
                          }
                        ],
                        "minimum_should_match": 1
                      }
                    },
                    {
                      "bool": {
                        "should": [
                          {
                            "match_phrase": {
                              "event.type": "authentication_failure"
                            }
                          }
                        ],
                        "minimum_should_match": 1
                      }
                    }
                  ]
                }
              }
            ],
            "should": [],
            "must_not": []
          }
        },
        {
          "range": {
            "@timestamp": {
              "gte": 1583398959591,
              "lte": 1583485359592
            }
          }
        }
      ]
    }
  },
  "size": 0,
  "track_total_hits": true
}

Thanks. What's the mapping for the event.type field?

      "type": {
        "ignore_above": 1024,
        "type": "keyword"
      }

I've had a quick chat with some colleagues who know more about this than I, and they don't think this search corresponds with the stack trace you shared above. The hot thread looks like it's running a terms aggregation with execution_hint: map, but that's not the case with this search. Do you see any other searches using execution_hint?