JVM error in Elasticsearch log causes failure in application

Hi all,
The Elasticsearch application failed due to JVM errors. I tried to fix that by changing esJavaOpts: "-Xmx8g -Xms8g" in the YAML values file, but unfortunately this did not help. In general the app is based on 3 servers with 8 CPUs and 32 GB of RAM each, plus a big NFS storage.
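For context, the heap setting sits in the chart values roughly like this (a minimal sketch assuming the official elastic/elasticsearch Helm chart; the resources block is only an illustrative pairing to show that the heap has to fit inside the container memory limit, not my exact values):

# heap for the Elasticsearch JVM (should be well below the container memory limit)
esJavaOpts: "-Xmx8g -Xms8g"
# illustrative container resources; the 8g heap needs headroom inside this limit
resources:
  requests:
    memory: "16Gi"
  limits:
    memory: "16Gi"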
Please assist.

{"type": "server", "timestamp": "2021-07-19T18:48:05,588Z", "level": "INFO", "component": "o.e.m.j.JvmGcMonitorService", "cluster.name": "elasticsearch", "node.name": "elasticsearch-master-0", "message": "[gc][23103] overhead, spent [525ms] collecting in the last [1s]", "cluster.uuid": "JU99REOhRxa8BEoy2Ib1Gg", "node.id": "cuF0wFv2T1a_g64EBFvgPw"  }
{"type": "server", "timestamp": "2021-07-19T18:48:45,195Z", "level": "INFO", "component": "o.e.m.j.JvmGcMonitorService", "cluster.name": "elasticsearch", "node.name": "elasticsearch-master-0", "message": "[gc][23141] overhead, spent [287ms] collecting in the last [1s]", "cluster.uuid": "JU99REOhRxa8BEoy2Ib1Gg", "node.id": "cuF0wFv2T1a_g64EBFvgPw"  }
{"type": "server", "timestamp": "2021-07-19T18:49:18,185Z", "level": "INFO", "component": "o.e.m.j.JvmGcMonitorService", "cluster.name": "elasticsearch", "node.name": "elasticsearch-master-0", "message": "[gc][23173] overhead, spent [271ms] collecting in the last [1s]", "cluster.uuid": "JU99REOhRxa8BEoy2Ib1Gg", "node.id": "cuF0wFv2T1a_g64EBFvgPw"  }
{"type": "server", "timestamp": "2021-07-19T18:49:56,697Z", "level": "INFO", "component": "o.e.m.j.JvmGcMonitorService", "cluster.name": "elasticsearch", "node.name": "elasticsearch-master-0", "message": "[gc][23211] overhead, spent [360ms] collecting in the last [1s]", "cluster.uuid": "JU99REOhRxa8BEoy2Ib1Gg", "node.id": "cuF0wFv2T1a_g64EBFvgPw"  }
{"type": "server", "timestamp": "2021-07-19T18:50:26,002Z", "level": "INFO", "component": "o.e.m.j.JvmGcMonitorService", "cluster.name": "elasticsearch", "node.name": "elasticsearch-master-0", "message": "[gc][23240] overhead, spent [374ms] collecting in the last [1.1s]", "cluster.uuid": "JU99REOhRxa8BEoy2Ib1Gg", "node.id": "cuF0wFv2T1a_g64EBFvgPw"  }
{"type": "server", "timestamp": "2021-07-19T18:50:53,397Z", "level": "WARN", "component": "o.e.m.j.JvmGcMonitorService", "cluster.name": "elasticsearch", "node.name": "elasticsearch-master-0", "message": "[gc][23267] overhead, spent [563ms] collecting in the last [1s]", "cluster.uuid": "JU99REOhRxa8BEoy2Ib1Gg", "node.id": "cuF0wFv2T1a_g64EBFvgPw"  }
{"type": "server", "timestamp": "2021-07-19T18:53:48,590Z", "level": "INFO", "component": "o.e.m.j.JvmGcMonitorService", "cluster.name": "elasticsearch", "node.name": "elasticsearch-master-0", "message": "[gc][23439] overhead, spent [292ms] collecting in the last [1s]", "cluster.uuid": "JU99REOhRxa8BEoy2Ib1Gg", "node.id": "cuF0wFv2T1a_g64EBFvgPw"  }
{"type": "server", "timestamp": "2021-07-19T18:54:26,595Z", "level": "INFO", "component": "o.e.m.j.JvmGcMonitorService", "cluster.name": "elasticsearch", "node.name": "elasticsearch-master-0", "message": "[gc][23476] overhead, spent [315ms] collecting in the last [1s]", "cluster.uuid": "JU99REOhRxa8BEoy2Ib1Gg", "node.id": "cuF0wFv2T1a_g64EBFvgPw"  }
{"type": "server", "timestamp": "2021-07-19T18:55:03,390Z", "level": "INFO", "component": "o.e.m.j.JvmGcMonitorService", "cluster.name": "elasticsearch", "node.name": "elasticsearch-master-0", "message": "[gc][23512] overhead, spent [258ms] collecting in the last [1s]", "cluster.uuid": "JU99REOhRxa8BEoy2Ib1Gg", "node.id": "cuF0wFv2T1a_g64EBFvgPw"  }
{"type": "server", "timestamp": "2021-07-19T18:56:08,988Z", "level": "INFO", "component": "o.e.m.j.JvmGcMonitorService", "cluster.name": "elasticsearch", "node.name": "elasticsearch-master-0", "message": "[gc][23577] overhead, spent [467ms] collecting in the last [1.2s]", "cluster.uuid": "JU99REOhRxa8BEoy2Ib1Gg", "node.id": "cuF0wFv2T1a_g64EBFvgPw"  }
{"type": "server", "timestamp": "2021-07-19T18:58:05,237Z", "level": "INFO", "component": "o.e.c.s.ClusterApplierService", "cluster.name": "elasticsearch", "node.name": "elasticsearch-master-0", "message": "added {{elasticsearch-master-2}{fNZ2A7YhSK6Xj1BdrhYzbg}{S2SwXydVSOmI_G3ubnQFhQ}{10.42.0.166}{10.42.0.166:9300}{cdfhilmrstw}{ml.machine_memory=25769803776, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=2147483648, transform.node=true}}, term: 160, version: 174966, reason: ApplyCommitRequest{term=160, version=174966, sourceNode={elasticsearch-master-1}{jOBr_EBpTKCP-O9qPBAQCg}{TY8G2-_4QTCZ02d2ne7tug}{10.42.1.243}{10.42.1.243:9300}{cdfhilmrstw}{ml.machine_memory=10737418240, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=8589934592, transform.node=true}}", "cluster.uuid": "JU99REOhRxa8BEoy2Ib1Gg", "node.id": "cuF0wFv2T1a_g64EBFvgPw"  }
{"type": "server", "timestamp": "2021-07-19T18:58:40,140Z", "level": "WARN", "component": "o.e.t.TransportService", "cluster.name": "elasticsearch", "node.name": "elasticsearch-master-0", "message": "Received response for a request that has timed out, sent [18.6s/18694ms] ago, timed out [8.8s/8815ms] ago, action [internal:coordination/fault_detection/leader_check], node [{elasticsearch-master-1}{jOBr_EBpTKCP-O9qPBAQCg}{TY8G2-_4QTCZ02d2ne7tug}{10.42.1.243}{10.42.1.243:9300}{cdfhilmrstw}{ml.machine_memory=10737418240, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=8589934592, transform.node=true}], id [5306751]", "cluster.uuid": "JU99REOhRxa8BEoy2Ib1Gg", "node.id": "cuF0wFv2T1a_g64EBFvgPw"  }
{"type": "server", "timestamp": "2021-07-19T18:59:01,966Z", "level": "INFO", "component": "o.e.m.j.JvmGcMonitorService", "cluster.name": "elasticsearch", "node.name": "elasticsearch-master-0", "message": "[gc][23749] overhead, spent [262ms] collecting in the last [1s]", "cluster.uuid": "JU99REOhRxa8BEoy2Ib1Gg", "node.id": "cuF0wFv2T1a_g64EBFvgPw"  }
{"type": "server", "timestamp": "2021-07-19T18:59:26,036Z", "level": "INFO", "component": "o.e.c.c.Coordinator", "cluster.name": "elasticsearch", "node.name": "elasticsearch-master-0", "message": "master node [{elasticsearch-master-1}{jOBr_EBpTKCP-O9qPBAQCg}{TY8G2-_4QTCZ02d2ne7tug}{10.42.1.243}{10.42.1.243:9300}{cdfhilmrstw}{ml.machine_memory=10737418240, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=8589934592, transform.node=true}] failed, restarting discovery", "cluster.uuid": "JU99REOhRxa8BEoy2Ib1Gg", "node.id": "cuF0wFv2T1a_g64EBFvgPw" ,
"stacktrace": ["org.elasticsearch.ElasticsearchException: node [{elasticsearch-master-1}{jOBr_EBpTKCP-O9qPBAQCg}{TY8G2-_4QTCZ02d2ne7tug}{10.42.1.243}{10.42.1.243:9300}{cdfhilmrstw}{ml.machine_memory=10737418240, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=8589934592, transform.node=true}] failed [3] consecutive checks",
"at org.elasticsearch.cluster.coordination.LeaderChecker$CheckScheduler$1.handleException(LeaderChecker.java:275) ~[elasticsearch-7.12.1.jar:7.12.1]",
"at org.elasticsearch.transport.TransportService$ContextRestoreResponseHandler.handleException(TransportService.java:1290) ~[elasticsearch-7.12.1.jar:7.12.1]",
"at org.elasticsearch.transport.TransportService$ContextRestoreResponseHandler.handleException(TransportService.java:1290) ~[elasticsearch-7.12.1.jar:7.12.1]",
"at org.elasticsearch.transport.InboundHandler.lambda$handleException$3(InboundHandler.java:317) ~[elasticsearch-7.12.1.jar:7.12.1]",
"at org.elasticsearch.common.util.concurrent.EsExecutors$DirectExecutorService.execute(EsExecutors.java:215) ~[elasticsearch-7.12.1.jar:7.12.1]",
"at org.elasticsearch.transport.InboundHandler.handleException(InboundHandler.java:315) ~[elasticsearch-7.12.1.jar:7.12.1]",
"at org.elasticsearch.transport.InboundHandler.handlerResponseError(InboundHandler.java:307) ~[elasticsearch-7.12.1.jar:7.12.1]",
"at org.elasticsearch.transport.InboundHandler.messageReceived(InboundHandler.java:126) ~[elasticsearch-7.12.1.jar:7.12.1]",
"at org.elasticsearch.transport.InboundHandler.inboundMessage(InboundHandler.java:84) ~[elasticsearch-7.12.1.jar:7.12.1]",
"at org.elasticsearch.transport.TcpTransport.inboundMessage(TcpTransport.java:690) ~[elasticsearch-7.12.1.jar:7.12.1]",
"at org.elasticsearch.transport.InboundPipeline.forwardFragments(InboundPipeline.java:131) ~[elasticsearch-7.12.1.jar:7.12.1]",
"at org.elasticsearch.transport.InboundPipeline.doHandleBytes(InboundPipeline.java:106) ~[elasticsearch-7.12.1.jar:7.12.1]",
"at org.elasticsearch.transport.InboundPipeline.handleBytes(InboundPipeline.java:71) ~[elasticsearch-7.12.1.jar:7.12.1]",
"at org.elasticsearch.transport.netty4.Netty4MessageChannelHandler.channelRead(Netty4MessageChannelHandler.java:63) ~[?:?]",
"at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) ~[?:?]",
"at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) ~[?:?]",
"at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357) ~[?:?]",
"at io.netty.handler.logging.LoggingHandler.channelRead(LoggingHandler.java:271) ~[?:?]",
"at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) ~[?:?]",
"at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) ~[?:?]",
"at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357) ~[?:?]",
"at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1410) ~[?:?]",
"at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) ~[?:?]",
"at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) ~[?:?]",
"at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:919) ~[?:?]",
"at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:163) ~[?:?]",
"at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:714) ~[?:?]",
"at io.netty.channel.nio.NioEventLoop.processSelectedKeysPlain(NioEventLoop.java:615) ~[?:?]",
"at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:578) ~[?:?]",
"at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:493) ~[?:?]",
"at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:989) ~[?:?]",
"at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) ~[?:?]",
"at java.lang.Thread.run(Thread.java:831) [?:?]",

Can you explain how you linked the error message to the heap and why you changed the heap size in response?

Also, can you share the full exception in a gist?

It is not clear to me what the root cause of the exception is. I only guessed that it is related to heap size because, right before the Java exception, I saw the rows of JVM GC messages. I opened one of the container logs and this is what it showed. I have not been able to keep my application stable for more than two months: it started with a shards exception, then memory issues, then strange messages I cannot understand, like the one I describe here, and I need your expertise and knowledge.
Thanks

Please try to collect your logs so we can see the full stack trace for that last message.

In general you are right: the message about extended GC collections indicates that a lot of time is being spent in garbage collection.

The main question here is: why does your system need all that heap? Do you have a lot of shards? Are you running big queries with lots of aggregations?
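To get a quick picture of both, a couple of cat API calls help (a sketch, assuming the HTTP API is reachable on port 9200; the host is a placeholder):

curl -s 'IP-OF-ELASTIC-SERVER:9200/_cat/nodes?v&h=name,heap.percent,heap.max,ram.max'   # actual heap size and usage per node
curl -s 'IP-OF-ELASTIC-SERVER:9200/_cat/shards?v' | wc -l                               # rough total shard count
curl -s 'IP-OF-ELASTIC-SERVER:9200/_cat/indices?v'                                      # indices with shard counts and sizes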

I am using python-logstash-async as a log handler in our Python infrastructure.
Each log call in a user's script sends a structured message which contains vital info: user, machine, type, line, interpreter, the message text, and other extra fields added by users.
For example, the user just adds logger.info("Hii") in his code, and all of the metadata I mentioned is sent automatically to Logstash. A single run of a script can send hundreds of such logs, which land in one index according to the filtering in my Logstash configuration. This app serves something like 150 users, all of whom make daily runs that accumulate around 5 GB of data.
So an index can be very big. This system serves both development and production.
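Roughly, the handler setup on the client side looks like this (a simplified sketch based on the python-logstash-async docs, not our exact in-house wrapper; host, port, and extra fields are placeholders):

import logging
from logstash_async.handler import AsynchronousLogstashHandler

logger = logging.getLogger("qa-logger")
logger.setLevel(logging.INFO)
# host/port are placeholders; database_path is the local buffer file used while Logstash is unreachable
logger.addHandler(AsynchronousLogstashHandler("logstash-host", 5045, database_path="logstash.db"))

# extra fields end up as structured fields in the Logstash event
logger.info("Hii", extra={"user": "someuser", "machine": "host01", "interpreter": "python3.8"})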
Regarding your question: I am running DSL queries through the Elasticsearch client, but I do not think the queries are the root cause. Almost a month after starting everything from scratch, I got a message related to shards, and it actually broke the Elasticsearch containers. I am pretty sure this is not the correct solution, but what I did was:

curl -XPUT -H 'Content-Type: application/json' 'IP-OF-ELASTIC-SERVER:9200/_cluster/settings' -d '{ "persistent" : {"cluster.max_shards_per_node" : 5000}}'

and this resolved the issue.
I know the number of shards should be sized according to this: Managing and troubleshooting Elasticsearch memory | Elastic Blog

But I never configured shards anywhere. Then, after roughly a month, the application started to crash with the messages I showed at the beginning of the thread, and I am lost and do not know how to progress and resolve this.

Thanks a lot, and sorry for laying out my whole case here.

Why do you have so many shards? In general you should try to reduce the number of shards, which will also reduce your memory requirements. Is there any chance you can reduce the shard count? That would possibly offer a quick win...
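For example, if old indices are no longer needed you can delete them, and dropping replicas on rarely-queried indices halves their shard count (the index patterns below are placeholders, adjust them to your naming):

curl -XDELETE 'IP-OF-ELASTIC-SERVER:9200/old-index-*'
curl -XPUT -H 'Content-Type: application/json' 'IP-OF-ELASTIC-SERVER:9200/old-index-*/_settings' -d '{"index": {"number_of_replicas": 0}}'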

7/28/2021 6:09:42 PM [2021-07-28T15:09:42,731][WARN ][logstash.outputs.elasticsearch][main] Could not index event to Elasticsearch. {:status=>400, :action=>["index", {:_id=>nil, :_index=>"qascreencapture-1627484927937893928", :routing=>nil, :_type=>"_doc"}, #<LogStash::Event:0x34f50dc7>], :response=>{"index"=>{"_index"=>"qascreencapture-1627484927937893928", "_type"=>"_doc", "_id"=>nil, "status"=>400, "error"=>{"type"=>"validation_exception", "reason"=>"Validation Failed: 1: this action would add [2] total shards, but this cluster currently has [3000]/[3000] maximum shards open;"}}}}

This is what I am talking about. I found this in my Logstash container log. So no new indices are created because of this error. Again, I did not configure shards at all. Where can I do that?

curl http://xxxxxx/_cluster/health?pretty
{
  "cluster_name" : "elasticsearch",
  "status" : "green",
  "timed_out" : false,
  "number_of_nodes" : 3,
  "number_of_data_nodes" : 3,
  "active_primary_shards" : 1501,
  "active_shards" : 3002,
  "relocating_shards" : 0,
  "initializing_shards" : 0,
  "unassigned_shards" : 0,
  "delayed_unassigned_shards" : 0,
  "number_of_pending_tasks" : 0,
  "number_of_in_flight_fetch" : 0,
  "task_max_waiting_in_queue_millis" : 0,
  "active_shards_percent_as_number" : 100.0
}

Which Elasticsearch version are you using? How do you index data into Elasticsearch? Via Logstash only?

I recently upgraded to 7.12.1. Before that I worked with 7.6.1.
Yes, only with Logstash.

So, some configuration within Logstash is creating all those shards. How many primary shards per index are you using? How many indices do you have? Did you configure an index template to set this to something different from the default of one primary shard per index?
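For reference, this is roughly what such an index template looks like on 7.12 (a sketch; the template name and index pattern are placeholders, adjust them to your index naming):

curl -XPUT -H 'Content-Type: application/json' 'IP-OF-ELASTIC-SERVER:9200/_index_template/qa-logs' -d '
{
  "index_patterns": ["qa*"],
  "template": {
    "settings": {
      "number_of_shards": 1,
      "number_of_replicas": 1
    }
  }
}'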

No, actually I did not take any of that into account. I used a Logstash Helm chart and left it as is. Sharing its content with you:

replicaCount: 3
image:
  repository: docker.elastic.co/logstash/logstash-oss
  tag: 7.6.1
  pullPolicy: IfNotPresent
service:
  type: NodePort
  ports:
    qa-tcp:
      port: 5045
      targetPort: qa-tcp
      protocol: TCP
    qa-udp:
      port: 5046
      targetPort: qa-udp
      protocol: UDP
    qascreencapture:
      port: 5046
      targetPort: qascreencapture
      protocol: TCP
ports:
    - containerPort: 5045
      name: qa-tcp
      protocol: TCP
    - name: qa-udp
      containerPort: 5045
      protocol: UDP
    - name: qascreencapture
      containerPort: 5046
      protocol: TCP
elasticsearch:
  host: elasticsearch-master
  port: 9200
  
ingress:
  enabled: false
  annotations: {}
    # kubernetes.io/ingress.class: nginx
    # kubernetes.io/tls-acme: "true"
  path: /
  hosts:
    - qa-logstash.labs.mlnx
  tls: []
  #  - secretName: logstash-tls
  #    hosts:
  #      - logstash.cluster.local
config:
  config.reload.automatic: "true"
  path.config: /usr/share/logstash/pipeline
  path.data: /usr/share/logstash/data
  queue.drain: "true"
  queue.max_bytes: 1gb
  queue.type: persisted
  log.level: info
persistence:
    enabled: true
    accessMode: ReadWriteOnce
    name: data
    size: 2Gi
# set java options like heap size
logstashJavaOpts: "-Xmx1g -Xms1g"
volumeMounts:
  - name: pipelines
    mountPath: /usr/share/logstash/pipeline
volumes:   
- name: pipelines
  configMap:
    name: logstash-conf
livenessProbe:
  httpGet:
    path: /
    port: monitor
  initialDelaySeconds: 90
readinessProbe:
  httpGet:
    path: /
    port: monitor
  initialDelaySeconds: 90
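
The actual pipeline lives in the logstash-conf ConfigMap referenced above (not included here); its elasticsearch output is roughly of this shape (a simplified, illustrative sketch, not my exact filter or index pattern):

output {
  elasticsearch {
    hosts => ["elasticsearch-master:9200"]
    # the index option is what decides how many indices (and thus shards) get created
    index => "qa-logs-%{+YYYY.MM.dd}"
  }
}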

Where do I configure that index template? I have seen this term only on the Kibana side. Is that the correct place to do it?

10x,
Dawood
