Elastic node crashing with JVM dump file error generated after a while

Hello everyone,

I've started to notice very recently that some of our ElasticSearch services crash all of a sudden and they generate the attached dumpfile. Gist here.

Please help as our nodes are currently restarting very often, causing a lot of rebalancing and recovery work throughout the cluster.

Appreciate the inputs,

Regards,

What is the full output of the _cluster/stats API?

Here it is:

{
  "_nodes": {
    "total": 22,
    "successful": 22,
    "failed": 0
  },
  "cluster_name": "ibi.arm2",
  "timestamp": 1575040659711,
  "status": "green",
  "indices": {
    "count": 217,
    "shards": {
      "total": 2471,
      "primaries": 1162,
      "replication": 1.1265060240963856,
      "index": {
        "shards": {
          "min": 2,
          "max": 80,
          "avg": 11.387096774193548
        },
        "primaries": {
          "min": 1,
          "max": 40,
          "avg": 5.354838709677419
        },
        "replication": {
          "min": 0,
          "max": 3,
          "avg": 1.1382488479262673
        }
      }
    },
    "docs": {
      "count": 2795110408,
      "deleted": 6542688
    },
    "store": {
      "size_in_bytes": 13572465678303,
      "throttle_time_in_millis": 0
    },
    "fielddata": {
      "memory_size_in_bytes": 10262840,
      "evictions": 0
    },
    "query_cache": {
      "memory_size_in_bytes": 553293365,
      "total_count": 137503701,
      "hit_count": 13342864,
      "miss_count": 124160837,
      "cache_size": 33243,
      "cache_count": 92022,
      "evictions": 58779
    },
    "completion": {
      "size_in_bytes": 0
    },
    "segments": {
      "count": 34405,
      "memory_in_bytes": 23214144251,
      "terms_memory_in_bytes": 19836993696,
      "stored_fields_memory_in_bytes": 2490308256,
      "term_vectors_memory_in_bytes": 0,
      "norms_memory_in_bytes": 108472640,
      "points_memory_in_bytes": 369890211,
      "doc_values_memory_in_bytes": 408479448,
      "index_writer_memory_in_bytes": 1085106396,
      "version_map_memory_in_bytes": 1265200,
      "fixed_bit_set_memory_in_bytes": 590424,
      "max_unsafe_auto_id_timestamp": 9223372036854776000,
      "file_sizes": {}
    }
  },
  "nodes": {
    "count": {
      "total": 22,
      "data": 12,
      "coordinating_only": 0,
      "master": 5,
      "ingest": 5
    },
    "versions": [
      "5.6.2"
    ],
    "os": {
      "available_processors": 300,
      "allocated_processors": 300,
      "names": [
        {
          "name": "Linux",
          "count": 22
        }
      ],
      "mem": {
        "total_in_bytes": 2228309393408,
        "free_in_bytes": 136024956928,
        "used_in_bytes": 2092284436480,
        "free_percent": 6,
        "used_percent": 94
      }
    },
    "process": {
      "cpu": {
        "percent": 31
      },
      "open_file_descriptors": {
        "min": 815,
        "max": 1657,
        "avg": 1150
      }
    },
    "jvm": {
      "max_uptime_in_millis": 10278692830,
      "versions": [
        {
          "version": "1.8.0_141",
          "vm_name": "Java HotSpot(TM) 64-Bit Server VM",
          "vm_version": "25.141-b15",
          "vm_vendor": "Oracle Corporation",
          "count": 22
        }
      ],
      "mem": {
        "heap_used_in_bytes": 178771880944,
        "heap_max_in_bytes": 470331883520
      },
      "threads": 3257
    },
    "fs": {
      "total_in_bytes": 40983954432000,
      "free_in_bytes": 26720416649216,
      "available_in_bytes": 24816660967424,
      "spins": "true"
    },
    "plugins": [
      {
        "name": "x-pack",
        "version": "5.6.2",
        "description": "Elasticsearch Expanded Pack Plugin",
        "classname": "org.elasticsearch.xpack.XPackPlugin",
        "has_native_controller": true
      }
    ],
    "network_types": {
      "transport_types": {
        "netty4": 22
      },
      "http_types": {
        "netty4": 22
      }
    }
  }
}

Also, I get this output from hot threads in that node

::: {arm-lc-003_data}{-uaOnGLXS6KOax77L-OgiQ}{5AIHs3Y0T-24yl9HxXTkeg}{icsl6922.lc.intel.com}{10.187.72.5:9300}{ibi_site=iil, box_type=hot}
   Hot threads at 2019-11-29T17:39:01.519Z, interval=500ms, busiestThreads=3, ignoreIdleThreads=true:
   
   73.3% (366.5ms out of 500ms) cpu usage by thread 'elasticsearch[arm-lc-003_data][[.monitoring-logstash-6-2019.11.29][2]: Lucene Merge Thread #1806]'
     9/10 snapshots sharing following 14 elements
       java.util.zip.Deflater.deflateBytes(Native Method)
       java.util.zip.Deflater.deflate(Deflater.java:444)
       java.util.zip.Deflater.deflate(Deflater.java:366)
       org.apache.lucene.codecs.compressing.CompressionMode$DeflateCompressor.compress(CompressionMode.java:275)
       org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.flush(CompressingStoredFieldsWriter.java:243)
       org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.finishDocument(CompressingStoredFieldsWriter.java:170)
       org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.merge(CompressingStoredFieldsWriter.java:646)
       org.apache.lucene.index.SegmentMerger.mergeFields(SegmentMerger.java:200)
       org.apache.lucene.index.SegmentMerger.merge(SegmentMerger.java:89)
       org.apache.lucene.index.IndexWriter.mergeMiddle(IndexWriter.java:4356)
       org.apache.lucene.index.IndexWriter.merge(IndexWriter.java:3931)
       org.apache.lucene.index.ConcurrentMergeScheduler.doMerge(ConcurrentMergeScheduler.java:624)
       org.elasticsearch.index.engine.ElasticsearchConcurrentMergeScheduler.doMerge(ElasticsearchConcurrentMergeScheduler.java:99)
       org.apache.lucene.index.ConcurrentMergeScheduler$MergeThread.run(ConcurrentMergeScheduler.java:661)
     unique snapshot
       sun.nio.ch.FileDispatcherImpl.write0(Native Method)
       sun.nio.ch.FileDispatcherImpl.write(FileDispatcherImpl.java:60)
       sun.nio.ch.IOUtil.writeFromNativeBuffer(IOUtil.java:93)
       sun.nio.ch.IOUtil.write(IOUtil.java:65)
       sun.nio.ch.FileChannelImpl.write(FileChannelImpl.java:211)
       java.nio.channels.Channels.writeFullyImpl(Channels.java:78)
       java.nio.channels.Channels.writeFully(Channels.java:101)
       java.nio.channels.Channels.access$000(Channels.java:61)
       java.nio.channels.Channels$1.write(Channels.java:174)
       org.apache.lucene.store.FSDirectory$FSIndexOutput$1.write(FSDirectory.java:419)
       java.util.zip.CheckedOutputStream.write(CheckedOutputStream.java:73)
       java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
       java.io.BufferedOutputStream.write(BufferedOutputStream.java:126)
       org.apache.lucene.store.OutputStreamIndexOutput.writeBytes(OutputStreamIndexOutput.java:53)
       org.apache.lucene.store.RateLimitedIndexOutput.writeBytes(RateLimitedIndexOutput.java:73)
       org.apache.lucene.store.DataOutput.writeBytes(DataOutput.java:52)
       org.apache.lucene.codecs.compressing.CompressionMode$DeflateCompressor.compress(CompressionMode.java:286)
       org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.flush(CompressingStoredFieldsWriter.java:243)
       org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.finishDocument(CompressingStoredFieldsWriter.java:170)
       org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.merge(CompressingStoredFieldsWriter.java:646)
       org.apache.lucene.index.SegmentMerger.mergeFields(SegmentMerger.java:200)
       org.apache.lucene.index.SegmentMerger.merge(SegmentMerger.java:89)
       org.apache.lucene.index.IndexWriter.mergeMiddle(IndexWriter.java:4356)
       org.apache.lucene.index.IndexWriter.merge(IndexWriter.java:3931)
       org.apache.lucene.index.ConcurrentMergeScheduler.doMerge(ConcurrentMergeScheduler.java:624)
       org.elasticsearch.index.engine.ElasticsearchConcurrentMergeScheduler.doMerge(ElasticsearchConcurrentMergeScheduler.java:99)
       org.apache.lucene.index.ConcurrentMergeScheduler$MergeThread.run(ConcurrentMergeScheduler.java:661)
   
    5.9% (29.6ms out of 500ms) cpu usage by thread 'elasticsearch[arm-lc-003_data][bulk][T#12]'
     10/10 snapshots sharing following 5 elements
       org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:638)
       org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37)
       java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
       java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
       java.lang.Thread.run(Thread.java:748)

And one of our bulk queues has 12 tasks stuck since 2 days ago (active)

arm-lc-003_data   bulk                    12  1786  1036789
arm-lc-003_data   fetch_shard_started      0     0        0
arm-lc-003_data   fetch_shard_store        0     0        0
arm-lc-003_data   flush                    4     0        0
arm-lc-003_data   force_merge              0     0        0
arm-lc-003_data   generic                  8     0        0
arm-lc-003_data   get                      0     0        0

What is the specification of the nodes suffering GC issues and heap pressure? How many indices and shards are you actively indexing into? What type of storage do you have on the data nodes?

This looks like #31165 - a SIGBUS is how the JVM reacts to certain kinds of IO problems (e.g. bad blocks on a drive). Your kernel logs should have more information, and the remedy is normally to replace the faulty hardware.

1 Like

I found the following output in kernel logs (it matches the times when the service crashed):

Nov 28 04:09:36 icsl6922 kernel: [15555110.537941] sd 0:0:0:0: [sda]  Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE
Nov 28 04:09:36 icsl6922 kernel: [15555110.537947] sd 0:0:0:0: [sda]  Sense Key : Medium Error [current]             
Nov 28 04:09:36 icsl6922 kernel: [15555110.537950] sd 0:0:0:0: [sda]  Add. Sense: Unrecovered read error             
Nov 28 04:09:36 icsl6922 kernel: [15555110.537954] sd 0:0:0:0: [sda] CDB: Read(10): 28 00 66 50 35 e0 00 00 18 00    
Nov 28 04:09:36 icsl6922 kernel: [15555110.537960] end_request: critical target error, dev sda, sector 1716532704    
Nov 28 04:09:36 icsl6922 kernel: [15555110.538226] sd 0:0:0:0: [sda]  Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE
Nov 28 04:09:36 icsl6922 kernel: [15555110.538229] sd 0:0:0:0: [sda]  Sense Key : Medium Error [current]             
Nov 28 04:09:36 icsl6922 kernel: [15555110.538232] sd 0:0:0:0: [sda]  Add. Sense: Unrecovered read error             
Nov 28 04:09:36 icsl6922 kernel: [15555110.538236] sd 0:0:0:0: [sda] CDB: Read(10): 28 00 66 50 35 e0 00 00 08 00    
Nov 28 04:09:36 icsl6922 kernel: [15555110.538241] end_request: critical target error, dev sda, sector 1716532704
…
Nov 30 13:51:33 icsl6922 kernel: [15762786.324343] sd 0:0:0:0: [sda]  Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE
Nov 30 13:51:33 icsl6922 kernel: [15762786.324348] sd 0:0:0:0: [sda]  Sense Key : Medium Error [current]
Nov 30 13:51:33 icsl6922 kernel: [15762786.324351] sd 0:0:0:0: [sda]  Add. Sense: Unrecovered read error
Nov 30 13:51:33 icsl6922 kernel: [15762786.324355] sd 0:0:0:0: [sda] CDB: Read(10): 28 00 4a 84 9f b8 00 00 18 00
Nov 30 13:51:33 icsl6922 kernel: [15762786.324361] end_request: critical target error, dev sda, sector 1250205624
Nov 30 13:51:33 icsl6922 kernel: [15762786.449916] sd 0:0:0:0: [sda]  Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE
Nov 30 13:51:33 icsl6922 kernel: [15762786.449927] sd 0:0:0:0: [sda]  Sense Key : Medium Error [current] 
Nov 30 13:51:33 icsl6922 kernel: [15762786.449934] sd 0:0:0:0: [sda]  Add. Sense: Unrecovered read error
Nov 30 13:51:33 icsl6922 kernel: [15762786.449942] sd 0:0:0:0: [sda] CDB: Read(10): 28 00 4a 84 9f b8 00 00 08 00
Nov 30 13:51:33 icsl6922 kernel: [15762786.449963] end_request: critical target error, dev sda, sector 1250205624

Yep, looks like you need a new disk.