Hi, we are running a 4-node ES cluster. Occasionally an Elasticsearch node goes into frequent GC and starts crash looping for a while; it appears to be crash looping because of an OOM. This happens randomly and fairly infrequently.
- Is there a way to confirm that ES is crashing because of an OOM? There is no OOM error message in the ES logs as such.
- Can we monitor something or enable more logging for this? (A rough idea is sketched right after this list.)
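What we had in mind, as a starting point: pass the JVM's own `-XX:+HeapDumpOnOutOfMemoryError` and GC logging flags (`-Xloggc`, `-XX:+PrintGCDetails`) so the crash itself leaves evidence, and in parallel poll `_nodes/stats` so there is heap/GC history around the time a node starts crash looping. Below is a minimal polling sketch only, assuming ES is reachable on localhost:9200 without authentication; the URL and interval are placeholders.

```python
# Hypothetical polling sketch: periodically record per-node JVM heap and GC
# stats from the _nodes/stats API so there is history when a node starts
# crash looping. Assumes ES answers on localhost:9200 without auth.
import json
import time
import urllib.request

ES_URL = "http://localhost:9200/_nodes/stats/jvm"  # adjust host/port as needed
INTERVAL_SECONDS = 30

while True:
    with urllib.request.urlopen(ES_URL) as resp:
        stats = json.load(resp)
    ts = time.strftime("%Y-%m-%dT%H:%M:%S")
    for node_id, node in stats["nodes"].items():
        jvm = node["jvm"]
        heap_pct = jvm["mem"]["heap_used_percent"]
        old_gc = jvm["gc"]["collectors"]["old"]
        print(f"{ts} {node.get('name', node_id)} "
              f"heap_used={heap_pct}% "
              f"old_gc_count={old_gc['collection_count']} "
              f"old_gc_time_ms={old_gc['collection_time_in_millis']}")
    time.sleep(INTERVAL_SECONDS)
```

Is this the right direction, or is there something built into ES we should be using instead?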
Adding some stats below. Please note that these stats were collected under normal conditions, since we do not know when the issue will happen.
ES_HEAP is set to 1G.
In top, the virtual memory used by the ES process is 15G. Resident memory looks fine. Can we check why the virtual memory is so high?
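To dig into the virtual size ourselves, we were thinking of grouping the process's memory mappings by backing file; the buffer_pools.mapped figure in the stats below (~1 GB) suggests mmapped index files, and mappings only take address space, not RAM. A minimal Linux-only sketch (script name and invocation are arbitrary, the pid is passed as an argument):

```python
# Hypothetical sketch: group the ES process's virtual memory mappings by
# backing file to see what makes up the large virtual size reported by top.
# Mapped index files (mmapfs) and the Java heap usually dominate; these are
# address-space reservations, not resident memory.
import collections
import sys

ES_PID = int(sys.argv[1])  # e.g. python maps_summary.py <pid>

totals = collections.Counter()
with open(f"/proc/{ES_PID}/maps") as maps:
    for line in maps:
        fields = line.split()
        start, end = (int(x, 16) for x in fields[0].split("-"))
        name = fields[5] if len(fields) > 5 else "[anon]"
        totals[name] += end - start

for name, size in totals.most_common(20):
    print(f"{size / (1024 ** 2):10.1f} MiB  {name}")
```

Would something like this (or just pmap -x on the pid) be a reasonable way to account for the 15G?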
Thanks.
Output of "_nodes/stats" API on one of the nodes:
"6LlL-senRSSPPzRF-uWzfw" : {
"indices" : {
"docs" : {
"count" : 18760592,
"deleted" : 7174232
},
"store" : {
"size_in_bytes" : 4074668631,
"throttle_time_in_millis" : 0
},
"indexing" : {
"index_total" : 5254,
"index_time_in_millis" : 1613096,
"index_current" : 0,
"delete_total" : 5011,
"delete_time_in_millis" : 1241551,
"delete_current" : 0,
"noop_update_total" : 0,
"is_throttled" : false,
"throttle_time_in_millis" : 0
},
"merges" : {
"current" : 0,
"current_docs" : 0,
"current_size_in_bytes" : 0,
"total" : 54,
"total_time_in_millis" : 64200,
"total_docs" : 305024,
"total_size_in_bytes" : 61877981,
"total_stopped_time_in_millis" : 0,
"total_throttled_time_in_millis" : 0,
"total_auto_throttle_in_bytes" : 314572800
},
"refresh" : {
"total" : 1857,
"total_time_in_millis" : 447633
},
"flush" : {
"total" : 417,
"total_time_in_millis" : 557973
},
"fielddata" : {
"memory_size_in_bytes" : 0,
"evictions" : 0
},
"percolate" : {
"total" : 0,
"time_in_millis" : 0,
"current" : 0,
"memory_size_in_bytes" : -1,
"memory_size" : "-1b",
"queries" : 0
},
"completion" : {
"size_in_bytes" : 0
},
"segments" : {
"count" : 202,
"memory_in_bytes" : 19379660,
"terms_memory_in_bytes" : 16196400,
"stored_fields_memory_in_bytes" : 3156496,
"term_vectors_memory_in_bytes" : 0,
"norms_memory_in_bytes" : 8160,
"doc_values_memory_in_bytes" : 18604,
"index_writer_memory_in_bytes" : 0,
"index_writer_max_memory_in_bytes" : 7680000,
"version_map_memory_in_bytes" : 0,
"fixed_bit_set_memory_in_bytes" : 0
},
"os" : {
"timestamp" : 1470821759940,
"load_average" : 41.86,
"mem" : {
"total_in_bytes" : 67278057472,
"free_in_bytes" : 1224552448,
"used_in_bytes" : 66053505024,
"free_percent" : 2,
"used_percent" : 98
},
"swap" : {
"total_in_bytes" : 0,
"free_in_bytes" : 0,
"used_in_bytes" : 0
}
},
"process" : {
"timestamp" : 1470821759941,
"open_file_descriptors" : 871,
"max_file_descriptors" : 8192,
"cpu" : {
"percent" : 0,
"total_in_millis" : 568010
},
"mem" : {
"total_virtual_in_bytes" : 16815521792
}
},
"jvm" : {
"timestamp" : 1470821759944,
"uptime_in_millis" : 36490754,
"mem" : {
"heap_used_in_bytes" : 383695600,
"heap_used_percent" : 36,
"heap_committed_in_bytes" : 1037959168,
"heap_max_in_bytes" : 1037959168,
"non_heap_used_in_bytes" : 79842504,
"non_heap_committed_in_bytes" : 81248256,
"pools" : {
"young" : {
"used_in_bytes" : 284070776,
"max_in_bytes" : 286326784,
"peak_used_in_bytes" : 286326784,
"peak_max_in_bytes" : 286326784
},
"survivor" : {
"used_in_bytes" : 17477200,
"max_in_bytes" : 35782656,
"peak_used_in_bytes" : 29910688,
"peak_max_in_bytes" : 35782656
},
"old" : {
"used_in_bytes" : 82147624,
"max_in_bytes" : 715849728,
"peak_used_in_bytes" : 82147624,
"peak_max_in_bytes" : 715849728
}
}
},
"threads" : {
"count" : 227,
"peak_count" : 237
},
"gc" : {
"collectors" : {
"young" : {
"collection_count" : 35,
"collection_time_in_millis" : 1008
},
"old" : {
"collection_count" : 1,
"collection_time_in_millis" : 98
}
}
},
"buffer_pools" : {
"direct" : {
"count" : 451,
"used_in_bytes" : 62089757,
"total_capacity_in_bytes" : 62089757
},
"mapped" : {
"count" : 52,
"used_in_bytes" : 1116108764,
"total_capacity_in_bytes" : 1116108764
}
}
},