I am trying to understand what is causing high heap usage. I have checked hot threads, nodes stats, fielddata, and the query cache, but I am still unable to identify the offending factor. How do you usually profile this to find the root cause and fix the underlying problem?
We use Telegraf / StatsD to collect metrics and Grafana to monitor the graphs (we don't use Marvel / X-Pack).
Each node has a 15GB JVM heap, 15GB for Lucene, and 16 cores. The ES version is 5.3.2. The following are the OS, process, and JVM stats:
"os": {
"timestamp": 1509721239611,
"cpu": {
"percent": 6,
"load_average": {
"1m": 1.6,
"5m": 1.91,
"15m": 1.94
}
},
"mem": {
"total": "62.6gb",
"total_in_bytes": 67269812224,
"free": "4.6gb",
"free_in_bytes": 4939411456,
"used": "58gb",
"used_in_bytes": 62330400768,
"free_percent": 7,
"used_percent": 93
},
"swap": {
"total": "3.9gb",
"total_in_bytes": 4294963200,
"free": "1.2gb",
"free_in_bytes": 1309278208,
"used": "2.7gb",
"used_in_bytes": 2985684992
}
},
"process": {
"timestamp": 1509721239611,
"open_file_descriptors": 2184,
"max_file_descriptors": 65536,
"cpu": {
"percent": 5,
"total": "95.4d",
"total_in_millis": 8249400410
},
"mem": {
"total_virtual": "472.1gb",
"total_virtual_in_bytes": 506926440448
}
},
"jvm": {
"timestamp": 1509721239614,
"uptime": "175.5d",
"uptime_in_millis": 15170005853,
"mem": {
"heap_used": "12.2gb",
"heap_used_in_bytes": 13175585808,
"heap_used_percent": 82,
"heap_committed": "14.8gb",
"heap_committed_in_bytes": 15905521664,
"heap_max": "14.8gb",
"heap_max_in_bytes": 15905521664,
"non_heap_used": "187.3mb",
"non_heap_used_in_bytes": 196436816,
"non_heap_committed": "205mb",
"non_heap_committed_in_bytes": 215044096,
"pools": {
"young": {
"used": "1.2gb",
"used_in_bytes": 1310043488,
"max": "1.4gb",
"max_in_bytes": 1605304320,
"peak_used": "1.4gb",
"peak_used_in_bytes": 1605304320,
"peak_max": "1.4gb",
"peak_max_in_bytes": 1605304320
},
"survivor": {
"used": "86.6mb",
"used_in_bytes": 90896736,
"max": "191.3mb",
"max_in_bytes": 200605696,
"peak_used": "191.3mb",
"peak_used_in_bytes": 200605696,
"peak_max": "191.3mb",
"peak_max_in_bytes": 200605696
},
"old": {
"used": "10.9gb",
"used_in_bytes": 11774645584,
"max": "13.1gb",
"max_in_bytes": 14099611648,
"peak_used": "13.1gb",
"peak_used_in_bytes": 14097168752,
"peak_max": "13.1gb",
"peak_max_in_bytes": 14099611648
}
}
},
"threads": {
"count": 361,
"peak_count": 420
},
"gc": {
"collectors": {
"young": {
"collection_count": 1012422,
"collection_time": "16.5h",
"collection_time_in_millis": 59685110
},
"old": {
"collection_count": 150100,
"collection_time": "6.5h",
"collection_time_in_millis": 23539944
}
}
},
"buffer_pools": {
"direct": {
"count": 355,
"used": "1gb",
"used_in_bytes": 1087356244,
"total_capacity": "1gb",
"total_capacity_in_bytes": 1087356243
},
"mapped": {
"count": 17817,
"used": "436gb",
"used_in_bytes": 468180363093,
"total_capacity": "436gb",
"total_capacity_in_bytes": 468180363093
}
},
"classes": {
"current_loaded_count": 14054,
"total_loaded_count": 34153,
"total_unloaded_count": 20099
}
},