How to break down heap usage?

I am trying to understand what is causing high heap usage. I have checked hot threads, node stats, fielddata, and the query cache, but I still cannot identify the offending factor. How do you usually profile this to find the root cause and fix the underlying problem?

We use Telegraf / StatsD to collect metrics and Grafana to monitor the graphs (we don't use Marvel / X-Pack).
Each node has a 15 GB JVM heap, 15 GB for Lucene, and 16 cores. The ES version is 5.3.2. The following are the OS, process, and JVM stats.

        "os": {
            "timestamp": 1509721239611,
            "cpu": {
                "percent": 6,
                "load_average": {
                    "1m": 1.6,
                    "5m": 1.91,
                    "15m": 1.94
                }
            },
            "mem": {
                "total": "62.6gb",
                "total_in_bytes": 67269812224,
                "free": "4.6gb",
                "free_in_bytes": 4939411456,
                "used": "58gb",
                "used_in_bytes": 62330400768,
                "free_percent": 7,
                "used_percent": 93
            },
            "swap": {
                "total": "3.9gb",
                "total_in_bytes": 4294963200,
                "free": "1.2gb",
                "free_in_bytes": 1309278208,
                "used": "2.7gb",
                "used_in_bytes": 2985684992
            }
        },
        "process": {
            "timestamp": 1509721239611,
            "open_file_descriptors": 2184,
            "max_file_descriptors": 65536,
            "cpu": {
                "percent": 5,
                "total": "95.4d",
                "total_in_millis": 8249400410
            },
            "mem": {
                "total_virtual": "472.1gb",
                "total_virtual_in_bytes": 506926440448
            }
        },
        "jvm": {
            "timestamp": 1509721239614,
            "uptime": "175.5d",
            "uptime_in_millis": 15170005853,
            "mem": {
                "heap_used": "12.2gb",
                "heap_used_in_bytes": 13175585808,
                "heap_used_percent": 82,
                "heap_committed": "14.8gb",
                "heap_committed_in_bytes": 15905521664,
                "heap_max": "14.8gb",
                "heap_max_in_bytes": 15905521664,
                "non_heap_used": "187.3mb",
                "non_heap_used_in_bytes": 196436816,
                "non_heap_committed": "205mb",
                "non_heap_committed_in_bytes": 215044096,
                "pools": {
                    "young": {
                        "used": "1.2gb",
                        "used_in_bytes": 1310043488,
                        "max": "1.4gb",
                        "max_in_bytes": 1605304320,
                        "peak_used": "1.4gb",
                        "peak_used_in_bytes": 1605304320,
                        "peak_max": "1.4gb",
                        "peak_max_in_bytes": 1605304320
                    },
                    "survivor": {
                        "used": "86.6mb",
                        "used_in_bytes": 90896736,
                        "max": "191.3mb",
                        "max_in_bytes": 200605696,
                        "peak_used": "191.3mb",
                        "peak_used_in_bytes": 200605696,
                        "peak_max": "191.3mb",
                        "peak_max_in_bytes": 200605696
                    },
                    "old": {
                        "used": "10.9gb",
                        "used_in_bytes": 11774645584,
                        "max": "13.1gb",
                        "max_in_bytes": 14099611648,
                        "peak_used": "13.1gb",
                        "peak_used_in_bytes": 14097168752,
                        "peak_max": "13.1gb",
                        "peak_max_in_bytes": 14099611648
                    }
                }
            },
            "threads": {
                "count": 361,
                "peak_count": 420
            },
            "gc": {
                "collectors": {
                    "young": {
                        "collection_count": 1012422,
                        "collection_time": "16.5h",
                        "collection_time_in_millis": 59685110
                    },
                    "old": {
                        "collection_count": 150100,
                        "collection_time": "6.5h",
                        "collection_time_in_millis": 23539944
                    }
                }
            },
            "buffer_pools": {
                "direct": {
                    "count": 355,
                    "used": "1gb",
                    "used_in_bytes": 1087356244,
                    "total_capacity": "1gb",
                    "total_capacity_in_bytes": 1087356243
                },
                "mapped": {
                    "count": 17817,
                    "used": "436gb",
                    "used_in_bytes": 468180363093,
                    "total_capacity": "436gb",
                    "total_capacity_in_bytes": 468180363093
                }
            },
            "classes": {
                "current_loaded_count": 14054,
                "total_loaded_count": 34153,
                "total_unloaded_count": 20099
            }
        },

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.