Elasticsearch 6.8.10 memory leak

100+ node ES cluster, hot/warm architecture

The heap on each stale node is 31 GB, and at irregular intervals multiple nodes reach 90% heap usage. The problem reappears one week after a restart.

Segment memory (14 GB), query cache (2.6 GB); a single stale node holds 300+ shards.

This is the MAT (Memory Analyzer Tool) result:

I think the problem is here: the Java local cannot be garbage-collected, yet it occupies 6.2 GB.

When will this class be used?

hello?

Although I cannot answer your questions about the internals, I suspect a bit more detail and context would be useful:

  • Do you have any non-default settings in your Elasticsearch config?
  • Have you verified that your heap size is set so it allows the use of compressed pointers?
  • What is the full output of the cluster stats API?
  1. No special configuration
  2. The heap is configured at 31 GB, and I have confirmed that compressed pointers are enabled
{
  "_nodes": {
    "total": 130,
    "successful": 130,
    "failed": 0
  },
  "cluster_name": "es-104",
  "cluster_uuid": "XeP8Hhn6SpmV96ZDxxRyrA",
  "timestamp": 1604886843395,
  "status": "green",
  "indices": {
    "count": 7164,
    "shards": {
      "total": 28274,
      "primaries": 14180,
      "replication": 0.993935119887165,
      "index": {
        "shards": {
          "min": 1,
          "max": 76,
          "avg": 3.946677833612507
        },
        "primaries": {
          "min": 1,
          "max": 22,
          "avg": 1.9793411501954215
        },
        "replication": {
          "min": 0,
          "max": 3,
          "avg": 0.9913456169737577
        }
      }
    },
    "docs": {
      "count": 1288729582393,
      "deleted": 161973162
    },
    "store": {
      "size": "668.3tb",
      "size_in_bytes": 734839008348310
    },
    "fielddata": {
      "memory_size": "6.4gb",
      "memory_size_in_bytes": 6955353856,
      "evictions": 0
    },
    "query_cache": {
      "memory_size": "34.8gb",
      "memory_size_in_bytes": 37426849826,
      "total_count": 175852223,
      "hit_count": 39074450,
      "miss_count": 136777773,
      "cache_size": 141452,
      "cache_count": 547355,
      "evictions": 405903
    },
    "completion": {
      "size": "0b",
      "size_in_bytes": 0
    },
    "segments": {
      "count": 328792,
      "memory": "444.4gb",
      "memory_in_bytes": 477235581752,
      "terms_memory": "309.9gb",
      "terms_memory_in_bytes": 332837416646,
      "stored_fields_memory": "114gb",
      "stored_fields_memory_in_bytes": 122425428208,
      "term_vectors_memory": "63.9kb",
      "term_vectors_memory_in_bytes": 65488,
      "norms_memory": "24.4mb",
      "norms_memory_in_bytes": 25600832,
      "points_memory": "19.8gb",
      "points_memory_in_bytes": 21345282922,
      "doc_values_memory": "573.9mb",
      "doc_values_memory_in_bytes": 601787656,
      "index_writer_memory": "24gb",
      "index_writer_memory_in_bytes": 25851720323,
      "version_map_memory": "431.5mb",
      "version_map_memory_in_bytes": 452468707,
      "fixed_bit_set": "2.8gb",
      "fixed_bit_set_memory_in_bytes": 3052130688,
      "max_unsafe_auto_id_timestamp": 1604883906295,
      "file_sizes": {

      }
    }
  },
  "nodes": {
    "count": {
      "total": 130,
      "data": 123,
      "coordinating_only": 4,
      "master": 3,
      "ingest": 126
    },
    "versions": [
      "6.8.3",
      "6.8.12"
    ],
    "os": {
      "available_processors": 1616,
      "allocated_processors": 1408,
      "names": [
        {
          "name": "Linux",
          "count": 130
        }
      ],
      "pretty_names": [
        {
          "pretty_name": "CentOS Linux 7 (Core)",
          "count": 130
        }
      ],
      "mem": {
        "total": "6.4tb",
        "total_in_bytes": 7131041304576,
        "free": "884.2gb",
        "free_in_bytes": 949446234112,
        "used": "5.6tb",
        "used_in_bytes": 6181595070464,
        "free_percent": 13,
        "used_percent": 87
      }
    },
    "process": {
      "cpu": {
        "percent": 222
      },
      "open_file_descriptors": {
        "min": 3234,
        "max": 7161,
        "avg": 4158
      }
    },
    "jvm": {
      "max_uptime": "415.7d",
      "max_uptime_in_millis": 35917212594,
      "versions": [
        {
          "version": "14.0.1",
          "vm_name": "OpenJDK 64-Bit Server VM",
          "vm_version": "14.0.1+7",
          "vm_vendor": "AdoptOpenJDK",
          "count": 129
        },
        {
          "version": "12.0.2",
          "vm_name": "OpenJDK 64-Bit Server VM",
          "vm_version": "12.0.2+10",
          "vm_vendor": "Oracle Corporation",
          "count": 1
        }
      ],
      "mem": {
        "heap_used": "1.5tb",
        "heap_used_in_bytes": 1649819597368,
        "heap_max": "2.9tb",
        "heap_max_in_bytes": 3228497412096
      },
      "threads": 16739
    },
    "fs": {
      "total": "958.8tb",
      "total_in_bytes": 1054304221237248,
      "free": "521.9tb",
      "free_in_bytes": 573839523950592,
      "available": "483.2tb",
      "available_in_bytes": 531369791172608
    },
    "plugins": [
      {
        "name": "analysis-ik",
        "version": "6.8.12",
        "elasticsearch_version": "6.8.12",
        "java_version": "1.8",
        "description": "IK Analyzer for Elasticsearch",
        "classname": "org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin",
        "extended_plugins": [

        ],
        "has_native_controller": false
      },
      {
        "name": "analysis-ik",
        "version": "6.8.3",
        "elasticsearch_version": "6.8.3",
        "java_version": "1.8",
        "description": "IK Analyzer for Elasticsearch",
        "classname": "org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin",
        "extended_plugins": [

        ],
        "has_native_controller": false
      }
    ],
    "network_types": {
      "transport_types": {
        "netty4": 130
      },
      "http_types": {
        "netty4": 130
      }
    }
  }
}

You mention 6.8.10, but you are running a mix of two 6.8 versions, neither of which is .10? You should really make sure all nodes are running the same version.

Same for your JVMs, make sure they are all the same.

This shouldn’t be important. The 6.8.3 node is a tribe node. It has existed for a long time and is about to be deprecated.