Elasticsearch 6.8.10 memory leak

Elasticsearch cluster with 100+ nodes, hot/warm architecture.

Each stale node has a 31 GB heap, and several nodes irregularly reach 90% heap usage. The problem reappears about one week after a restart.

Segment memory: 14 GB; query cache: 2.6 GB; a single stale node holds 300+ shards.

This is the MAT (Eclipse Memory Analyzer) result:

I think the problem is here: the Java local cannot be garbage-collected, yet it occupies 6.2 GB.

When will this class be used?

hello?

Although I cannot answer your questions about the internals, I suspect a bit more detail and context would be useful:

  • Do you have any non-default settings in your Elasticsearch config?
  • Have you verified that your heap size is set so it allows the use of compressed pointers?
  • What is the full output of the cluster stats API?
  1. No special configuration
  2. The heap is configured at 31 GB, and I have confirmed that compressed pointers are enabled
{
  "_nodes": {
    "total": 130,
    "successful": 130,
    "failed": 0
  },
  "cluster_name": "es-104",
  "cluster_uuid": "XeP8Hhn6SpmV96ZDxxRyrA",
  "timestamp": 1604886843395,
  "status": "green",
  "indices": {
    "count": 7164,
    "shards": {
      "total": 28274,
      "primaries": 14180,
      "replication": 0.993935119887165,
      "index": {
        "shards": {
          "min": 1,
          "max": 76,
          "avg": 3.946677833612507
        },
        "primaries": {
          "min": 1,
          "max": 22,
          "avg": 1.9793411501954215
        },
        "replication": {
          "min": 0,
          "max": 3,
          "avg": 0.9913456169737577
        }
      }
    },
    "docs": {
      "count": 1288729582393,
      "deleted": 161973162
    },
    "store": {
      "size": "668.3tb",
      "size_in_bytes": 734839008348310
    },
    "fielddata": {
      "memory_size": "6.4gb",
      "memory_size_in_bytes": 6955353856,
      "evictions": 0
    },
    "query_cache": {
      "memory_size": "34.8gb",
      "memory_size_in_bytes": 37426849826,
      "total_count": 175852223,
      "hit_count": 39074450,
      "miss_count": 136777773,
      "cache_size": 141452,
      "cache_count": 547355,
      "evictions": 405903
    },
    "completion": {
      "size": "0b",
      "size_in_bytes": 0
    },
    "segments": {
      "count": 328792,
      "memory": "444.4gb",
      "memory_in_bytes": 477235581752,
      "terms_memory": "309.9gb",
      "terms_memory_in_bytes": 332837416646,
      "stored_fields_memory": "114gb",
      "stored_fields_memory_in_bytes": 122425428208,
      "term_vectors_memory": "63.9kb",
      "term_vectors_memory_in_bytes": 65488,
      "norms_memory": "24.4mb",
      "norms_memory_in_bytes": 25600832,
      "points_memory": "19.8gb",
      "points_memory_in_bytes": 21345282922,
      "doc_values_memory": "573.9mb",
      "doc_values_memory_in_bytes": 601787656,
      "index_writer_memory": "24gb",
      "index_writer_memory_in_bytes": 25851720323,
      "version_map_memory": "431.5mb",
      "version_map_memory_in_bytes": 452468707,
      "fixed_bit_set": "2.8gb",
      "fixed_bit_set_memory_in_bytes": 3052130688,
      "max_unsafe_auto_id_timestamp": 1604883906295,
      "file_sizes": {

      }
    }
  },
  "nodes": {
    "count": {
      "total": 130,
      "data": 123,
      "coordinating_only": 4,
      "master": 3,
      "ingest": 126
    },
    "versions": [
      "6.8.3",
      "6.8.12"
    ],
    "os": {
      "available_processors": 1616,
      "allocated_processors": 1408,
      "names": [
        {
          "name": "Linux",
          "count": 130
        }
      ],
      "pretty_names": [
        {
          "pretty_name": "CentOS Linux 7 (Core)",
          "count": 130
        }
      ],
      "mem": {
        "total": "6.4tb",
        "total_in_bytes": 7131041304576,
        "free": "884.2gb",
        "free_in_bytes": 949446234112,
        "used": "5.6tb",
        "used_in_bytes": 6181595070464,
        "free_percent": 13,
        "used_percent": 87
      }
    },
    "process": {
      "cpu": {
        "percent": 222
      },
      "open_file_descriptors": {
        "min": 3234,
        "max": 7161,
        "avg": 4158
      }
    },
    "jvm": {
      "max_uptime": "415.7d",
      "max_uptime_in_millis": 35917212594,
      "versions": [
        {
          "version": "14.0.1",
          "vm_name": "OpenJDK 64-Bit Server VM",
          "vm_version": "14.0.1+7",
          "vm_vendor": "AdoptOpenJDK",
          "count": 129
        },
        {
          "version": "12.0.2",
          "vm_name": "OpenJDK 64-Bit Server VM",
          "vm_version": "12.0.2+10",
          "vm_vendor": "Oracle Corporation",
          "count": 1
        }
      ],
      "mem": {
        "heap_used": "1.5tb",
        "heap_used_in_bytes": 1649819597368,
        "heap_max": "2.9tb",
        "heap_max_in_bytes": 3228497412096
      },
      "threads": 16739
    },
    "fs": {
      "total": "958.8tb",
      "total_in_bytes": 1054304221237248,
      "free": "521.9tb",
      "free_in_bytes": 573839523950592,
      "available": "483.2tb",
      "available_in_bytes": 531369791172608
    },
    "plugins": [
      {
        "name": "analysis-ik",
        "version": "6.8.12",
        "elasticsearch_version": "6.8.12",
        "java_version": "1.8",
        "description": "IK Analyzer for Elasticsearch",
        "classname": "org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin",
        "extended_plugins": [

        ],
        "has_native_controller": false
      },
      {
        "name": "analysis-ik",
        "version": "6.8.3",
        "elasticsearch_version": "6.8.3",
        "java_version": "1.8",
        "description": "IK Analyzer for Elasticsearch",
        "classname": "org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin",
        "extended_plugins": [

        ],
        "has_native_controller": false
      }
    ],
    "network_types": {
      "transport_types": {
        "netty4": 130
      },
      "http_types": {
        "netty4": 130
      }
    }
  }
}

You mention 6.8.10, but you are running a mix of two 6.8 versions, neither .10? You should really make sure all nodes are running the same version.

Same for your JVMs, make sure they are all the same.

1 Like

This shouldn’t be important. The 6.8.3 node is a tribe node. It has existed for a long time and is about to be deprecated.

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.