Nodes getting removed often from the cluster

hello, I have a couple of clusters running:

  • es 6.8.21
  • es 5.6.16

The machines are Ubuntu VMs and are continuously showing these errors in the logs:
[2022-06-23T09:08:46,082][INFO ][o.e.c.s.ClusterApplierService] [server-001] removed {{server-010}{Jdrbx1-iT8mSDVBv4uXNyQ}{xxcn8tyISM-mG5AZQEkFkQ}{10.210.71.35}{10.210.71.35:9300}{ml.machine_memory=33738076160, ml.max_open_jobs=20, xpack.installed=true, ml.enabled=true},}, reason: apply cluster state (from master [master {server-023}{0746xA4YQhaiHtBP3uZIkA}{e6mgUQ3ATBGJTibtwzrwIg}{10.210.71.41}{10.210.71.41:9300}{ml.machine_memory=33737912320, ml.max_open_jobs=20, xpack.installed=true, ml.enabled=true} committed version [26088904]])

What changed recently is that I moved the vms from ubuntu kvm hosts to rocky kvm hosts, and I am trying to figure out what is the issue.

I already tried to apply this

sysctl -w net.ipv4.tcp_keepalive_time=600
sysctl -w net.ipv4.tcp_keepalive_intvl=60
sysctl -w net.ipv4.tcp_keepalive_probes=20

with no success.

Since the hosts changed, it must be the host, as the same issue is happening on both clusters, which run different versions of ES - I am just wondering if someone has an idea of what to tune in this case.

thanks in advance for your help

Welcome to our community! :smiley:

Both 6.X and 5.X are well past EOL; you should be looking to upgrade as a matter of urgency. There have been numerous improvements to clustering and resilience from those versions to 7.X and 8.X.

Otherwise, we would need to see more logs, and the output from the _cluster/stats?pretty&human API.

hello Mark,

thank you!

{
  "_nodes": {
    "total": 32,
    "successful": 32,
    "failed": 0
  },
  "cluster_name": "tf-prd-leaderboard-elasticsearch",
  "cluster_uuid": "A-MuWgIaRZ62936wzbg7rw",
  "timestamp": 1655971677926,
  "status": "green",
  "indices": {
    "count": 11,
    "shards": {
      "total": 322,
      "primaries": 81,
      "replication": 2.9753086419753085,
      "index": {
        "shards": {
          "min": 2,
          "max": 32,
          "avg": 29.272727272727273
        },
        "primaries": {
          "min": 1,
          "max": 8,
          "avg": 7.363636363636363
        },
        "replication": {
          "min": 1,
          "max": 3,
          "avg": 2.8181818181818183
        }
      }
    },
    "docs": {
      "count": 89802044,
      "deleted": 21730330
    },
    "store": {
      "size": "69.3gb",
      "size_in_bytes": 74441815568
    },
    "fielddata": {
      "memory_size": "0b",
      "memory_size_in_bytes": 0,
      "evictions": 0
    },
    "query_cache": {
      "memory_size": "2.6gb",
      "memory_size_in_bytes": 2795345218,
      "total_count": 3831026221,
      "hit_count": 51215630,
      "miss_count": 3779810591,
      "cache_size": 417040,
      "cache_count": 8652998,
      "evictions": 8235958
    },
    "completion": {
      "size": "0b",
      "size_in_bytes": 0
    },
    "segments": {
      "count": 3769,
      "memory": "238.5mb",
      "memory_in_bytes": 250160962,
      "terms_memory": "193.8mb",
      "terms_memory_in_bytes": 203215359,
      "stored_fields_memory": "29.9mb",
      "stored_fields_memory_in_bytes": 31394400,
      "term_vectors_memory": "0b",
      "term_vectors_memory_in_bytes": 0,
      "norms_memory": "47.4kb",
      "norms_memory_in_bytes": 48576,
      "points_memory": "11.1mb",
      "points_memory_in_bytes": 11644527,
      "doc_values_memory": "3.6mb",
      "doc_values_memory_in_bytes": 3858100,
      "index_writer_memory": "20.9mb",
      "index_writer_memory_in_bytes": 21974873,
      "version_map_memory": "637.5kb",
      "version_map_memory_in_bytes": 652900,
      "fixed_bit_set": "0b",
      "fixed_bit_set_memory_in_bytes": 0,
      "max_unsafe_auto_id_timestamp": -1,
      "file_sizes": {

      }
    }
  },
  "nodes": {
    "count": {
      "total": 32,
      "data": 32,
      "coordinating_only": 0,
      "master": 32,
      "ingest": 32
    },
    "versions": [
      "6.8.21"
    ],
    "os": {
      "available_processors": 128,
      "allocated_processors": 128,
      "names": [
        {
          "name": "Linux",
          "count": 32
        }
      ],
      "pretty_names": [
        {
          "pretty_name": "Ubuntu 16.04.3 LTS",
          "count": 18
        },
        {
          "pretty_name": "Ubuntu 16.04.7 LTS",
          "count": 8
        },
        {
          "pretty_name": "Ubuntu 16.04.6 LTS",
          "count": 6
        }
      ],
      "mem": {
        "total": "1005.4gb",
        "total_in_bytes": 1079615975424,
        "free": "265.3gb",
        "free_in_bytes": 284923678720,
        "used": "740.1gb",
        "used_in_bytes": 794692296704,
        "free_percent": 26,
        "used_percent": 74
      }
    },
    "process": {
      "cpu": {
        "percent": 84
      },
      "open_file_descriptors": {
        "min": 1133,
        "max": 1338,
        "avg": 1276
      }
    },
    "jvm": {
      "max_uptime": "93.7d",
      "max_uptime_in_millis": 8099955836,
      "versions": [
        {
          "version": "1.8.0_292",
          "vm_name": "OpenJDK 64-Bit Server VM",
          "vm_version": "25.292-b10",
          "vm_vendor": "Private Build",
          "count": 32
        }
      ],
      "mem": {
        "heap_used": "192.2gb",
        "heap_used_in_bytes": 206392449792,
        "heap_max": "510.9gb",
        "heap_max_in_bytes": 548640129024
      },
      "threads": 2622
    },
    "fs": {
      "total": "940.8gb",
      "total_in_bytes": 1010189598720,
      "free": "851.8gb",
      "free_in_bytes": 914690457600,
      "available": "851.3gb",
      "available_in_bytes": 914153586688
    },
    "plugins": [

    ],
    "network_types": {
      "transport_types": {
        "security4": 32
      },
      "http_types": {
        "security4": 32
      }
    }
  }
}

Besides those messages there is nothing else in the logs, and ES works fine, but the cluster is continuously going into a warning state because of the nodes being removed.

thanks for looking into the issue

This kind of thing is almost impossible to diagnose in such old versions. If you upgrade to the latest version then it might fix the problem, but if not then you will be able to follow these docs on troubleshooting unstable clusters.