Elastic Data Node leaves the cluster after force merge

Elasticsearch version 7.3.2, JVM 12.0.2; Heap 15 GB.
The machine remains blocked and it's not possible to access it via ssh.
These are the node statistics just before the issue.


And these are the system metrics:

Has anybody experienced similar problems?

In the same environment I had the same problem while running a massive reindex. I was able to solve it by increasing index.translog.sync_interval
(for target nodes) and reducing the size of the search (for source nodes).

Welcome to our community! :smiley:

Please upgrade; 7.3 is now EOL and there have been numerous improvements between that and the latest release, 7.13.

What is the output from the _cluster/stats?pretty&human API?

Thank you Mark.

Unfortunately I couldn't catch _cluster/stats?pretty&human at that time.
This is the result now.

{
  "_nodes" : {
    "total" : 10,
    "successful" : 10,
    "failed" : 0
  },
  "cluster_name" : "cluster",
  "cluster_uuid" : "fatpm6PdQPG-5mBLykcASw",
  "timestamp" : 1624530375346,
  "status" : "green",
  "indices" : {
    "count" : 41,
    "shards" : {
      "total" : 95,
      "primaries" : 47,
      "replication" : 1.0212765957446808,
      "index" : {
        "shards" : {
          "min" : 1,
          "max" : 6,
          "avg" : 2.317073170731707
        },
        "primaries" : {
          "min" : 1,
          "max" : 3,
          "avg" : 1.146341463414634
        },
        "replication" : {
          "min" : 0.0,
          "max" : 4.0,
          "avg" : 1.024390243902439
        }
      }
    },
    "docs" : {
      "count" : 354812060,
      "deleted" : 22793
    },
    "store" : {
      "size" : "807.3gb",
      "size_in_bytes" : 866848224226
    },
    "fielddata" : {
      "memory_size" : "140.1kb",
      "memory_size_in_bytes" : 143464,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size" : "211mb",
      "memory_size_in_bytes" : 221264112,
      "total_count" : 20095277,
      "hit_count" : 161278,
      "miss_count" : 19933999,
      "cache_size" : 13126,
      "cache_count" : 14610,
      "evictions" : 1484
    },
    "completion" : {
      "size" : "0b",
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 1340,
      "memory" : "521.5mb",
      "memory_in_bytes" : 546893786,
      "terms_memory" : "200.8mb",
      "terms_memory_in_bytes" : 210615342,
      "stored_fields_memory" : "308.3mb",
      "stored_fields_memory_in_bytes" : 323347440,
      "term_vectors_memory" : "0b",
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory" : "436.8kb",
      "norms_memory_in_bytes" : 447360,
      "points_memory" : "11.2mb",
      "points_memory_in_bytes" : 11789884,
      "doc_values_memory" : "677.5kb",
      "doc_values_memory_in_bytes" : 693760,
      "index_writer_memory" : "277.9mb",
      "index_writer_memory_in_bytes" : 291497620,
      "version_map_memory" : "283.6kb",
      "version_map_memory_in_bytes" : 290472,
      "fixed_bit_set" : "2kb",
      "fixed_bit_set_memory_in_bytes" : 2128,
      "max_unsafe_auto_id_timestamp" : 1624349184155,
      "file_sizes" : { }
    }
  },
  "nodes" : {
    "count" : {
      "total" : 10,
      "coordinating_only" : 2,
      "data" : 5,
      "ingest" : 0,
      "master" : 3
    },
    "versions" : [
      "7.3.2"
    ],
    "os" : {
      "available_processors" : 40,
      "allocated_processors" : 40,
      "names" : [
        {
          "name" : "Linux",
          "count" : 10
        }
      ],
      "pretty_names" : [
        {
          "pretty_name" : "SUSE Linux Enterprise Server 12 SP4",
          "count" : 7
        },
        {
          "pretty_name" : "SUSE Linux Enterprise Server 15 SP2",
          "count" : 3
        }
      ],
      "mem" : {
        "total" : "299.1gb",
        "total_in_bytes" : 321168334848,
        "free" : "57.4gb",
        "free_in_bytes" : 61699235840,
        "used" : "241.6gb",
        "used_in_bytes" : 259469099008,
        "free_percent" : 19,
        "used_percent" : 81
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 5
      },
      "open_file_descriptors" : {
        "min" : 495,
        "max" : 946,
        "avg" : 682
      }
    },
    "jvm" : {
      "max_uptime" : "22.9d",
      "max_uptime_in_millis" : 1986666245,
      "versions" : [
        {
          "version" : "12.0.2",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "12.0.2+10",
          "vm_vendor" : "Oracle Corporation",
          "bundled_jdk" : true,
          "using_bundled_jdk" : true,
          "count" : 10
        }
      ],
      "mem" : {
        "heap_used" : "34.6gb",
        "heap_used_in_bytes" : 37246643632,
        "heap_max" : "136.6gb",
        "heap_max_in_bytes" : 146753978368
      },
      "threads" : 689
    },
    "fs" : {
      "total" : "7.5tb",
      "total_in_bytes" : 8254108151808,
      "free" : "6.7tb",
      "free_in_bytes" : 7379488034816,
      "available" : "6.3tb",
      "available_in_bytes" : 6959488700416
    },
    "discovery_types" : {
      "zen" : 10
    }
  }
}

On the other hand I have elasticsearch metrics obtained by metricbeat.
Two nodes down, just the ones that stored the index which was "force merged":


The involved index consists of one primary shard and one replica shard.
These were the index stats during the force merge (1 segment), just before the failure:


The following is the current situation (still not merged)

Can you upgrade? 7.3 is EOL.

What is the specification of the hardware used in the cluster? What type of storage are you using?

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.