Cluster returns 200 to queries even if entire shards are unavailable

I'm working with about 80 instances and an index of 2TB with 50 shards and 1 replica.

However, for some reason, the cluster collapses from time to time (I'm still root-causing this).

As you can see from the image, instances start going OOM and die.

The major problem right now is avoiding inconsistent results when this happens: we cannot return 200 with only partial results if some instances didn't process the query.

Only sometimes I see the exception "node disconnected", but other times this is totally ignored.

Do you have any advice? How can I make sure that I'll get 503/429, or even 500?

Welcome to our community! :smiley:

Can you share the full request and response?
What is the output from the _cluster/stats?pretty&human API?

Thank you @warkolm!

GET _cluster/stats?pretty&human


{
  "_nodes" : {
    "total" : 77,
    "successful" : 77,
    "failed" : 0
  },
  "cluster_name" : "es-prod",
  "cluster_uuid" : "0Zn69OBSQKjr",
  "timestamp" : 1634047453765,
  "status" : "green",
  "indices" : {
    "count" : 59,
    "shards" : {
      "total" : 322,
      "primaries" : 161,
      "replication" : 1.0,
      "index" : {
        "shards" : {
          "min" : 2,
          "max" : 100,
          "avg" : 5.4576271186440675
        },
        "primaries" : {
          "min" : 1,
          "max" : 50,
          "avg" : 2.7288135593220337
        },
        "replication" : {
          "min" : 1.0,
          "max" : 1.0,
          "avg" : 1.0
        }
      }
    },
    "docs" : {
      "count" : 2965994148,
      "deleted" : 659304589
    },
    "store" : {
      "size" : "3.5tb",
      "size_in_bytes" : 3877196873274,
      "reserved" : "0b",
      "reserved_in_bytes" : 0
    },
    "fielddata" : {
      "memory_size" : "114.3kb",
      "memory_size_in_bytes" : 117120,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size" : "11.1gb",
      "memory_size_in_bytes" : 11979351248,
      "total_count" : 31996260002,
      "hit_count" : 155247503,
      "miss_count" : 31841012499,
      "cache_size" : 1399019,
      "cache_count" : 46518049,
      "evictions" : 45119030
    },
    "completion" : {
      "size" : "0b",
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 5719,
      "memory" : "15.5gb",
      "memory_in_bytes" : 16724133032,
      "terms_memory" : "428.7mb",
      "terms_memory_in_bytes" : 449616424,
      "stored_fields_memory" : "3.6mb",
      "stored_fields_memory_in_bytes" : 3779640,
      "term_vectors_memory" : "0b",
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory" : "61.1mb",
      "norms_memory_in_bytes" : 64118528,
      "points_memory" : "0b",
      "points_memory_in_bytes" : 0,
      "doc_values_memory" : "15gb",
      "doc_values_memory_in_bytes" : 16206618440,
      "index_writer_memory" : "38.5gb",
      "index_writer_memory_in_bytes" : 41430714718,
      "version_map_memory" : "146.9kb",
      "version_map_memory_in_bytes" : 150461,
      "fixed_bit_set" : "859.7mb",
      "fixed_bit_set_memory_in_bytes" : 901507864,
      "max_unsafe_auto_id_timestamp" : 1634020463375,
      "file_sizes" : { }
    },
    "mappings" : {
      "field_types" : [
        {
          "name" : "binary",
          "count" : 14,
          "index_count" : 3
        },
        {
          "name" : "boolean",
          "count" : 266,
          "index_count" : 31
        },
        {
          "name" : "date",
          "count" : 23414,
          "index_count" : 53
        },
        {
          "name" : "flattened",
          "count" : 9,
          "index_count" : 1
        },
        {
          "name" : "float",
          "count" : 74876,
          "index_count" : 25
        },
        {
          "name" : "integer",
          "count" : 29,
          "index_count" : 3
        },
        {
          "name" : "keyword",
          "count" : 2627,
          "index_count" : 53
        },
        {
          "name" : "long",
          "count" : 2172,
          "index_count" : 48
        },
        {
          "name" : "nested",
          "count" : 106,
          "index_count" : 16
        },
        {
          "name" : "object",
          "count" : 12631,
          "index_count" : 56
        },
        {
          "name" : "text",
          "count" : 2384,
          "index_count" : 51
        }
      ]
    },
    "analysis" : {
      "char_filter_types" : [ ],
      "tokenizer_types" : [ ],
      "filter_types" : [
        {
          "name" : "pattern_capture",
          "count" : 1,
          "index_count" : 1
        }
      ],
      "analyzer_types" : [
        {
          "name" : "custom",
          "count" : 1,
          "index_count" : 1
        }
      ],
      "built_in_char_filters" : [ ],
      "built_in_tokenizers" : [
        {
          "name" : "uax_url_email",
          "count" : 1,
          "index_count" : 1
        }
      ],
      "built_in_filters" : [
        {
          "name" : "lowercase",
          "count" : 1,
          "index_count" : 1
        },
        {
          "name" : "unique",
          "count" : 1,
          "index_count" : 1
        }
      ],
      "built_in_analyzers" : [ ]
    }
  },
  "nodes" : {
    "count" : {
      "total" : 77,
      "coordinating_only" : 0,
      "data" : 72,
      "data_cold" : 72,
      "data_content" : 72,
      "data_hot" : 72,
      "data_warm" : 72,
      "ingest" : 72,
      "master" : 5,
      "ml" : 0,
      "remote_cluster_client" : 77,
      "transform" : 72,
      "voting_only" : 0
    },
    "versions" : [
      "7.10.2"
    ],
    "os" : {
      "available_processors" : 4272,
      "allocated_processors" : 4272,
      "names" : [
        {
          "name" : "Linux",
          "count" : 77
        }
      ],
      "pretty_names" : [
        {
          "pretty_name" : "CentOS Linux 7 (Core)",
          "count" : 77
        }
      ],
      "mem" : {
        "total" : "35.6tb",
        "total_in_bytes" : 39224658612224,
        "free" : "417.7gb",
        "free_in_bytes" : 448510455808,
        "used" : "35.2tb",
        "used_in_bytes" : 38776148156416,
        "free_percent" : 1,
        "used_percent" : 99
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 86
      },
      "open_file_descriptors" : {
        "min" : 2306,
        "max" : 2461,
        "avg" : 2354
      }
    },
    "jvm" : {
      "max_uptime" : "16.5d",
      "max_uptime_in_millis" : 1426505182,
      "versions" : [
        {
          "version" : "15.0.1",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "15.0.1+9",
          "vm_vendor" : "AdoptOpenJDK",
          "bundled_jdk" : true,
          "using_bundled_jdk" : true,
          "count" : 77
        }
      ],
      "mem" : {
        "heap_used" : "903.7gb",
        "heap_used_in_bytes" : 970439758120,
        "heap_max" : "2.2tb",
        "heap_max_in_bytes" : 2480343613440
      },
      "threads" : 19356
    },
    "fs" : {
      "total" : "92.5tb",
      "total_in_bytes" : 101717399175168,
      "free" : "75.9tb",
      "free_in_bytes" : 83467440238592,
      "available" : "75.9tb",
      "available_in_bytes" : 83467440238592
    },
    "plugins" : [
      {
        "name" : "opendistro-sql",
        "version" : "1.13.2.0",
        "elasticsearch_version" : "7.10.2",
        "java_version" : "1.8",
        "description" : "Open Distro for Elasticsearch SQL",
        "classname" : "com.amazon.opendistroforelasticsearch.sql.plugin.SQLPlugin",
        "extended_plugins" : [ ],
        "has_native_controller" : false
      }
    ],
    "network_types" : {
      "transport_types" : {
        "security4" : 77
      },
      "http_types" : {
        "security4" : 77
      }
    },
    "discovery_types" : {
      "zen" : 77
    },
    "packaging_types" : [
      {
        "flavor" : "default",
        "type" : "tar",
        "count" : 77
      }
    ],
    "ingest" : {
      "number_of_pipelines" : 2,
      "processor_stats" : {
        "gsub" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "script" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        }
      }
    }
  }
}

Part 2/3 (Max post size is 13k chars only)

I'm also adding:

GET _cat/nodes?v=true&h=name,node*,heap*

name                        id   node.role heap.current heap.percent heap.max
elasticsearch_XXXX  tTIe cdhirstw        10.8gb           36     30gb
elasticsearch_XXXX       skhT cdhirstw        15.3gb           51     30gb
elasticsearch_XXXX ArUA mr                 4gb           13     30gb
elasticsearch_XXXX       oAJ6 cdhirstw        12.7gb           42     30gb
elasticsearch_XXXX     IBMv cdhirstw        11.6gb           38     30gb
elasticsearch_XXXX     yxXk cdhirstw        16.4gb           54     30gb
elasticsearch_XXXX       gZgV cdhirstw        14.9gb           49     30gb
elasticsearch_XXXX  jADI cdhirstw           4gb           13     30gb
elasticsearch_XXXX     GZ8P cdhirstw        10.2gb           34     30gb
elasticsearch_XXXX     kRJd cdhirstw        13.4gb           44     30gb
elasticsearch_XXXX    UI_g cdhirstw          14gb           46     30gb
elasticsearch_XXXX TcAv mr               2.9gb            9     30gb
elasticsearch_XXXX    eC_F cdhirstw        10.2gb           34     30gb
elasticsearch_XXXX       rMcu cdhirstw        11.8gb           39     30gb
elasticsearch_XXXX     tmPh cdhirstw        11.9gb           39     30gb
elasticsearch_XXXX       v2Qu cdhirstw        11.7gb           39     30gb
elasticsearch_XXXX     2yT0 cdhirstw         8.9gb           29     30gb
elasticsearch_XXXX coGm mr               3.1gb           10     30gb
elasticsearch_XXXX     VA9w cdhirstw        10.7gb           35     30gb
elasticsearch_XXXX     BHS4 cdhirstw        16.6gb           55     30gb
elasticsearch_XXXX  3YW1 cdhirstw        13.9gb           46     30gb
elasticsearch_XXXX  jOgE cdhirstw        19.8gb           66     30gb
elasticsearch_XXXX     zjFZ cdhirstw        15.4gb           51     30gb
elasticsearch_XXXX       rCWO cdhirstw        10.9gb           36     30gb
elasticsearch_XXXX    9_wo cdhirstw        17.1gb           57     30gb
elasticsearch_XXXX     1PZL cdhirstw        16.4gb           54     30gb
elasticsearch_XXXX       -jCV cdhirstw         4.9gb           16     30gb
elasticsearch_XXXX     tJCn cdhirstw        18.8gb           62     30gb
elasticsearch_XXXX  KPeL cdhirstw        16.5gb           55     30gb
elasticsearch_XXXX       a3Hx cdhirstw         9.8gb           32     30gb
elasticsearch_XXXX       T_rq cdhirstw           7gb           23     30gb
elasticsearch_XXXX  ckxB cdhirstw        16.5gb           55     30gb
elasticsearch_XXXX     M8Zm cdhirstw        18.8gb           62     30gb
elasticsearch_XXXX    SHSE cdhirstw        19.2gb           64     30gb
elasticsearch_XXXX  T5L- cdhirstw         6.8gb           22     30gb
elasticsearch_XXXX T3wt mr               2.7gb            9     30gb
elasticsearch_XXXX     qE9d cdhirstw        16.8gb           56     30gb
elasticsearch_XXXX     PIez cdhirstw        14.6gb           48     30gb
elasticsearch_XXXX w_8T mr               2.7gb            9     30gb
elasticsearch_XXXX       N-bQ cdhirstw        10.8gb           36     30gb
elasticsearch_XXXX     MQVB cdhirstw        12.4gb           41     30gb
elasticsearch_XXXX       HG_d cdhirstw          15gb           50     30gb
elasticsearch_XXXX       rdFl cdhirstw        16.5gb           55     30gb
elasticsearch_XXXX       a1yp cdhirstw        14.1gb           47     30gb
elasticsearch_XXXX    EKQS cdhirstw        14.2gb           47     30gb
elasticsearch_XXXX  Fnit cdhirstw        12.1gb           40     30gb
elasticsearch_XXXX  Nf4H cdhirstw        12.9gb           43     30gb
elasticsearch_XXXX    KADB cdhirstw        14.4gb           48     30gb
elasticsearch_XXXX  nzP0 cdhirstw        15.8gb           52     30gb
elasticsearch_XXXX     zgOu cdhirstw        17.5gb           58     30gb
elasticsearch_XXXX  RSmv cdhirstw        14.6gb           48     30gb
elasticsearch_XXXX       XKxk cdhirstw        15.9gb           53     30gb
elasticsearch_XXXX     mlO4 cdhirstw        10.9gb           36     30gb
elasticsearch_XXXX       FIDw cdhirstw        16.8gb           56     30gb
elasticsearch_XXXX     9uLK cdhirstw         7.7gb           25     30gb
elasticsearch_XXXX       P8Fn cdhirstw        17.4gb           58     30gb
elasticsearch_XXXX       1DFF cdhirstw        12.5gb           41     30gb
elasticsearch_XXXX     Qj21 cdhirstw          10gb           33     30gb
elasticsearch_XXXX       R9Rj cdhirstw        13.9gb           46     30gb
elasticsearch_XXXX  aBpG cdhirstw        14.9gb           49     30gb
elasticsearch_XXXX       AGva cdhirstw         3.3gb           11     30gb
elasticsearch_XXXX       Jl0s cdhirstw        15.6gb           52     30gb
elasticsearch_XXXX     Ug5Z cdhirstw        16.1gb           53     30gb
elasticsearch_XXXX     EfTu cdhirstw        11.5gb           38     30gb
elasticsearch_XXXX     JN3X cdhirstw        14.6gb           48     30gb
elasticsearch_XXXX       n5cf cdhirstw        11.2gb           37     30gb
elasticsearch_XXXX  8QII cdhirstw         8.4gb           28     30gb
elasticsearch_XXXX  h9jp cdhirstw        11.5gb           38     30gb
elasticsearch_XXXX  szbT cdhirstw        14.2gb           47     30gb
elasticsearch_XXXX       A0Rc cdhirstw        16.8gb           56     30gb
elasticsearch_XXXX    T_ID cdhirstw        15.4gb           51     30gb
elasticsearch_XXXX  -_Nz cdhirstw        14.2gb           47     30gb
elasticsearch_XXXX       NrYk cdhirstw        11.7gb           39     30gb
elasticsearch_XXXX     UKZk cdhirstw         7.8gb           26     30gb
elasticsearch_XXXX     4xam cdhirstw          17gb           56     30gb
elasticsearch_XXXX    v3FD cdhirstw        19.3gb           64     30gb
elasticsearch_XXXX       tNW1 cdhirstw         7.8gb           26     30gb

Part 3/3 (Max post size is 13k chars only)

GET _nodes/stats/breaker

{
  "_nodes" : {
    "total" : 77,
    "successful" : 77,
    "failed" : 0
  },
  "cluster_name" : "es-prod",
  "nodes" : {
    "UKZkFgefQzSB_RRo3Kk-VQ" : {
      "timestamp" : 1634047653752,
      "name" : "XXXXXXXX",
      "transport_address" : "XXXXXXXX",
      "host" : "XXXXXXXX",
      "ip" : "XXXXXXXX",
      "roles" : [
        "data",
        "data_cold",
        "data_content",
        "data_hot",
        "data_warm",
        "ingest",
        "remote_cluster_client",
        "transform"
      ],
      "attributes" : {
        "rack_id" : "rack_XXXXXXXXXX",
        "xpack.installed" : "true",
        "transform.node" : "true"
      },
      "breakers" : {
        "request" : {
          "limit_size_in_bytes" : 19327352832,
          "limit_size" : "18gb",
          "estimated_size_in_bytes" : 0,
          "estimated_size" : "0b",
          "overhead" : 1.0,
          "tripped" : 0
        },
        "fielddata" : {
          "limit_size_in_bytes" : 12884901888,
          "limit_size" : "12gb",
          "estimated_size_in_bytes" : 0,
          "estimated_size" : "0b",
          "overhead" : 1.03,
          "tripped" : 0
        },
        "in_flight_requests" : {
          "limit_size_in_bytes" : 32212254720,
          "limit_size" : "30gb",
          "estimated_size_in_bytes" : 22138,
          "estimated_size" : "21.6kb",
          "overhead" : 2.0,
          "tripped" : 0
        },
        "model_inference" : {
          "limit_size_in_bytes" : 16106127360,
          "limit_size" : "15gb",
          "estimated_size_in_bytes" : 0,
          "estimated_size" : "0b",
          "overhead" : 1.0,
          "tripped" : 0
        },
        "accounting" : {
          "limit_size_in_bytes" : 32212254720,
          "limit_size" : "30gb",
          "estimated_size_in_bytes" : 241758824,
          "estimated_size" : "230.5mb",
          "overhead" : 1.0,
          "tripped" : 0
        },
        "parent" : {
          "limit_size_in_bytes" : 30601641984,
          "limit_size" : "28.5gb",
          "estimated_size_in_bytes" : 19733097832,
          "estimated_size" : "18.3gb",
          "overhead" : 1.0,
          "tripped" : 3644
        }
      }
    },
    "coGmt5nWS0GTdWRsQPuqJQ" : {
      "timestamp" : 1634047653766,
      "name" : "XXXXXXXX",
      "transport_address" : "XXXXXXXX",
      "host" : "XXXXXXXX",
      "ip" : "XXXXXXXX",
      "roles" : [
        "master",
        "remote_cluster_client"
      ],
      "attributes" : {
        "rack_id" : "rack_XXXXXXXXXX",
        "xpack.installed" : "true",
        "transform.node" : "false"
      },
      "breakers" : {
        "request" : {
          "limit_size_in_bytes" : 19327352832,
          "limit_size" : "18gb",
          "estimated_size_in_bytes" : 0,
          "estimated_size" : "0b",
          "overhead" : 1.0,
          "tripped" : 0
        },
        "fielddata" : {
          "limit_size_in_bytes" : 12884901888,
          "limit_size" : "12gb",
          "estimated_size_in_bytes" : 0,
          "estimated_size" : "0b",
          "overhead" : 1.03,
          "tripped" : 0
        },
        "in_flight_requests" : {
          "limit_size_in_bytes" : 32212254720,
          "limit_size" : "30gb",
          "estimated_size_in_bytes" : 22138,
          "estimated_size" : "21.6kb",
          "overhead" : 2.0,
          "tripped" : 0
        },
        "model_inference" : {
          "limit_size_in_bytes" : 16106127360,
          "limit_size" : "15gb",
          "estimated_size_in_bytes" : 0,
          "estimated_size" : "0b",
          "overhead" : 1.0,
          "tripped" : 0
        },
        "accounting" : {
          "limit_size_in_bytes" : 32212254720,
          "limit_size" : "30gb",
          "estimated_size_in_bytes" : 0,
          "estimated_size" : "0b",
          "overhead" : 1.0,
          "tripped" : 0
        },
        "parent" : {
          "limit_size_in_bytes" : 30601641984,
          "limit_size" : "28.5gb",
          "estimated_size_in_bytes" : 4809535664,
          "estimated_size" : "4.4gb",
          "overhead" : 1.0,
          "tripped" : 0
        }
      }
    },
    "PIezJK7FTiSFAYFY__SwIw" : {
      "timestamp" : 1634047653768,
      "name" : "XXXXXXXX",
      "transport_address" : "XXXXXXXX",
      "host" : "XXXXXXXX",
      "ip" : "XXXXXXXX",
      "roles" : [
        "data",
        "data_cold",
        "data_content",
        "data_hot",
        "data_warm",
        "ingest",
        "remote_cluster_client",
        "transform"
      ],
      "attributes" : {
        "rack_id" : "rack_XXXXXXXXXX",
        "xpack.installed" : "true",
        "transform.node" : "true"
      },
      "breakers" : {
        "request" : {
          "limit_size_in_bytes" : 19327352832,
          "limit_size" : "18gb",
          "estimated_size_in_bytes" : 0,
          "estimated_size" : "0b",
          "overhead" : 1.0,
          "tripped" : 0
        },
        "fielddata" : {
          "limit_size_in_bytes" : 12884901888,
          "limit_size" : "12gb",
          "estimated_size_in_bytes" : 0,
          "estimated_size" : "0b",
          "overhead" : 1.03,
          "tripped" : 0
        },
        "in_flight_requests" : {
          "limit_size_in_bytes" : 32212254720,
          "limit_size" : "30gb",
          "estimated_size_in_bytes" : 22138,
          "estimated_size" : "21.6kb",
          "overhead" : 2.0,
          "tripped" : 0
        },
        "model_inference" : {
          "limit_size_in_bytes" : 16106127360,
          "limit_size" : "15gb",
          "estimated_size_in_bytes" : 0,
          "estimated_size" : "0b",
          "overhead" : 1.0,
          "tripped" : 0
        },
        "accounting" : {
          "limit_size_in_bytes" : 32212254720,
          "limit_size" : "30gb",
          "estimated_size_in_bytes" : 346595736,
          "estimated_size" : "330.5mb",
          "overhead" : 1.0,
          "tripped" : 0
        },
        "parent" : {
          "limit_size_in_bytes" : 30601641984,
          "limit_size" : "28.5gb",
          "estimated_size_in_bytes" : 14547092944,
          "estimated_size" : "13.5gb",
          "overhead" : 1.0,
          "tripped" : 767
        }
      }
    },
    "tNW1vfBKQAmSe5LMfBpqgw" : {
      "timestamp" : 1634047653768,
      "name" : "XXXXXXXX",
      "transport_address" : "XXXXXXXX",
      "host" : "XXXXXXXX",
      "ip" : "XXXXXXXX",
      "roles" : [
        "data",
        "data_cold",
        "data_content",
        "data_hot",
        "data_warm",
        "ingest",
        "remote_cluster_client",
        "transform"
      ],
      "attributes" : {
        "rack_id" : "rack_XXXXXXXXXX",
        "xpack.installed" : "true",
        "transform.node" : "true"
      },
      "breakers" : {
        "request" : {
          "limit_size_in_bytes" : 19327352832,
          "limit_size" : "18gb",
          "estimated_size_in_bytes" : 0,
          "estimated_size" : "0b",
          "overhead" : 1.0,
          "tripped" : 0
        },
        "fielddata" : {
          "limit_size_in_bytes" : 12884901888,
          "limit_size" : "12gb",
          "estimated_size_in_bytes" : 0,
          "estimated_size" : "0b",
          "overhead" : 1.03,
          "tripped" : 0
        },
        "in_flight_requests" : {
          "limit_size_in_bytes" : 32212254720,
          "limit_size" : "30gb",
          "estimated_size_in_bytes" : 22138,
          "estimated_size" : "21.6kb",
          "overhead" : 2.0,
          "tripped" : 0
        },
        "model_inference" : {
          "limit_size_in_bytes" : 16106127360,
          "limit_size" : "15gb",
          "estimated_size_in_bytes" : 0,
          "estimated_size" : "0b",
          "overhead" : 1.0,
          "tripped" : 0
        },
        "accounting" : {
          "limit_size_in_bytes" : 32212254720,
          "limit_size" : "30gb",
          "estimated_size_in_bytes" : 198951474,
          "estimated_size" : "189.7mb",
          "overhead" : 1.0,
          "tripped" : 0
        },
        "parent" : {
          "limit_size_in_bytes" : 30601641984,
          "limit_size" : "28.5gb",
          "estimated_size_in_bytes" : 10289990664,
          "estimated_size" : "9.5gb",
          "overhead" : 1.0,
          "tripped" : 1602
        }
      }
    },
    "GZ8PqaS7SEqfBhQUiTXiwQ" : {
      "timestamp" : 1634047653768,
      "name" : "XXXXXXXX",
      "transport_address" : "XXXXXXXX",
      "host" : "XXXXXXXX",
      "ip" : "XXXXXXXX",
      "roles" : [
        "data",
        "data_cold",
        "data_content",
        "data_hot",
        "data_warm",
        "ingest",
        "remote_cluster_client",
        "transform"
      ],
      "attributes" : {
        "rack_id" : "rack_XXXXXXXXXX",
        "xpack.installed" : "true",
        "transform.node" : "true"
      },
      "breakers" : {
        "request" : {
          "limit_size_in_bytes" : 19327352832,
          "limit_size" : "18gb",
          "estimated_size_in_bytes" : 0,
          "estimated_size" : "0b",
          "overhead" : 1.0,
          "tripped" : 0
        },
        "fielddata" : {
          "limit_size_in_bytes" : 12884901888,
          "limit_size" : "12gb",
          "estimated_size_in_bytes" : 0,
          "estimated_size" : "0b",
          "overhead" : 1.03,
          "tripped" : 0
        },
        "in_flight_requests" : {
          "limit_size_in_bytes" : 32212254720,
          "limit_size" : "30gb",
          "estimated_size_in_bytes" : 22138,
          "estimated_size" : "21.6kb",
          "overhead" : 2.0,
          "tripped" : 0
        },
        "model_inference" : {
          "limit_size_in_bytes" : 16106127360,
          "limit_size" : "15gb",
          "estimated_size_in_bytes" : 0,
          "estimated_size" : "0b",
          "overhead" : 1.0,
          "tripped" : 0
        },
        "accounting" : {
          "limit_size_in_bytes" : 32212254720,
          "limit_size" : "30gb",
          "estimated_size_in_bytes" : 236611292,
          "estimated_size" : "225.6mb",
          "overhead" : 1.0,
          "tripped" : 0
        },
        "parent" : {
          "limit_size_in_bytes" : 30601641984,
          "limit_size" : "28.5gb",
          "estimated_size_in_bytes" : 9185876608,
          "estimated_size" : "8.5gb",
          "overhead" : 1.0,
          "tripped" : 1946
        }
      }
    },
    "8QIISxNlQ56dVfByGnJ22Q" : {
      "timestamp" : 1634047653768,
      "name" : "XXXXXXXX",
      "transport_address" : "XXXXXXXX",
      "host" : "XXXXXXXX",
      "ip" : "XXXXXXXX",
      "roles" : [
        "data",
        "data_cold",
        "data_content",
        "data_hot",
        "data_warm",
        "ingest",
        "remote_cluster_client",
        "transform"
      ],
      "attributes" : {
        "rack_id" : "rack_XXXXXXXXXX",
        "xpack.installed" : "true",
        "transform.node" : "true"
      },
      "breakers" : {
        "request" : {
          "limit_size_in_bytes" : 19327352832,
          "limit_size" : "18gb",
          "estimated_size_in_bytes" : 0,
          "estimated_size" : "0b",
          "overhead" : 1.0,
          "tripped" : 0
        },
        "fielddata" : {
          "limit_size_in_bytes" : 12884901888,
          "limit_size" : "12gb",
          "estimated_size_in_bytes" : 0,
          "estimated_size" : "0b",
          "overhead" : 1.03,
          "tripped" : 0
        },
        "in_flight_requests" : {
          "limit_size_in_bytes" : 32212254720,
          "limit_size" : "30gb",
          "estimated_size_in_bytes" : 22138,
          "estimated_size" : "21.6kb",
          "overhead" : 2.0,
          "tripped" : 0
        },
        "model_inference" : {
          "limit_size_in_bytes" : 16106127360,
          "limit_size" : "15gb",
          "estimated_size_in_bytes" : 0,
          "estimated_size" : "0b",
          "overhead" : 1.0,
          "tripped" : 0
        },
        "accounting" : {
          "limit_size_in_bytes" : 32212254720,
          "limit_size" : "30gb",
          "estimated_size_in_bytes" : 253563286,
          "estimated_size" : "241.8mb",
          "overhead" : 1.0,
          "tripped" : 0
        },
        "parent" : {
          "limit_size_in_bytes" : 30601641984,
          "limit_size" : "28.5gb",
          "estimated_size_in_bytes" : 9507863760,
          "estimated_size" : "8.8gb",
          "overhead" : 1.0,
          "tripped" : 1125
        }
      }
    },
    .................... cut for 13kb limit .......................
    }
  }
}

I can see that you have the opendistro-sql third party plugin installed. Are you using this for querying the cluster?

This plugin is not supported here and I am not familiar with its performance characteristics, heap usage or potential bugs that might contribute to the issues you are seeing. I would recommend asking the OpenDistro community for assistance or uninstalling the plugin to see if this has any impact.

1 Like

Thanks @Christian_Dahlqvist for your pointer. I'll take a look at the heap dump at OOM to verify whether or not this is the case.
At first glance it seems unlikely, because the OOM happens at the data node level, which doesn't run Opendistro.

Is there something specific that suggests to you that it is failing due to Opendistro?

Even in that case, do you think that we have the guarantee that if an entire shard dies (primary+replica), we will never get a 200?

As I have no experience with this plugin I do not know how it is deployed or what load it puts on the cluster.

I think you can control this through the allow_partial_search_results parameter described in the docs.

Oh that setting looks incredibly interesting! Thank you @Christian_Dahlqvist

Please let me know if there is anything else that could help this OOM problem or the best way to debug it.

Do you have the HTTP request and response you can share?

We don't know how that code interacts with Elasticsearch, and our standard approach to 3rd party/unsupported plugins is to verify the issue exists without that plugin in place.

@warkolm understandable :slight_smile:

I've continued with the investigation, looking at almost all prometheus ES metrics.
In the end, it looks like all of the metrics start spiking only AFTER GC starts, and when the old GC kicks in everything is doomed.

This is especially problematic if the instance becomes unresponsive and other requests start piling up, causing other machines to crash as well.

Assuming there are no memory leaks (trying to get the heap dump), is there a way to either:

  • remove the server from the cluster and force GCs every X
  • verify that it is not caused by some rogue query.
  • some other solution. Happy for any feedback!

Premises:

  • all thread pools are normal (close to 0 active). Only after old GC does the search one start spiking
  • there are no merge requests going on
  • no major segments operations
  • no major write operations
  • no waits on IO/net

Let's take into consideration the one that happened at 00:22 (22 minutes past midnight).

Is there anything in the Elasticsearch logs around slow GC?