Elasticsearch per-request circuit breakers and real memory

Hi. After upgrading to version 7.13.1, with its new circuit breakers that use real memory, we are having issues with heavy aggregations on our Elasticsearch cluster. Per-request circuit breakers are not being tripped, and nodes (58GB of RAM and 28GB of JVM heap) are being evicted from the cluster due to huge memory pressure.

Is there some configuration missing?

Circuit breaker settings:

indices.breaker.total.use_real_memory: true
{
  "persistent" : {
    "indices" : {
      "breaker" : {
        "fielddata" : {
          "limit" : "10%"
        },
        "request" : {
          "limit" : "3%"
        }
      }
    }
  }
}

Circuit breaker stats on example node:

     "breakers" : {
       "request" : {
         "limit_size_in_bytes" : 901943132,
         "limit_size" : "860.1mb",
         "estimated_size_in_bytes" : 278528,
         "estimated_size" : "272kb",
         "overhead" : 1.0,
         "tripped" : 0
       },
       "fielddata" : {
         "limit_size_in_bytes" : 3006477107,
         "limit_size" : "2.7gb",
         "estimated_size_in_bytes" : 781288,
         "estimated_size" : "762.9kb",
         "overhead" : 1.03,
         "tripped" : 0
       },
       "in_flight_requests" : {
         "limit_size_in_bytes" : 30064771072,
         "limit_size" : "28gb",
         "estimated_size_in_bytes" : 1624685,
         "estimated_size" : "1.5mb",
         "overhead" : 2.0,
         "tripped" : 0
       },
       "model_inference" : {
         "limit_size_in_bytes" : 15032385536,
         "limit_size" : "14gb",
         "estimated_size_in_bytes" : 0,
         "estimated_size" : "0b",
         "overhead" : 1.0,
         "tripped" : 0
       },
       "accounting" : {
         "limit_size_in_bytes" : 30064771072,
         "limit_size" : "28gb",
         "estimated_size_in_bytes" : 95191136,
         "estimated_size" : "90.7mb",
         "overhead" : 1.0,
         "tripped" : 0
       },
       "parent" : {
         "limit_size_in_bytes" : 28561532518,
         "limit_size" : "26.5gb",
         "estimated_size_in_bytes" : 15394289608,
         "estimated_size" : "14.3gb",
         "overhead" : 1.0,
         "tripped" : 101432
       }

What is the output from the _cluster/stats?pretty&human API?

On Elasticsearch version 6, the above configuration worked without issues. I could revert to the legacy circuit breakers, but that would just be sweeping the problem under the rug. Anyway, here are the cluster stats:

{
  "_nodes" : {
    "total" : 33,
    "successful" : 33,
    "failed" : 0
  },
  "cluster_name" : "production_cluster",
  "cluster_uuid" : "1a-ytX4lS96SzOMoiLVmDQ",
  "timestamp" : 1637101329710,
  "status" : "green",
  "indices" : {
    "count" : 78,
    "shards" : {
      "total" : 2766,
      "primaries" : 1383,
      "replication" : 1.0,
      "index" : {
        "shards" : {
          "min" : 2,
          "max" : 60,
          "avg" : 35.46153846153846
        },
        "primaries" : {
          "min" : 1,
          "max" : 30,
          "avg" : 17.73076923076923
        },
        "replication" : {
          "min" : 1.0,
          "max" : 1.0,
          "avg" : 1.0
        }
      }
    },
    "docs" : {
      "count" : 31690553001,
      "deleted" : 841865803
    },
    "store" : {
      "size" : "60.4tb",
      "size_in_bytes" : 66424149510743,
      "total_data_set_size" : "60.4tb",
      "total_data_set_size_in_bytes" : 66424149510743,
      "reserved" : "0b",
      "reserved_in_bytes" : 0
    },
    "fielddata" : {
      "memory_size" : "9.7mb",
      "memory_size_in_bytes" : 10270280,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size" : "12gb",
      "memory_size_in_bytes" : 12887714548,
      "total_count" : 420601599,
      "hit_count" : 54074669,
      "miss_count" : 366526930,
      "cache_size" : 424067,
      "cache_count" : 428104,
      "evictions" : 4037
    },
    "completion" : {
      "size" : "0b",
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 79254,
      "memory" : "2.6gb",
      "memory_in_bytes" : 2824804906,
      "terms_memory" : "901.7mb",
      "terms_memory_in_bytes" : 945580616,
      "stored_fields_memory" : "60.4mb",
      "stored_fields_memory_in_bytes" : 63388560,
      "term_vectors_memory" : "0b",
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory" : "11.7mb",
      "norms_memory_in_bytes" : 12281152,
      "points_memory" : "0b",
      "points_memory_in_bytes" : 0,
      "doc_values_memory" : "1.6gb",
      "doc_values_memory_in_bytes" : 1803554578,
      "index_writer_memory" : "24.2gb",
      "index_writer_memory_in_bytes" : 26071634942,
      "version_map_memory" : "2.3gb",
      "version_map_memory_in_bytes" : 2529854508,
      "fixed_bit_set" : "5kb",
      "fixed_bit_set_memory_in_bytes" : 5216,
      "max_unsafe_auto_id_timestamp" : 1636507800398,
      "file_sizes" : { }
    },
    "mappings" : {
      "field_types" : [
        {
          "name" : "boolean",
          "count" : 510,
          "index_count" : 36,
          "script_count" : 0
        },
        {
          "name" : "byte",
          "count" : 15,
          "index_count" : 15,
          "script_count" : 0
        },
        {
          "name" : "date",
          "count" : 1227,
          "index_count" : 55,
          "script_count" : 0
        },
        {
          "name" : "double",
          "count" : 240,
          "index_count" : 15,
          "script_count" : 0
        },
        {
          "name" : "float",
          "count" : 8,
          "index_count" : 4,
          "script_count" : 0
        },
        {
          "name" : "integer",
          "count" : 750,
          "index_count" : 15,
          "script_count" : 0
        },
        {
          "name" : "keyword",
          "count" : 2768,
          "index_count" : 55,
          "script_count" : 0
        },
        {
          "name" : "long",
          "count" : 2574,
          "index_count" : 53,
          "script_count" : 0
        },
        {
          "name" : "nested",
          "count" : 4,
          "index_count" : 4,
          "script_count" : 0
        },
        {
          "name" : "object",
          "count" : 368,
          "index_count" : 40,
          "script_count" : 0
        },
        {
          "name" : "short",
          "count" : 930,
          "index_count" : 15,
          "script_count" : 0
        },
        {
          "name" : "text",
          "count" : 135,
          "index_count" : 40,
          "script_count" : 0
        }
      ],
      "runtime_field_types" : [ ]
    },
    "analysis" : {
      "char_filter_types" : [ ],
      "tokenizer_types" : [ ],
      "filter_types" : [ ],
      "analyzer_types" : [
        {
          "name" : "custom",
          "count" : 15,
          "index_count" : 15
        }
      ],
      "built_in_char_filters" : [ ],
      "built_in_tokenizers" : [
        {
          "name" : "keyword",
          "count" : 15,
          "index_count" : 15
        }
      ],
      "built_in_filters" : [
        {
          "name" : "lowercase",
          "count" : 15,
          "index_count" : 15
        }
      ],
      "built_in_analyzers" : [ ]
    },
    "versions" : [
      {
        "version" : "6.8.1",
        "index_count" : 13,
        "primary_shard_count" : 13,
        "total_primary_size" : "60.7mb",
        "total_primary_bytes" : 63707518
      },
      {
        "version" : "7.13.1",
        "index_count" : 65,
        "primary_shard_count" : 1370,
        "total_primary_size" : "30.2tb",
        "total_primary_bytes" : 33224534555546
      }
    ]
  },
  "nodes" : {
    "count" : {
      "total" : 33,
      "coordinating_only" : 0,
      "data" : 30,
      "data_cold" : 30,
      "data_content" : 30,
      "data_frozen" : 30,
      "data_hot" : 30,
      "data_warm" : 30,
      "ingest" : 33,
      "master" : 3,
      "ml" : 33,
      "remote_cluster_client" : 33,
      "transform" : 30,
      "voting_only" : 0
    },
    "versions" : [
      "7.13.1"
    ],
    "os" : {
      "available_processors" : 492,
      "allocated_processors" : 492,
      "names" : [
        {
          "name" : "Linux",
          "count" : 33
        }
      ],
      "pretty_names" : [
        {
          "pretty_name" : "CentOS Linux 8",
          "count" : 33
        }
      ],
      "architectures" : [
        {
          "arch" : "amd64",
          "count" : 33
        }
      ],
      "mem" : {
        "total" : "1.6tb",
        "total_in_bytes" : 1842078707712,
        "free" : "36.8gb",
        "free_in_bytes" : 39605731328,
        "used" : "1.6tb",
        "used_in_bytes" : 1802472976384,
        "free_percent" : 2,
        "used_percent" : 98
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 426
      },
      "open_file_descriptors" : {
        "min" : 1027,
        "max" : 2880,
        "avg" : 2596
      }
    },
    "jvm" : {
      "max_uptime" : "3.8h",
      "max_uptime_in_millis" : 13937991,
      "versions" : [
        {
          "version" : "16",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "16+36",
          "vm_vendor" : "AdoptOpenJDK",
          "bundled_jdk" : true,
          "using_bundled_jdk" : true,
          "count" : 33
        }
      ],
      "mem" : {
        "heap_used" : "406.6gb",
        "heap_used_in_bytes" : 436662002848,
        "heap_max" : "852gb",
        "heap_max_in_bytes" : 914828034048
      },
      "threads" : 4007
    },
    "fs" : {
      "total" : "134.7tb",
      "total_in_bytes" : 148156018974720,
      "free" : "74tb",
      "free_in_bytes" : 81422377238528,
      "available" : "74tb",
      "available_in_bytes" : 81422377238528
    },
    "plugins" : [ ],
    "network_types" : {
      "transport_types" : {
        "security4" : 33
      },
      "http_types" : {
        "security4" : 33
      }
    },
    "discovery_types" : {
      "zen" : 33
    },
    "packaging_types" : [
      {
        "flavor" : "default",
        "type" : "docker",
        "count" : 33
      }
    ],
    "ingest" : {
      "number_of_pipelines" : 3,
      "processor_stats" : {
        "gsub" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "rename" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "script" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "set" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        }
      }
    }
  }
}

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.