Request memory constantly increases until it hits the request circuit breaker limit. No drop if we stop queries

Hi All,

We noticed that the aggregation query we use to roll up data in a Python script is causing our Elasticsearch cluster's request memory to increase constantly until it hits the request circuit breaker limit.
What we have isolated is that the queries consume memory which is not freed up even when we stop our python script. The memory consumption just plateaus and stays at that size for days. We need to restart the node in order to free up that memory.

Please find attached the images from our Grafana

Thank you in advance for your assistance.

What is the output from the _cluster/stats?pretty&human API?
What do your node GC logs show? What about slow logs?
What does your script look like?

API response:

{
  "_nodes" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "cluster_name" : "elastic_cluster",
  "cluster_uuid" : "IYdUO8FsR2SA-UfdvrXung",
  "timestamp" : 1632300475669,
  "status" : "green",
  "indices" : {
    "count" : 43,
    "shards" : {
      "total" : 122,
      "primaries" : 61,
      "replication" : 1.0,
      "index" : {
        "shards" : {
          "min" : 2,
          "max" : 8,
          "avg" : 2.8372093023255816
        },
        "primaries" : {
          "min" : 1,
          "max" : 4,
          "avg" : 1.4186046511627908
        },
        "replication" : {
          "min" : 1.0,
          "max" : 1.0,
          "avg" : 1.0
        }
      }
    },
    "docs" : {
      "count" : 2635045747,
      "deleted" : 1646158
    },
    "store" : {
      "size" : "877.3gb",
      "size_in_bytes" : 942014990721,
      "total_data_set_size" : "877.3gb",
      "total_data_set_size_in_bytes" : 942014990721,
      "reserved" : "0b",
      "reserved_in_bytes" : 0
    },
    "fielddata" : {
      "memory_size" : "67.8kb",
      "memory_size_in_bytes" : 69504,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size" : "245.9mb",
      "memory_size_in_bytes" : 257872657,
      "total_count" : 8707567,
      "hit_count" : 1030173,
      "miss_count" : 7677394,
      "cache_size" : 21401,
      "cache_count" : 125636,
      "evictions" : 104235
    },
    "completion" : {
      "size" : "0b",
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 1999,
      "memory" : "26.6mb",
      "memory_in_bytes" : 27927104,
      "terms_memory" : "4.7mb",
      "terms_memory_in_bytes" : 4969904,
      "stored_fields_memory" : "1.3mb",
      "stored_fields_memory_in_bytes" : 1436712,
      "term_vectors_memory" : "0b",
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory" : "418kb",
      "norms_memory_in_bytes" : 428096,
      "points_memory" : "0b",
      "points_memory_in_bytes" : 0,
      "doc_values_memory" : "20.1mb",
      "doc_values_memory_in_bytes" : 21092392,
      "index_writer_memory" : "128.5mb",
      "index_writer_memory_in_bytes" : 134787372,
      "version_map_memory" : "3mb",
      "version_map_memory_in_bytes" : 3200080,
      "fixed_bit_set" : "1mb",
      "fixed_bit_set_memory_in_bytes" : 1103000,
      "max_unsafe_auto_id_timestamp" : 1632283958783,
      "file_sizes" : { }
    },
    "mappings" : {
      "field_types" : [
        {
          "name" : "boolean",
          "count" : 44,
          "index_count" : 16,
          "script_count" : 0
        },
        {
          "name" : "date",
          "count" : 84,
          "index_count" : 30,
          "script_count" : 0
        },
        {
          "name" : "date_nanos",
          "count" : 4,
          "index_count" : 4,
          "script_count" : 0
        },
        {
          "name" : "double",
          "count" : 89,
          "index_count" : 8,
          "script_count" : 0
        },
        {
          "name" : "float",
          "count" : 422,
          "index_count" : 17,
          "script_count" : 0
        },
        {
          "name" : "half_float",
          "count" : 56,
          "index_count" : 14,
          "script_count" : 0
        },
        {
          "name" : "integer",
          "count" : 154,
          "index_count" : 7,
          "script_count" : 0
        },
        {
          "name" : "keyword",
          "count" : 502,
          "index_count" : 30,
          "script_count" : 0
        },
        {
          "name" : "long",
          "count" : 2231,
          "index_count" : 30,
          "script_count" : 0
        },
        {
          "name" : "nested",
          "count" : 21,
          "index_count" : 7,
          "script_count" : 0
        },
        {
          "name" : "object",
          "count" : 821,
          "index_count" : 20,
          "script_count" : 0
        },
        {
          "name" : "text",
          "count" : 95,
          "index_count" : 23,
          "script_count" : 0
        }
      ],
      "runtime_field_types" : [ ]
    },
    "analysis" : {
      "char_filter_types" : [ ],
      "tokenizer_types" : [ ],
      "filter_types" : [ ],
      "analyzer_types" : [ ],
      "built_in_char_filters" : [ ],
      "built_in_tokenizers" : [ ],
      "built_in_filters" : [ ],
      "built_in_analyzers" : [ ]
    },
    "versions" : [
      {
        "version" : "7.13.4",
        "index_count" : 22,
        "primary_shard_count" : 24,
        "total_primary_size" : "131.9gb",
        "total_primary_bytes" : 141679815974
      },
      {
        "version" : "7.14.1",
        "index_count" : 21,
        "primary_shard_count" : 37,
        "total_primary_size" : "306.3gb",
        "total_primary_bytes" : 328985594725
      }
    ]
  },
  "nodes" : {
    "count" : {
      "total" : 5,
      "coordinating_only" : 0,
      "data" : 4,
      "data_cold" : 4,
      "data_content" : 4,
      "data_frozen" : 4,
      "data_hot" : 4,
      "data_warm" : 4,
      "ingest" : 4,
      "master" : 5,
      "ml" : 4,
      "remote_cluster_client" : 4,
      "transform" : 4,
      "voting_only" : 1
    },
    "versions" : [
      "7.14.1"
    ],
    "os" : {
      "available_processors" : 33,
      "allocated_processors" : 33,
      "names" : [
        {
          "name" : "Linux",
          "count" : 5
        }
      ],
      "pretty_names" : [
        {
          "pretty_name" : "Ubuntu 20.04.3 LTS",
          "count" : 2
        },
        {
          "pretty_name" : "Ubuntu 20.04.2 LTS",
          "count" : 3
        }
      ],
      "architectures" : [
        {
          "arch" : "amd64",
          "count" : 5
        }
      ],
      "mem" : {
        "total" : "126.4gb",
        "total_in_bytes" : 135736377344,
        "free" : "7.4gb",
        "free_in_bytes" : 7987986432,
        "used" : "118.9gb",
        "used_in_bytes" : 127748390912,
        "free_percent" : 6,
        "used_percent" : 94
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 224
      },
      "open_file_descriptors" : {
        "min" : 398,
        "max" : 892,
        "avg" : 756
      }
    },
    "jvm" : {
      "max_uptime" : "1.8d",
      "max_uptime_in_millis" : 157460932,
      "versions" : [
        {
          "version" : "16.0.2",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "16.0.2+7",
          "vm_vendor" : "Eclipse Foundation",
          "bundled_jdk" : true,
          "using_bundled_jdk" : true,
          "count" : 5
        }
      ],
      "mem" : {
        "heap_used" : "38.2gb",
        "heap_used_in_bytes" : 41056714640,
        "heap_max" : "63.1gb",
        "heap_max_in_bytes" : 67788341248
      },
      "threads" : 388
    },
    "fs" : {
      "total" : "2.4tb",
      "total_in_bytes" : 2729094082560,
      "free" : "1.5tb",
      "free_in_bytes" : 1755310579712,
      "available" : "1.4tb",
      "available_in_bytes" : 1616584454144
    },
    "plugins" : [ ],
    "network_types" : {
      "transport_types" : {
        "security4" : 5
      },
      "http_types" : {
        "security4" : 5
      }
    },
    "discovery_types" : {
      "zen" : 5
    },
    "packaging_types" : [
      {
        "flavor" : "default",
        "type" : "deb",
        "count" : 5
      }
    ],
    "ingest" : {
      "number_of_pipelines" : 2,
      "processor_stats" : {
        "gsub" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "script" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        }
      }
    }
  }
}

Our aggregation query, which runs every minute over the last minute of data:

{
    "runtime_mappings": {
      "field.name": {
        "type": "keyword",
        "script": "<REPLACE_ME>"
      }
    },
    "aggs": {
      "grouping": {
        "multi_terms": {
          "terms": [
            { "field": "database.keyword" },
            { "field": "table.keyword" },
            { "field": "host_id.keyword" },
            { "field": "field.name" }
          ],
          "size": 100000
        },
        "aggs": {
          "last": {
            "top_metrics": {
              "metrics": { "field": "<REPLACE_ME>" },
              "sort": { "@timestamp": { "order": "desc" } }
            }
          },
          "first": {
            "top_metrics": {
              "metrics": { "field": "<REPLACE_ME>" },
              "sort": { "@timestamp": "asc" }
            }
          },
          "stats": {
            "stats": { "field": "<REPLACE_ME>" }
          }
        }
      }
    },
    "query": {
      "bool": {
        "must": [
          { "range": { "@timestamp": "<REPLACE_ME>" } },
          { "match": { "table.keyword": "<REPLACE_ME>" } },
          { "match": { "database.keyword": "<REPLACE_ME>" } }
        ]
      }
    },
    "size": 0
  }

gc.log node 1

[2021-09-22T09:07:45.312+0000][1665847][safepoint   ] Safepoint "G1CollectForAllocation", Time since last: 18977766411 ns, Reaching safepoint: 239330 ns, At safepoint: 13485245 ns, Total: 13724575 ns
[2021-09-22T09:08:02.333+0000][1665847][gc,start    ] GC(5556) Pause Young (Normal) (G1 Evacuation Pause)
[2021-09-22T09:08:02.333+0000][1665847][gc,task     ] GC(5556) Using 8 workers of 8 for evacuation
[2021-09-22T09:08:02.333+0000][1665847][gc,age      ] GC(5556) Desired survivor size 633339904 bytes, new threshold 15 (max threshold 15)
[2021-09-22T09:08:02.349+0000][1665847][gc,age      ] GC(5556) Age table with threshold 15 (max threshold 15)
[2021-09-22T09:08:02.349+0000][1665847][gc,age      ] GC(5556) - age   1:   42367392 bytes,   42367392 total
[2021-09-22T09:08:02.349+0000][1665847][gc,age      ] GC(5556) - age   2:    1044944 bytes,   43412336 total
[2021-09-22T09:08:02.349+0000][1665847][gc,age      ] GC(5556) - age   3:    1797744 bytes,   45210080 total
[2021-09-22T09:08:02.349+0000][1665847][gc,age      ] GC(5556) - age   4:     879576 bytes,   46089656 total
[2021-09-22T09:08:02.349+0000][1665847][gc,age      ] GC(5556) - age   5:    1885776 bytes,   47975432 total
[2021-09-22T09:08:02.349+0000][1665847][gc,age      ] GC(5556) - age   6:     215968 bytes,   48191400 total
[2021-09-22T09:08:02.349+0000][1665847][gc,age      ] GC(5556) - age   7:     833064 bytes,   49024464 total
[2021-09-22T09:08:02.349+0000][1665847][gc,age      ] GC(5556) - age   8:      18184 bytes,   49042648 total
[2021-09-22T09:08:02.349+0000][1665847][gc,age      ] GC(5556) - age   9:     577400 bytes,   49620048 total
[2021-09-22T09:08:02.349+0000][1665847][gc,age      ] GC(5556) - age  10:     314312 bytes,   49934360 total
[2021-09-22T09:08:02.349+0000][1665847][gc,age      ] GC(5556) - age  11:     538112 bytes,   50472472 total
[2021-09-22T09:08:02.349+0000][1665847][gc,age      ] GC(5556) - age  12:      18440 bytes,   50490912 total
[2021-09-22T09:08:02.349+0000][1665847][gc,age      ] GC(5556) - age  13:     518424 bytes,   51009336 total
[2021-09-22T09:08:02.349+0000][1665847][gc,age      ] GC(5556) - age  14:       5864 bytes,   51015200 total
[2021-09-22T09:08:02.349+0000][1665847][gc,age      ] GC(5556) - age  15:     332528 bytes,   51347728 total
[2021-09-22T09:08:02.349+0000][1665847][gc,phases   ] GC(5556)   Pre Evacuate Collection Set: 0.5ms
[2021-09-22T09:08:02.349+0000][1665847][gc,phases   ] GC(5556)   Merge Heap Roots: 0.5ms
[2021-09-22T09:08:02.349+0000][1665847][gc,phases   ] GC(5556)   Evacuate Collection Set: 8.4ms
[2021-09-22T09:08:02.349+0000][1665847][gc,phases   ] GC(5556)   Post Evacuate Collection Set: 5.8ms
[2021-09-22T09:08:02.349+0000][1665847][gc,phases   ] GC(5556)   Other: 0.9ms
[2021-09-22T09:08:02.349+0000][1665847][gc,heap     ] GC(5556) Eden regions: 1197->0(1197)
[2021-09-22T09:08:02.349+0000][1665847][gc,heap     ] GC(5556) Survivor regions: 7->7(151)
[2021-09-22T09:08:02.349+0000][1665847][gc,heap     ] GC(5556) Old regions: 44->44
[2021-09-22T09:08:02.349+0000][1665847][gc,heap     ] GC(5556) Archive regions: 2->2
[2021-09-22T09:08:02.349+0000][1665847][gc,heap     ] GC(5556) Humongous regions: 7->7
[2021-09-22T09:08:02.349+0000][1665847][gc,metaspace] GC(5556) Metaspace: 137281K(139072K)->137281K(139072K) NonClass: 120373K(121408K)->120373K(121408K) Class: 16908K(17664K)->16908K(17664K)
[2021-09-22T09:08:02.349+0000][1665847][gc          ] GC(5556) Pause Young (Normal) (G1 Evacuation Pause) 10038M->462M(16064M) 16.161ms
[2021-09-22T09:08:02.349+0000][1665847][gc,cpu      ] GC(5556) User=0.07s Sys=0.01s Real=0.02s
[2021-09-22T09:08:02.349+0000][1665847][safepoint   ] Safepoint "G1CollectForAllocation", Time since last: 17020409615 ns, Reaching safepoint: 174960 ns, At safepoint: 16377191 ns, Total: 16552151 ns
[2021-09-22T09:08:29.842+0000][1665847][gc,start    ] GC(5557) Pause Young (Normal) (G1 Evacuation Pause)
[2021-09-22T09:08:29.842+0000][1665847][gc,task     ] GC(5557) Using 8 workers of 8 for evacuation
[2021-09-22T09:08:29.842+0000][1665847][gc,age      ] GC(5557) Desired survivor size 633339904 bytes, new threshold 15 (max threshold 15)
[2021-09-22T09:08:29.854+0000][1665847][gc,age      ] GC(5557) Age table with threshold 15 (max threshold 15)
[2021-09-22T09:08:29.855+0000][1665847][gc,age      ] GC(5557) - age   1:   24884808 bytes,   24884808 total
[2021-09-22T09:08:29.855+0000][1665847][gc,age      ] GC(5557) - age   2:     940816 bytes,   25825624 total
[2021-09-22T09:08:29.855+0000][1665847][gc,age      ] GC(5557) - age   3:     791752 bytes,   26617376 total
[2021-09-22T09:08:29.855+0000][1665847][gc,age      ] GC(5557) - age   4:    1044896 bytes,   27662272 total
[2021-09-22T09:08:29.855+0000][1665847][gc,age      ] GC(5557) - age   5:     741616 bytes,   28403888 total
[2021-09-22T09:08:29.855+0000][1665847][gc,age      ] GC(5557) - age   6:    1718384 bytes,   30122272 total
[2021-09-22T09:08:29.855+0000][1665847][gc,age      ] GC(5557) - age   7:     213168 bytes,   30335440 total
[2021-09-22T09:08:29.855+0000][1665847][gc,age      ] GC(5557) - age   8:     733208 bytes,   31068648 total
[2021-09-22T09:08:29.855+0000][1665847][gc,age      ] GC(5557) - age   9:      16712 bytes,   31085360 total
[2021-09-22T09:08:29.855+0000][1665847][gc,age      ] GC(5557) - age  10:     459400 bytes,   31544760 total
[2021-09-22T09:08:29.855+0000][1665847][gc,age      ] GC(5557) - age  11:     274816 bytes,   31819576 total
[2021-09-22T09:08:29.855+0000][1665847][gc,age      ] GC(5557) - age  12:     480968 bytes,   32300544 total
[2021-09-22T09:08:29.855+0000][1665847][gc,age      ] GC(5557) - age  13:      12680 bytes,   32313224 total
[2021-09-22T09:08:29.855+0000][1665847][gc,age      ] GC(5557) - age  14:     516544 bytes,   32829768 total
[2021-09-22T09:08:29.855+0000][1665847][gc,age      ] GC(5557) - age  15:       4360 bytes,   32834128 total
[2021-09-22T09:08:29.855+0000][1665847][gc,phases   ] GC(5557)   Pre Evacuate Collection Set: 0.6ms
[2021-09-22T09:08:29.855+0000][1665847][gc,phases   ] GC(5557)   Merge Heap Roots: 0.5ms
[2021-09-22T09:08:29.855+0000][1665847][gc,phases   ] GC(5557)   Evacuate Collection Set: 7.6ms
[2021-09-22T09:08:29.855+0000][1665847][gc,phases   ] GC(5557)   Post Evacuate Collection Set: 3.5ms
[2021-09-22T09:08:29.855+0000][1665847][gc,phases   ] GC(5557)   Other: 0.8ms
[2021-09-22T09:08:29.855+0000][1665847][gc,heap     ] GC(5557) Eden regions: 1197->0(1199)
[2021-09-22T09:08:29.855+0000][1665847][gc,heap     ] GC(5557) Survivor regions: 7->5(151)
[2021-09-22T09:08:29.855+0000][1665847][gc,heap     ] GC(5557) Old regions: 44->44
[2021-09-22T09:08:29.855+0000][1665847][gc,heap     ] GC(5557) Archive regions: 2->2
[2021-09-22T09:08:29.855+0000][1665847][gc,heap     ] GC(5557) Humongous regions: 7->7
[2021-09-22T09:08:29.855+0000][1665847][gc,metaspace] GC(5557) Metaspace: 137281K(139072K)->137281K(139072K) NonClass: 120373K(121408K)->120373K(121408K) Class: 16908K(17664K)->16908K(17664K)
[2021-09-22T09:08:29.855+0000][1665847][gc          ] GC(5557) Pause Young (Normal) (G1 Evacuation Pause) 10038M->444M(16064M) 13.097ms
[2021-09-22T09:08:29.855+0000][1665847][gc,cpu      ] GC(5557) User=0.08s Sys=0.00s Real=0.01s
[2021-09-22T09:08:29.855+0000][1665847][safepoint   ] Safepoint "G1CollectForAllocation", Time since last: 27492352383 ns, Reaching safepoint: 333030 ns, At safepoint: 13468544 ns, Total: 13801574 ns
[2021-09-22T09:08:46.007+0000][1665847][gc,start    ] GC(5558) Pause Young (Normal) (G1 Evacuation Pause)
[2021-09-22T09:08:46.007+0000][1665847][gc,task     ] GC(5558) Using 8 workers of 8 for evacuation
[2021-09-22T09:08:46.008+0000][1665847][gc,age      ] GC(5558) Desired survivor size 633339904 bytes, new threshold 15 (max threshold 15)
[2021-09-22T09:08:46.022+0000][1665847][gc,age      ] GC(5558) Age table with threshold 15 (max threshold 15)
[2021-09-22T09:08:46.022+0000][1665847][gc,age      ] GC(5558) - age   1:   37129808 bytes,   37129808 total
[2021-09-22T09:08:46.022+0000][1665847][gc,age      ] GC(5558) - age   2:    9904568 bytes,   47034376 total
[2021-09-22T09:08:46.022+0000][1665847][gc,age      ] GC(5558) - age   3:     925328 bytes,   47959704 total
[2021-09-22T09:08:46.022+0000][1665847][gc,age      ] GC(5558) - age   4:     783424 bytes,   48743128 total
[2021-09-22T09:08:46.022+0000][1665847][gc,age      ] GC(5558) - age   5:    1040976 bytes,   49784104 total
[2021-09-22T09:08:46.022+0000][1665847][gc,age      ] GC(5558) - age   6:     607968 bytes,   50392072 total
[2021-09-22T09:08:46.022+0000][1665847][gc,age      ] GC(5558) - age   7:    1717296 bytes,   52109368 total
[2021-09-22T09:08:46.022+0000][1665847][gc,age      ] GC(5558) - age   8:     211440 bytes,   52320808 total
[2021-09-22T09:08:46.022+0000][1665847][gc,age      ] GC(5558) - age   9:     730904 bytes,   53051712 total
[2021-09-22T09:08:46.022+0000][1665847][gc,age      ] GC(5558) - age  10:      15896 bytes,   53067608 total
[2021-09-22T09:08:46.022+0000][1665847][gc,age      ] GC(5558) - age  11:     457352 bytes,   53524960 total
[2021-09-22T09:08:46.022+0000][1665847][gc,age      ] GC(5558) - age  12:     272208 bytes,   53797168 total
[2021-09-22T09:08:46.022+0000][1665847][gc,age      ] GC(5558) - age  13:     476616 bytes,   54273784 total
[2021-09-22T09:08:46.022+0000][1665847][gc,age      ] GC(5558) - age  14:      12104 bytes,   54285888 total
[2021-09-22T09:08:46.022+0000][1665847][gc,age      ] GC(5558) - age  15:     514560 bytes,   54800448 total
[2021-09-22T09:08:46.022+0000][1665847][gc,phases   ] GC(5558)   Pre Evacuate Collection Set: 0.5ms
[2021-09-22T09:08:46.022+0000][1665847][gc,phases   ] GC(5558)   Merge Heap Roots: 0.5ms
[2021-09-22T09:08:46.022+0000][1665847][gc,phases   ] GC(5558)   Evacuate Collection Set: 9.3ms
[2021-09-22T09:08:46.022+0000][1665847][gc,phases   ] GC(5558)   Post Evacuate Collection Set: 3.7ms
[2021-09-22T09:08:46.022+0000][1665847][gc,phases   ] GC(5558)   Other: 0.8ms
[2021-09-22T09:08:46.022+0000][1665847][gc,heap     ] GC(5558) Eden regions: 1199->0(1197)
[2021-09-22T09:08:46.022+0000][1665847][gc,heap     ] GC(5558) Survivor regions: 5->7(151)
[2021-09-22T09:08:46.022+0000][1665847][gc,heap     ] GC(5558) Old regions: 44->44
[2021-09-22T09:08:46.022+0000][1665847][gc,heap     ] GC(5558) Archive regions: 2->2
[2021-09-22T09:08:46.022+0000][1665847][gc,heap     ] GC(5558) Humongous regions: 7->7
[2021-09-22T09:08:46.022+0000][1665847][gc,metaspace] GC(5558) Metaspace: 137281K(139072K)->137281K(139072K) NonClass: 120373K(121408K)->120373K(121408K) Class: 16908K(17664K)->16908K(17664K)
[2021-09-22T09:08:46.022+0000][1665847][gc          ] GC(5558) Pause Young (Normal) (G1 Evacuation Pause) 10036M->464M(16064M) 14.746ms
[2021-09-22T09:08:46.022+0000][1665847][gc,cpu      ] GC(5558) User=0.07s Sys=0.01s Real=0.02s
[2021-09-22T09:08:46.022+0000][1665847][safepoint   ] Safepoint "G1CollectForAllocation", Time since last: 16151664146 ns, Reaching safepoint: 321630 ns, At safepoint: 15044303 ns, Total: 15365933 ns

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.