Translog of index size is huge

TL;DR: one of my indices on one of my data nodes got out of hand and took up all the available disk space, making the node crash.

The Problem

On one of my data nodes, one of my biggest indices (~10G) suddenly had a translog that weighed approx. 90G - it filled up the entire disk. The node couldn't connect to the cluster.
I manually deleted (yes I know, very bad) this translog folder in my VM and restarted the node. It got back to normal.

How can I know what is the root cause of this, and how to prevent this from happening again?

Other Info

It has happened once already, when I was on v7.9.1. I thought upgrading might help me remedy that, but now on v7.16.2 it happened again.
Note that last time it was another index (the biggest one - 15G) that had this translog.

My cluster consists of: 3 master nodes, and 2 data nodes, 1 node for Kibana and Logstash.

My cluster is not stable. In the last few weeks, one of the two data nodes sometimes crashes, and until the cluster recovers to green it can't index files and is very slow (CPU usage is at 90%+, while normally it's at 5%-15%).

Welcome to our community! :smiley:

What is the output from the _cluster/stats?pretty&human API?

Hey thanks for the welcome,

This is the output of _cluster/stats/?pretty&human:

{
  "_nodes" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "cluster_name" : "es-####-uat",
  "cluster_uuid" : "####",
  "timestamp" : 1644225237984,
  "status" : "green",
  "indices" : {
    "count" : 158,
    "shards" : {
      "total" : 316,
      "primaries" : 158,
      "replication" : 1.0,
      "index" : {
        "shards" : {
          "min" : 2,
          "max" : 2,
          "avg" : 2.0
        },
        "primaries" : {
          "min" : 1,
          "max" : 1,
          "avg" : 1.0
        },
        "replication" : {
          "min" : 1.0,
          "max" : 1.0,
          "avg" : 1.0
        }
      }
    },
    "docs" : {
      "count" : 96375660,
      "deleted" : 7659867
    },
    "store" : {
      "size" : "96.9gb",
      "size_in_bytes" : 104069007267,
      "total_data_set_size" : "96.9gb",
      "total_data_set_size_in_bytes" : 104069007267,
      "reserved" : "0b",
      "reserved_in_bytes" : 0
    },
    "fielddata" : {
      "memory_size" : "12.7mb",
      "memory_size_in_bytes" : 13381592,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size" : "1.6mb",
      "memory_size_in_bytes" : 1718366,
      "total_count" : 2495295,
      "hit_count" : 88484,
      "miss_count" : 2406811,
      "cache_size" : 426,
      "cache_count" : 4622,
      "evictions" : 4196
    },
    "completion" : {
      "size" : "0b",
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 2136,
      "memory" : "41mb",
      "memory_in_bytes" : 43034390,
      "terms_memory" : "22.1mb",
      "terms_memory_in_bytes" : 23258224,
      "stored_fields_memory" : "1.2mb",
      "stored_fields_memory_in_bytes" : 1346256,
      "term_vectors_memory" : "0b",
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory" : "2.6mb",
      "norms_memory_in_bytes" : 2782144,
      "points_memory" : "0b",
      "points_memory_in_bytes" : 0,
      "doc_values_memory" : "14.9mb",
      "doc_values_memory_in_bytes" : 15647766,
      "index_writer_memory" : "99.5mb",
      "index_writer_memory_in_bytes" : 104420768,
      "version_map_memory" : "6.5kb",
      "version_map_memory_in_bytes" : 6669,
      "fixed_bit_set" : "24mb",
      "fixed_bit_set_memory_in_bytes" : 25233152,
      "max_unsafe_auto_id_timestamp" : 1644218472666,
      "file_sizes" : { }
    },
    "mappings" : {
      "field_types" : [
        {
          "name" : "alias",
          "count" : 64,
          "index_count" : 4,
          "script_count" : 0
        },
        {
          "name" : "boolean",
          "count" : 1469,
          "index_count" : 108,
          "script_count" : 0
        },
        {
          "name" : "byte",
          "count" : 3,
          "index_count" : 3,
          "script_count" : 0
        },
        {
          "name" : "constant_keyword",
          "count" : 12,
          "index_count" : 4,
          "script_count" : 0
        },
        {
          "name" : "date",
          "count" : 1085,
          "index_count" : 145,
          "script_count" : 0
        },
        {
          "name" : "double",
          "count" : 423,
          "index_count" : 12,
          "script_count" : 0
        },
        {
          "name" : "flattened",
          "count" : 103,
          "index_count" : 4,
          "script_count" : 0
        },
        {
          "name" : "float",
          "count" : 452,
          "index_count" : 21,
          "script_count" : 0
        },
        {
          "name" : "geo_point",
          "count" : 39,
          "index_count" : 4,
          "script_count" : 0
        },
        {
          "name" : "half_float",
          "count" : 88,
          "index_count" : 24,
          "script_count" : 0
        },
        {
          "name" : "integer",
          "count" : 192,
          "index_count" : 16,
          "script_count" : 0
        },
        {
          "name" : "ip",
          "count" : 416,
          "index_count" : 5,
          "script_count" : 0
        },
        {
          "name" : "ip_range",
          "count" : 2,
          "index_count" : 2,
          "script_count" : 0
        },
        {
          "name" : "keyword",
          "count" : 21246,
          "index_count" : 143,
          "script_count" : 0
        },
        {
          "name" : "long",
          "count" : 8923,
          "index_count" : 137,
          "script_count" : 0
        },
        {
          "name" : "match_only_text",
          "count" : 195,
          "index_count" : 3,
          "script_count" : 0
        },
        {
          "name" : "nested",
          "count" : 111,
          "index_count" : 27,
          "script_count" : 0
        },
        {
          "name" : "object",
          "count" : 7850,
          "index_count" : 142,
          "script_count" : 0
        },
        {
          "name" : "scaled_float",
          "count" : 147,
          "index_count" : 3,
          "script_count" : 0
        },
        {
          "name" : "short",
          "count" : 331,
          "index_count" : 11,
          "script_count" : 0
        },
        {
          "name" : "text",
          "count" : 6772,
          "index_count" : 127,
          "script_count" : 0
        },
        {
          "name" : "version",
          "count" : 2,
          "index_count" : 2,
          "script_count" : 0
        },
        {
          "name" : "wildcard",
          "count" : 53,
          "index_count" : 3,
          "script_count" : 0
        }
      ],
      "runtime_field_types" : [ ]
    },
    "analysis" : {
      "char_filter_types" : [ ],
      "tokenizer_types" : [ ],
      "filter_types" : [ ],
      "analyzer_types" : [ ],
      "built_in_char_filters" : [ ],
      "built_in_tokenizers" : [ ],
      "built_in_filters" : [ ],
      "built_in_analyzers" : [ ]
    },
    "versions" : [
      {
        "version" : "7.9.1",
        "index_count" : 49,
        "primary_shard_count" : 49,
        "total_primary_size" : "26.1gb",
        "total_primary_bytes" : 28037644642
      },
      {
        "version" : "7.16.2",
        "index_count" : 95,
        "primary_shard_count" : 95,
        "total_primary_size" : "19.3gb",
        "total_primary_bytes" : 20822146675
      },
      {
        "version" : "7.17.0",
        "index_count" : 14,
        "primary_shard_count" : 14,
        "total_primary_size" : "872.2mb",
        "total_primary_bytes" : 914600972
      }
    ]
  },
  "nodes" : {
    "count" : {
      "total" : 5,
      "coordinating_only" : 0,
      "data" : 2,
      "data_cold" : 0,
      "data_content" : 0,
      "data_frozen" : 0,
      "data_hot" : 0,
      "data_warm" : 0,
      "ingest" : 5,
      "master" : 3,
      "ml" : 0,
      "remote_cluster_client" : 5,
      "transform" : 2,
      "voting_only" : 0
    },
    "versions" : [
      "7.17.0"
    ],
    "os" : {
      "available_processors" : 14,
      "allocated_processors" : 14,
      "names" : [
        {
          "name" : "Linux",
          "count" : 5
        }
      ],
      "pretty_names" : [
        {
          "pretty_name" : "Ubuntu 18.04.6 LTS",
          "count" : 5
        }
      ],
      "architectures" : [
        {
          "arch" : "amd64",
          "count" : 5
        }
      ],
      "mem" : {
        "total" : "54.5gb",
        "total_in_bytes" : 58609258496,
        "free" : "7.7gb",
        "free_in_bytes" : 8297578496,
        "used" : "46.8gb",
        "used_in_bytes" : 50311680000,
        "free_percent" : 14,
        "used_percent" : 86
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 45
      },
      "open_file_descriptors" : {
        "min" : 430,
        "max" : 1829,
        "avg" : 988
      }
    },
    "jvm" : {
      "max_uptime" : "18.1h",
      "max_uptime_in_millis" : 65490701,
      "versions" : [
        {
          "version" : "17.0.1",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "17.0.1+12",
          "vm_vendor" : "Eclipse Adoptium",
          "bundled_jdk" : true,
          "using_bundled_jdk" : true,
          "count" : 5
        }
      ],
      "mem" : {
        "heap_used" : "11.8gb",
        "heap_used_in_bytes" : 12685400424,
        "heap_max" : "28gb",
        "heap_max_in_bytes" : 30064771072
      },
      "threads" : 320
    },
    "fs" : {
      "total" : "441.4gb",
      "total_in_bytes" : 473991561216,
      "free" : "296.9gb",
      "free_in_bytes" : 318809591808,
      "available" : "296.8gb",
      "available_in_bytes" : 318759260160
    },
    "plugins" : [
      {
        "name" : "repository-azure",
        "version" : "7.17.0",
        "elasticsearch_version" : "7.17.0",
        "java_version" : "1.8",
        "description" : "The Azure Repository plugin adds support for Azure storage repositories.",
        "classname" : "org.elasticsearch.repositories.azure.AzureRepositoryPlugin",
        "extended_plugins" : [ ],
        "has_native_controller" : false,
        "licensed" : false,
        "type" : "isolated"
      }
    ],
    "network_types" : {
      "transport_types" : {
        "security4" : 5
      },
      "http_types" : {
        "security4" : 5
      }
    },
    "discovery_types" : {
      "zen" : 5
    },
    "packaging_types" : [
      {
        "flavor" : "default",
        "type" : "deb",
        "count" : 5
      }
    ],
    "ingest" : {
      "number_of_pipelines" : 53,
      "processor_stats" : {
        "append" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "conditional" : {
          "count" : 2286844,
          "failed" : 0,
          "current" : 0,
          "time" : "2.6m",
          "time_in_millis" : 159432
        },
        "convert" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "date" : {
          "count" : 801252,
          "failed" : 0,
          "current" : 0,
          "time" : "17.7s",
          "time_in_millis" : 17769
        },
        "dot_expander" : {
          "count" : 296144,
          "failed" : 0,
          "current" : 0,
          "time" : "670ms",
          "time_in_millis" : 670
        },
        "grok" : {
          "count" : 3847208,
          "failed" : 759128,
          "current" : 0,
          "time" : "1.7m",
          "time_in_millis" : 102958
        },
        "gsub" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "json" : {
          "count" : 37018,
          "failed" : 0,
          "current" : 0,
          "time" : "804ms",
          "time_in_millis" : 804
        },
        "pipeline" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "remove" : {
          "count" : 2477052,
          "failed" : 0,
          "current" : 0,
          "time" : "2.2s",
          "time_in_millis" : 2210
        },
        "rename" : {
          "count" : 2749937,
          "failed" : 0,
          "current" : 0,
          "time" : "5.4s",
          "time_in_millis" : 5439
        },
        "script" : {
          "count" : 1615487,
          "failed" : 759128,
          "current" : 0,
          "time" : "1m",
          "time_in_millis" : 63116
        },
        "set" : {
          "count" : 5603070,
          "failed" : 0,
          "current" : 0,
          "time" : "57.8s",
          "time_in_millis" : 57829
        },
        "split" : {
          "count" : 764250,
          "failed" : 0,
          "current" : 0,
          "time" : "1.5s",
          "time_in_millis" : 1537
        }
      }
    }
  }
}

Note: I've censored the name of the client with ####