Translog of index size is huge

TL;DR: one of my indices on one of my data nodes got out of hand and took all the available disk space, making the node crash.

The Problem

On one of my data nodes, one of my biggest indices (~10G) suddenly had a translog that weighed approx. 90G - it filled up the entire disk. The node couldn't connect to the cluster.
I manually deleted (yes, I know, very bad) this translog folder in my VM and restarted the node. It got back to normal.

How can I find out what the root cause of this is, and how can I prevent it from happening again?

Other Info

It happened once already, when I was on v7.9.1. I thought upgrading might help me remedy that, but now on v7.16.2 it happened again.
Note that last time it was another index (the biggest one - 15G) that had this translog.

My cluster consists of 3 master nodes, 2 data nodes, and 1 node for Kibana and Logstash.

My cluster is not stable. In the last few weeks, one of the two data nodes sometimes crashes, and until the cluster recovers to green, my cluster can't index files and is very slow (CPU usage is at 90%+ while normally it's at 5%-15%).

Welcome to our community! :smiley:

What is the output from the _cluster/stats?pretty&human API?

Hey thanks for the welcome,

This is the output of _cluster/stats/?pretty&human:

{
  "_nodes" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "cluster_name" : "es-####-uat",
  "cluster_uuid" : "####",
  "timestamp" : 1644225237984,
  "status" : "green",
  "indices" : {
    "count" : 158,
    "shards" : {
      "total" : 316,
      "primaries" : 158,
      "replication" : 1.0,
      "index" : {
        "shards" : {
          "min" : 2,
          "max" : 2,
          "avg" : 2.0
        },
        "primaries" : {
          "min" : 1,
          "max" : 1,
          "avg" : 1.0
        },
        "replication" : {
          "min" : 1.0,
          "max" : 1.0,
          "avg" : 1.0
        }
      }
    },
    "docs" : {
      "count" : 96375660,
      "deleted" : 7659867
    },
    "store" : {
      "size" : "96.9gb",
      "size_in_bytes" : 104069007267,
      "total_data_set_size" : "96.9gb",
      "total_data_set_size_in_bytes" : 104069007267,
      "reserved" : "0b",
      "reserved_in_bytes" : 0
    },
    "fielddata" : {
      "memory_size" : "12.7mb",
      "memory_size_in_bytes" : 13381592,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size" : "1.6mb",
      "memory_size_in_bytes" : 1718366,
      "total_count" : 2495295,
      "hit_count" : 88484,
      "miss_count" : 2406811,
      "cache_size" : 426,
      "cache_count" : 4622,
      "evictions" : 4196
    },
    "completion" : {
      "size" : "0b",
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 2136,
      "memory" : "41mb",
      "memory_in_bytes" : 43034390,
      "terms_memory" : "22.1mb",
      "terms_memory_in_bytes" : 23258224,
      "stored_fields_memory" : "1.2mb",
      "stored_fields_memory_in_bytes" : 1346256,
      "term_vectors_memory" : "0b",
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory" : "2.6mb",
      "norms_memory_in_bytes" : 2782144,
      "points_memory" : "0b",
      "points_memory_in_bytes" : 0,
      "doc_values_memory" : "14.9mb",
      "doc_values_memory_in_bytes" : 15647766,
      "index_writer_memory" : "99.5mb",
      "index_writer_memory_in_bytes" : 104420768,
      "version_map_memory" : "6.5kb",
      "version_map_memory_in_bytes" : 6669,
      "fixed_bit_set" : "24mb",
      "fixed_bit_set_memory_in_bytes" : 25233152,
      "max_unsafe_auto_id_timestamp" : 1644218472666,
      "file_sizes" : { }
    },
    "mappings" : {
      "field_types" : [
        {
          "name" : "alias",
          "count" : 64,
          "index_count" : 4,
          "script_count" : 0
        },
        {
          "name" : "boolean",
          "count" : 1469,
          "index_count" : 108,
          "script_count" : 0
        },
        {
          "name" : "byte",
          "count" : 3,
          "index_count" : 3,
          "script_count" : 0
        },
        {
          "name" : "constant_keyword",
          "count" : 12,
          "index_count" : 4,
          "script_count" : 0
        },
        {
          "name" : "date",
          "count" : 1085,
          "index_count" : 145,
          "script_count" : 0
        },
        {
          "name" : "double",
          "count" : 423,
          "index_count" : 12,
          "script_count" : 0
        },
        {
          "name" : "flattened",
          "count" : 103,
          "index_count" : 4,
          "script_count" : 0
        },
        {
          "name" : "float",
          "count" : 452,
          "index_count" : 21,
          "script_count" : 0
        },
        {
          "name" : "geo_point",
          "count" : 39,
          "index_count" : 4,
          "script_count" : 0
        },
        {
          "name" : "half_float",
          "count" : 88,
          "index_count" : 24,
          "script_count" : 0
        },
        {
          "name" : "integer",
          "count" : 192,
          "index_count" : 16,
          "script_count" : 0
        },
        {
          "name" : "ip",
          "count" : 416,
          "index_count" : 5,
          "script_count" : 0
        },
        {
          "name" : "ip_range",
          "count" : 2,
          "index_count" : 2,
          "script_count" : 0
        },
        {
          "name" : "keyword",
          "count" : 21246,
          "index_count" : 143,
          "script_count" : 0
        },
        {
          "name" : "long",
          "count" : 8923,
          "index_count" : 137,
          "script_count" : 0
        },
        {
          "name" : "match_only_text",
          "count" : 195,
          "index_count" : 3,
          "script_count" : 0
        },
        {
          "name" : "nested",
          "count" : 111,
          "index_count" : 27,
          "script_count" : 0
        },
        {
          "name" : "object",
          "count" : 7850,
          "index_count" : 142,
          "script_count" : 0
        },
        {
          "name" : "scaled_float",
          "count" : 147,
          "index_count" : 3,
          "script_count" : 0
        },
        {
          "name" : "short",
          "count" : 331,
          "index_count" : 11,
          "script_count" : 0
        },
        {
          "name" : "text",
          "count" : 6772,
          "index_count" : 127,
          "script_count" : 0
        },
        {
          "name" : "version",
          "count" : 2,
          "index_count" : 2,
          "script_count" : 0
        },
        {
          "name" : "wildcard",
          "count" : 53,
          "index_count" : 3,
          "script_count" : 0
        }
      ],
      "runtime_field_types" : [ ]
    },
    "analysis" : {
      "char_filter_types" : [ ],
      "tokenizer_types" : [ ],
      "filter_types" : [ ],
      "analyzer_types" : [ ],
      "built_in_char_filters" : [ ],
      "built_in_tokenizers" : [ ],
      "built_in_filters" : [ ],
      "built_in_analyzers" : [ ]
    },
    "versions" : [
      {
        "version" : "7.9.1",
        "index_count" : 49,
        "primary_shard_count" : 49,
        "total_primary_size" : "26.1gb",
        "total_primary_bytes" : 28037644642
      },
      {
        "version" : "7.16.2",
        "index_count" : 95,
        "primary_shard_count" : 95,
        "total_primary_size" : "19.3gb",
        "total_primary_bytes" : 20822146675
      },
      {
        "version" : "7.17.0",
        "index_count" : 14,
        "primary_shard_count" : 14,
        "total_primary_size" : "872.2mb",
        "total_primary_bytes" : 914600972
      }
    ]
  },
  "nodes" : {
    "count" : {
      "total" : 5,
      "coordinating_only" : 0,
      "data" : 2,
      "data_cold" : 0,
      "data_content" : 0,
      "data_frozen" : 0,
      "data_hot" : 0,
      "data_warm" : 0,
      "ingest" : 5,
      "master" : 3,
      "ml" : 0,
      "remote_cluster_client" : 5,
      "transform" : 2,
      "voting_only" : 0
    },
    "versions" : [
      "7.17.0"
    ],
    "os" : {
      "available_processors" : 14,
      "allocated_processors" : 14,
      "names" : [
        {
          "name" : "Linux",
          "count" : 5
        }
      ],
      "pretty_names" : [
        {
          "pretty_name" : "Ubuntu 18.04.6 LTS",
          "count" : 5
        }
      ],
      "architectures" : [
        {
          "arch" : "amd64",
          "count" : 5
        }
      ],
      "mem" : {
        "total" : "54.5gb",
        "total_in_bytes" : 58609258496,
        "free" : "7.7gb",
        "free_in_bytes" : 8297578496,
        "used" : "46.8gb",
        "used_in_bytes" : 50311680000,
        "free_percent" : 14,
        "used_percent" : 86
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 45
      },
      "open_file_descriptors" : {
        "min" : 430,
        "max" : 1829,
        "avg" : 988
      }
    },
    "jvm" : {
      "max_uptime" : "18.1h",
      "max_uptime_in_millis" : 65490701,
      "versions" : [
        {
          "version" : "17.0.1",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "17.0.1+12",
          "vm_vendor" : "Eclipse Adoptium",
          "bundled_jdk" : true,
          "using_bundled_jdk" : true,
          "count" : 5
        }
      ],
      "mem" : {
        "heap_used" : "11.8gb",
        "heap_used_in_bytes" : 12685400424,
        "heap_max" : "28gb",
        "heap_max_in_bytes" : 30064771072
      },
      "threads" : 320
    },
    "fs" : {
      "total" : "441.4gb",
      "total_in_bytes" : 473991561216,
      "free" : "296.9gb",
      "free_in_bytes" : 318809591808,
      "available" : "296.8gb",
      "available_in_bytes" : 318759260160
    },
    "plugins" : [
      {
        "name" : "repository-azure",
        "version" : "7.17.0",
        "elasticsearch_version" : "7.17.0",
        "java_version" : "1.8",
        "description" : "The Azure Repository plugin adds support for Azure storage repositories.",
        "classname" : "org.elasticsearch.repositories.azure.AzureRepositoryPlugin",
        "extended_plugins" : [ ],
        "has_native_controller" : false,
        "licensed" : false,
        "type" : "isolated"
      }
    ],
    "network_types" : {
      "transport_types" : {
        "security4" : 5
      },
      "http_types" : {
        "security4" : 5
      }
    },
    "discovery_types" : {
      "zen" : 5
    },
    "packaging_types" : [
      {
        "flavor" : "default",
        "type" : "deb",
        "count" : 5
      }
    ],
    "ingest" : {
      "number_of_pipelines" : 53,
      "processor_stats" : {
        "append" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "conditional" : {
          "count" : 2286844,
          "failed" : 0,
          "current" : 0,
          "time" : "2.6m",
          "time_in_millis" : 159432
        },
        "convert" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "date" : {
          "count" : 801252,
          "failed" : 0,
          "current" : 0,
          "time" : "17.7s",
          "time_in_millis" : 17769
        },
        "dot_expander" : {
          "count" : 296144,
          "failed" : 0,
          "current" : 0,
          "time" : "670ms",
          "time_in_millis" : 670
        },
        "grok" : {
          "count" : 3847208,
          "failed" : 759128,
          "current" : 0,
          "time" : "1.7m",
          "time_in_millis" : 102958
        },
        "gsub" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "json" : {
          "count" : 37018,
          "failed" : 0,
          "current" : 0,
          "time" : "804ms",
          "time_in_millis" : 804
        },
        "pipeline" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "remove" : {
          "count" : 2477052,
          "failed" : 0,
          "current" : 0,
          "time" : "2.2s",
          "time_in_millis" : 2210
        },
        "rename" : {
          "count" : 2749937,
          "failed" : 0,
          "current" : 0,
          "time" : "5.4s",
          "time_in_millis" : 5439
        },
        "script" : {
          "count" : 1615487,
          "failed" : 759128,
          "current" : 0,
          "time" : "1m",
          "time_in_millis" : 63116
        },
        "set" : {
          "count" : 5603070,
          "failed" : 0,
          "current" : 0,
          "time" : "57.8s",
          "time_in_millis" : 57829
        },
        "split" : {
          "count" : 764250,
          "failed" : 0,
          "current" : 0,
          "time" : "1.5s",
          "time_in_millis" : 1537
        }
      }
    }
  }
}

Note: I've censored the name of the client with ####

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.