Missing documents after elastic restart

Hello,

I have run into a strange issue. An index has lost many documents, and the Elasticsearch logs don't show any message about it. These are the monitoring visualizations for the index:

I noticed this issue after restarting the Elasticsearch service on the three nodes. Before restarting, I modified elasticsearch.yml to add the node.roles option.

Node 1:

node.roles:
   - master
   - ingest
   - transform

Node 2:

node.roles:
   - master
   - ingest
   - transform
   - data

Node 3:

node.roles:
   - master
   - ingest
   - transform
   - data

On Node 1, I only removed the machine learning role "ml".
On Node 2 and Node 3, I added all of the above configuration. Before that, I didn't have the "node.roles" option configured, so the nodes had all the default roles.

The only time Elasticsearch will delete docs is if you have an ILM policy in place, or something else asks it to delete them.

Adding more node roles won't impact that.

There is an ILM policy assigned to that index, but it wasn't triggered because the shard size threshold was not reached.

The index is the same, but documents older than 24 hours have been lost.

Looking at the first monitoring visualization that I posted before, it is strange that "Store (primaries)" is about 1 GB while "Store" (primary + replica) is about 10 GB between 15:00 and 17:00. That makes no sense.
Maybe the primary and replica shards were not in sync?

What is the output from the _cluster/stats?pretty&human API?

Command output:

 {
  "_nodes" : {
    "total" : 3,
    "successful" : 3,
    "failed" : 0
  },
  "cluster_name" : "my-cluster",
  "cluster_uuid" : "lc_t71XGS8uER7rP_2h9dQ",
  "timestamp" : 1637917269890,
  "status" : "green",
  "indices" : {
    "count" : 47,
    "shards" : {
      "total" : 94,
      "primaries" : 47,
      "replication" : 1.0,
      "index" : {
        "shards" : {
          "min" : 2,
          "max" : 2,
          "avg" : 2.0
        },
        "primaries" : {
          "min" : 1,
          "max" : 1,
          "avg" : 1.0
        },
        "replication" : {
          "min" : 1.0,
          "max" : 1.0,
          "avg" : 1.0
        }
      }
    },
    "docs" : {
      "count" : 22464979,
      "deleted" : 46955
    },
    "store" : {
      "size" : "13.9gb",
      "size_in_bytes" : 15031817782,
      "total_data_set_size" : "13.9gb",
      "total_data_set_size_in_bytes" : 15031817782,
      "reserved" : "0b",
      "reserved_in_bytes" : 0
    },
    "fielddata" : {
      "memory_size" : "40.2kb",
      "memory_size_in_bytes" : 41192,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size" : "30.4mb",
      "memory_size_in_bytes" : 31908103,
      "total_count" : 1862508,
      "hit_count" : 194600,
      "miss_count" : 1667908,
      "cache_size" : 9736,
      "cache_count" : 13117,
      "evictions" : 3381
    },
    "completion" : {
      "size" : "0b",
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 662,
      "memory" : "14.3mb",
      "memory_in_bytes" : 15034024,
      "terms_memory" : "6.3mb",
      "terms_memory_in_bytes" : 6707264,
      "stored_fields_memory" : "331.7kb",
      "stored_fields_memory_in_bytes" : 339696,
      "term_vectors_memory" : "0b",
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory" : "659.6kb",
      "norms_memory_in_bytes" : 675456,
      "points_memory" : "0b",
      "points_memory_in_bytes" : 0,
      "doc_values_memory" : "6.9mb",
      "doc_values_memory_in_bytes" : 7311608,
      "index_writer_memory" : "272.8mb",
      "index_writer_memory_in_bytes" : 286127492,
      "version_map_memory" : "63.8mb",
      "version_map_memory_in_bytes" : 66909619,
      "fixed_bit_set" : "4.6mb",
      "fixed_bit_set_memory_in_bytes" : 4916296,
      "max_unsafe_auto_id_timestamp" : 1637884831547,
      "file_sizes" : { }
    },
    "mappings" : {
      "field_types" : [
        {
          "name" : "binary",
          "count" : 1,
          "index_count" : 1,
          "script_count" : 0
        },
        {
          "name" : "boolean",
          "count" : 61,
          "index_count" : 26,
          "script_count" : 0
        },
        {
          "name" : "byte",
          "count" : 1,
          "index_count" : 1,
          "script_count" : 0
        },
        {
          "name" : "date",
          "count" : 141,
          "index_count" : 35,
          "script_count" : 0
        },
        {
          "name" : "date_nanos",
          "count" : 1,
          "index_count" : 1,
          "script_count" : 0
        },
        {
          "name" : "date_range",
          "count" : 1,
          "index_count" : 1,
          "script_count" : 0
        },
        {
          "name" : "double",
          "count" : 8,
          "index_count" : 8,
          "script_count" : 0
        },
        {
          "name" : "double_range",
          "count" : 1,
          "index_count" : 1,
          "script_count" : 0
        },
        {
          "name" : "flattened",
          "count" : 15,
          "index_count" : 5,
          "script_count" : 0
        },
        {
          "name" : "float",
          "count" : 82,
          "index_count" : 16,
          "script_count" : 0
        },
        {
          "name" : "float_range",
          "count" : 1,
          "index_count" : 1,
          "script_count" : 0
        },
        {
          "name" : "geo_point",
          "count" : 1,
          "index_count" : 1,
          "script_count" : 0
        },
        {
          "name" : "geo_shape",
          "count" : 1,
          "index_count" : 1,
          "script_count" : 0
        },
        {
          "name" : "half_float",
          "count" : 78,
          "index_count" : 22,
          "script_count" : 0
        },
        {
          "name" : "integer",
          "count" : 169,
          "index_count" : 15,
          "script_count" : 0
        },
        {
          "name" : "integer_range",
          "count" : 1,
          "index_count" : 1,
          "script_count" : 0
        },
        {
          "name" : "ip",
          "count" : 1,
          "index_count" : 1,
          "script_count" : 0
        },
        {
          "name" : "ip_range",
          "count" : 1,
          "index_count" : 1,
          "script_count" : 0
        },
        {
          "name" : "keyword",
          "count" : 1374,
          "index_count" : 35,
          "script_count" : 0
        },
        {
          "name" : "long",
          "count" : 1528,
          "index_count" : 28,
          "script_count" : 0
        },
        {
          "name" : "long_range",
          "count" : 1,
          "index_count" : 1,
          "script_count" : 0
        },
        {
          "name" : "nested",
          "count" : 52,
          "index_count" : 17,
          "script_count" : 0
        },
        {
          "name" : "object",
          "count" : 1034,
          "index_count" : 34,
          "script_count" : 0
        },
        {
          "name" : "shape",
          "count" : 1,
          "index_count" : 1,
          "script_count" : 0
        },
        {
          "name" : "short",
          "count" : 22,
          "index_count" : 8,
          "script_count" : 0
        },
        {
          "name" : "text",
          "count" : 563,
          "index_count" : 20,
          "script_count" : 0
        }
      ],
      "runtime_field_types" : [ ]
    },
    "analysis" : {
      "char_filter_types" : [ ],
      "tokenizer_types" : [ ],
      "filter_types" : [ ],
      "analyzer_types" : [ ],
      "built_in_char_filters" : [ ],
      "built_in_tokenizers" : [ ],
      "built_in_filters" : [ ],
      "built_in_analyzers" : [ ]
    },
    "versions" : [
      {
        "version" : "7.14.1",
        "index_count" : 47,
        "primary_shard_count" : 47,
        "total_primary_size" : "6.9gb",
        "total_primary_bytes" : 7484671218
      }
    ]
  },
  "nodes" : {
    "count" : {
      "total" : 3,
      "coordinating_only" : 0,
      "data" : 2,
      "data_cold" : 0,
      "data_content" : 0,
      "data_frozen" : 0,
      "data_hot" : 0,
      "data_warm" : 0,
      "ingest" : 3,
      "master" : 3,
      "ml" : 0,
      "remote_cluster_client" : 0,
      "transform" : 3,
      "voting_only" : 0
    },
    "versions" : [
      "7.14.1"
    ],
    "os" : {
      "available_processors" : 14,
      "allocated_processors" : 14,
      "names" : [
        {
          "name" : "Linux",
          "count" : 3
        }
      ],
      "pretty_names" : [
        {
          "pretty_name" : "CentOS Linux 7 (Core)",
          "count" : 3
        }
      ],
      "architectures" : [
        {
          "arch" : "amd64",
          "count" : 3
        }
      ],
      "mem" : {
        "total" : "54.4gb",
        "total_in_bytes" : 58424098816,
        "free" : "4.9gb",
        "free_in_bytes" : 5367050240,
        "used" : "49.4gb",
        "used_in_bytes" : 53057048576,
        "free_percent" : 9,
        "used_percent" : 91
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 1
      },
      "open_file_descriptors" : {
        "min" : 434,
        "max" : 695,
        "avg" : 606
      }
    },
    "jvm" : {
      "max_uptime" : "17h",
      "max_uptime_in_millis" : 61240999,
      "versions" : [
        {
          "version" : "16.0.2",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "16.0.2+7",
          "vm_vendor" : "Eclipse Foundation",
          "bundled_jdk" : true,
          "using_bundled_jdk" : true,
          "count" : 3
        }
      ],
      "mem" : {
        "heap_used" : "8.9gb",
        "heap_used_in_bytes" : 9576061216,
        "heap_max" : "27.2gb",
        "heap_max_in_bytes" : 29217521664
      },
      "threads" : 202
    },
    "fs" : {
      "total" : "834.5gb",
      "total_in_bytes" : 896132452352,
      "free" : "816.5gb",
      "free_in_bytes" : 876716838912,
      "available" : "816.5gb",
      "available_in_bytes" : 876716838912
    },
    "plugins" : [ ],
    "network_types" : {
      "transport_types" : {
        "security4" : 3
      },
      "http_types" : {
        "security4" : 3
      }
    },
    "discovery_types" : {
      "zen" : 3
    },
    "packaging_types" : [
      {
        "flavor" : "default",
        "type" : "rpm",
        "count" : 3
      }
    ],
    "ingest" : {
      "number_of_pipelines" : 19,
      "processor_stats" : {
        "conditional" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "convert" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "geoip" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "grok" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "gsub" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "remove" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "rename" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "script" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "set" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        },
        "set_security_user" : {
          "count" : 0,
          "failed" : 0,
          "current" : 0,
          "time" : "0s",
          "time_in_millis" : 0
        }
      }
    }
  }
}

The index has lost about 20 million documents, but the command output only shows 46955 deleted docs.

Another thing that worries me is that the number of docs is not the same in the primary and the replica shard for some indices:

The replica shard of index ".ds-logstash-iib-pro-2021.11.19-000001" has almost 500,000 more documents than the primary shard. This is the index that lost 20 million documents after an Elasticsearch rolling restart.

1 Like