Strange index size

Hi,

I have 2 clusters running 7.10 in different locations. They index the same kind of logging, so they are set up identically. I search via Kibana on local as well as remote indices in those 2 clusters.

Both clusters have this data stream. It has a simple lifecycle policy: max size = 100 GB and delete after 30 days.

In Index Management I notice the following: every time, after (or before?) an index of normal size (1 replica), there is a very small one.
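For reference, the per-index sizes can also be listed in Dev Tools with a _cat request along these lines (the backing-index pattern matches this data stream; expand_wildcards=all is there because the backing indices are hidden, and the column selection is just a suggestion):

GET _cat/indices/.ds-agl-api-ds-*?v&h=index,pri,rep,docs.count,pri.store.size,store.size&s=index&expand_wildcards=all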

Where is this coming from?

Settings of one (the settings of the big and the small ones are the same, except for provided_name):

{
  "settings": {
    "index": {
      "lifecycle": {
        "name": "agl-data-stream-policy",
        "indexing_complete": "true"
      },
      "routing": {
        "allocation": {
          "include": {
            "_tier_preference": "data_hot"
          }
        }
      },
      "refresh_interval": "30s",
      "hidden": "true",
      "number_of_shards": "5",
      "provided_name": ".ds-agl-api-ds-000710",
      "creation_date": "1611145397585",
      "number_of_replicas": "1",
      "uuid": "udaRS-DNSBCDLcu1HW1-Aw",
      "version": {
        "created": "7100099"
      }
    }
  },
  "defaults": {
    "index": {
      "flush_after_merge": "512mb",
      "final_pipeline": "_none",
      "max_inner_result_window": "100",
      "unassigned": {
        "node_left": {
          "delayed_timeout": "1m"
        }
      },
      "max_terms_count": "65536",
      "lifecycle": {
        "parse_origination_date": "false",
        "rollover_alias": "",
        "origination_date": "-1"
      },
      "routing_partition_size": "1",
      "force_memory_term_dictionary": "false",
      "max_docvalue_fields_search": "100",
      "merge": {
        "scheduler": {
          "max_thread_count": "3",
          "auto_throttle": "true",
          "max_merge_count": "8"
        },
        "policy": {
          "reclaim_deletes_weight": "2.0",
          "floor_segment": "2mb",
          "max_merge_at_once_explicit": "30",
          "max_merge_at_once": "10",
          "max_merged_segment": "5gb",
          "expunge_deletes_allowed": "10.0",
          "segments_per_tier": "10.0",
          "deletes_pct_allowed": "33.0"
        }
      },
      "max_refresh_listeners": "1000",
      "max_regex_length": "1000",
      "load_fixed_bitset_filters_eagerly": "true",
      "number_of_routing_shards": "1",
      "write": {
        "wait_for_active_shards": "1"
      },
      "verified_before_close": "false",
      "mapping": {
        "coerce": "false",
        "nested_fields": {
          "limit": "50"
        },
        "depth": {
          "limit": "20"
        },
        "field_name_length": {
          "limit": "9223372036854775807"
        },
        "total_fields": {
          "limit": "1000"
        },
        "nested_objects": {
          "limit": "10000"
        },
        "ignore_malformed": "false"
      },
      "source_only": "false",
      "soft_deletes": {
        "enabled": "false",
        "retention": {
          "operations": "0"
        },
        "retention_lease": {
          "period": "12h"
        }
      },
      "max_script_fields": "32",
      "query": {
        "default_field": [
          "*"
        ],
        "parse": {
          "allow_unmapped_fields": "true"
        }
      },
      "format": "0",
      "frozen": "false",
      "sort": {
        "missing": [],
        "mode": [],
        "field": [],
        "order": []
      },
      "priority": "1",
      "codec": "default",
      "max_rescore_window": "10000",
      "max_adjacency_matrix_filters": "100",
      "analyze": {
        "max_token_count": "10000"
      },
      "gc_deletes": "60s",
      "top_metrics_max_size": "10",
      "optimize_auto_generated_id": "true",
      "max_ngram_diff": "1",
      "translog": {
        "generation_threshold_size": "64mb",
        "flush_threshold_size": "512mb",
        "sync_interval": "5s",
        "retention": {
          "size": "512MB",
          "age": "12h"
        },
        "durability": "REQUEST"
      },
      "auto_expand_replicas": "false",
      "mapper": {
        "dynamic": "true"
      },
      "recovery": {
        "type": ""
      },
      "requests": {
        "cache": {
          "enable": "true"
        }
      },
      "data_path": "",
      "highlight": {
        "max_analyzed_offset": "1000000"
      },
      "routing": {
        "rebalance": {
          "enable": "all"
        },
        "allocation": {
          "include": {
            "_tier": ""
          },
          "exclude": {
            "_tier": ""
          },
          "require": {
            "_tier": ""
          },
          "enable": "all",
          "total_shards_per_node": "-1"
        }
      },
      "search": {
        "slowlog": {
          "level": "TRACE",
          "threshold": {
            "fetch": {
              "warn": "-1",
              "trace": "-1",
              "debug": "-1",
              "info": "-1"
            },
            "query": {
              "warn": "-1",
              "trace": "-1",
              "debug": "-1",
              "info": "-1"
            }
          }
        },
        "idle": {
          "after": "30s"
        },
        "throttled": "false"
      },
      "fielddata": {
        "cache": "node"
      },
      "default_pipeline": "_none",
      "max_slices_per_scroll": "1024",
      "shard": {
        "check_on_startup": "false"
      },
      "xpack": {
        "watcher": {
          "template": {
            "version": ""
          }
        },
        "version": "",
        "ccr": {
          "following_index": "false"
        }
      },
      "percolator": {
        "map_unmapped_fields_as_text": "false"
      },
      "allocation": {
        "max_retries": "5",
        "existing_shards_allocator": "gateway_allocator"
      },
      "indexing": {
        "slowlog": {
          "reformat": "true",
          "threshold": {
            "index": {
              "warn": "-1",
              "trace": "-1",
              "debug": "-1",
              "info": "-1"
            }
          },
          "source": "1000",
          "level": "TRACE"
        }
      },
      "compound_format": "0.1",
      "blocks": {
        "metadata": "false",
        "read": "false",
        "read_only_allow_delete": "false",
        "read_only": "false",
        "write": "false"
      },
      "max_result_window": "10000",
      "store": {
        "stats_refresh_interval": "10s",
        "type": "",
        "fs": {
          "fs_lock": "native"
        },
        "preload": [],
        "snapshot": {
          "snapshot_name": "",
          "index_uuid": "",
          "cache": {
            "prewarm": {
              "enabled": "true"
            },
            "enabled": "true",
            "excluded_file_types": []
          },
          "uncached_chunk_size": "-1b",
          "index_name": "",
          "repository_name": "",
          "snapshot_uuid": ""
        }
      },
      "queries": {
        "cache": {
          "enabled": "true"
        }
      },
      "warmer": {
        "enabled": "true"
      },
      "max_shingle_diff": "3",
      "query_string": {
        "lenient": "false"
      }
    }
  }
}

The lifecycle policy:

PUT _ilm/policy/agl-data-stream-policy
{
  "policy": {
    "phases": {
      "hot": {
        "min_age": "0ms",
        "actions": {
          "rollover": {
            "max_size": "100gb"
          }
        }
      },
      "delete": {
        "min_age": "30d",
        "actions": {
          "delete": {
            "delete_searchable_snapshot": true
          }
        }
      }
    }
  }
}

What is happening, and why does it happen on both clusters?

I believe you may be hitting this bug: https://github.com/elastic/elasticsearch/issues/67777

@andreidan is this behavior consistent with what you saw (and fixed) regarding that bug?

Hmmm I wonder.

But I do not roll over manually?

Do you use Fleet? Fleet does manual rollovers during some of its processes.
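A manual rollover in this context means calling the rollover API on the data stream directly, with no conditions, something like this (the agl-api-ds name is inferred from the backing index names in this thread):

POST /agl-api-ds/_rollover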

Don't know what that is, so I guess not.

@dakrone I think the behaviour would be consistent with externally triggered rollovers (concurrent or not).

@Tuckson have you updated the ILM policy after it started being used (i.e. after the agl-* data stream was created)? If so, in what way? (Did you add and subsequently remove max_age/max_docs conditions to the rollover action?)
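For example, an update along these lines (the max_age and max_docs values are purely illustrative) would add such conditions; removing them again would be a similar PUT without them:

PUT _ilm/policy/agl-data-stream-policy
{
  "policy": {
    "phases": {
      "hot": {
        "min_age": "0ms",
        "actions": {
          "rollover": {
            "max_size": "100gb",
            "max_age": "7d",
            "max_docs": 200000000
          }
        }
      },
      "delete": {
        "min_age": "30d",
        "actions": {
          "delete": {
            "delete_searchable_snapshot": true
          }
        }
      }
    }
  }
}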

Can you post the result of querying the ILM history for a few consecutive indices (e.g. .ds-agl-api-ds-000706, .ds-agl-api-ds-000707, .ds-agl-api-ds-000708, .ds-agl-api-ds-000709)? You can use a query along the lines of the one below for every mentioned index:

GET ilm-history-*/_search
{
  "size": 50,
  "query": {
          "term": {
            "index": ".ds-agl-api-ds-000706"
  },
  "sort": [
    {
      "@timestamp": {
        "order": "desc"
      }
    }
  ]
}

Only to change the max index size. But never to just 5 GB, if I remember correctly.

Ehmmm... when I try to run this query (in Dev Tools) it says: "No request selected. Select a request by placing the cursor inside it."

Now I am pretty sure I DID select it.

Ah, I think it needs a leading / in Dev Tools:

GET /ilm-history-*/_search
{
  "size": 50,
  "query": {
          "term": {
            "index": ".ds-agl-api-ds-000706"
  },
  "sort": [
    {
      "@timestamp": {
        "order": "desc"
      }
    }
  ]
}

Can you also please post the _ilm/explain output for your latest 3-4 backing indices?

e.g.

GET /.ds-agl-api-ds-0007*/_ilm/explain

{
  "indices" : {
    ".ds-agl-api-ds-000718" : {
      "index" : ".ds-agl-api-ds-000718",
      "managed" : true,
      "policy" : "agl-data-stream-policy",
      "lifecycle_date_millis" : 1611282197497,
      "age" : "7.68h",
      "phase" : "hot",
      "phase_time_millis" : 1611282198516,
      "action" : "rollover",
      "action_time_millis" : 1611282797788,
      "step" : "check-rollover-ready",
      "step_time_millis" : 1611282797788,
      "phase_execution" : {
        "policy" : "agl-data-stream-policy",
        "phase_definition" : {
          "min_age" : "0ms",
          "actions" : {
            "rollover" : {
              "max_size" : "100gb"
            }
          }
        },
        "version" : 5,
        "modified_date_in_millis" : 1607950970645
      }
    }
  }
}
{
  "indices" : {
    ".ds-agl-api-ds-000717" : {
      "index" : ".ds-agl-api-ds-000717",
      "managed" : true,
      "policy" : "agl-data-stream-policy",
      "lifecycle_date_millis" : 1611281597415,
      "age" : "7.86h",
      "phase" : "hot",
      "phase_time_millis" : 1611281598526,
      "action" : "rollover",
      "action_time_millis" : 1611282199080,
      "step" : "check-rollover-ready",
      "step_time_millis" : 1611282199080,
      "phase_execution" : {
        "policy" : "agl-data-stream-policy",
        "phase_definition" : {
          "min_age" : "0ms",
          "actions" : {
            "rollover" : {
              "max_size" : "100gb"
            }
          }
        },
        "version" : 5,
        "modified_date_in_millis" : 1607950970645
      }
    }
  }
}
{
  "indices" : {
    ".ds-agl-api-ds-000716" : {
      "index" : ".ds-agl-api-ds-000716",
      "managed" : true,
      "policy" : "agl-data-stream-policy",
      "lifecycle_date_millis" : 1611281597426,
      "age" : "7.88h",
      "phase" : "hot",
      "phase_time_millis" : 1611238998079,
      "action" : "complete",
      "action_time_millis" : 1611281598924,
      "step" : "complete",
      "step_time_millis" : 1611281598924,
      "phase_execution" : {
        "policy" : "agl-data-stream-policy",
        "phase_definition" : {
          "min_age" : "0ms",
          "actions" : {
            "rollover" : {
              "max_size" : "100gb"
            }
          }
        },
        "version" : 5,
        "modified_date_in_millis" : 1607950970645
      }
    }
  }
}
{
  "indices" : {
    ".ds-agl-api-ds-000715" : {
      "index" : ".ds-agl-api-ds-000715",
      "managed" : true,
      "policy" : "agl-data-stream-policy",
      "lifecycle_date_millis" : 1611238997442,
      "age" : "19.72h",
      "phase" : "hot",
      "phase_time_millis" : 1611282197374,
      "action" : "complete",
      "action_time_millis" : 1611282199019,
      "step" : "complete",
      "step_time_millis" : 1611282199019,
      "phase_execution" : {
        "policy" : "agl-data-stream-policy",
        "phase_definition" : {
          "min_age" : "0ms",
          "actions" : {
            "rollover" : {
              "max_size" : "100gb"
            }
          }
        },
        "version" : 5,
        "modified_date_in_millis" : 1607950970645
      }
    }
  }
}

Also with the / I get the same error when trying to execute this request in Dev Tools.

lol, it was missing a curly bracket.

This request:

GET /ilm-history-*/_search
{
  "size": 50,
  "query": {
    "term": {
      "index": ".ds-agl-api-ds-000717"
    }
  },
  "sort": [
    {
      "@timestamp": {
        "order": "desc"
      }
    }
  ]
}

Gives this result for all 4 indices from the previous post:

{
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 0,
    "successful" : 0,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 0,
      "relation" : "eq"
    },
    "max_score" : 0.0,
    "hits" : [ ]
  }
}

OK,

Because I saw the latest index was about to roll over, I kept watching.
The storage size increased to over 211 GB.
Before the rollover, both the current index (718) and the previous index (717) had 'rollover' as their current action.
Now, after the rollover, I see this:


The 717 index now has 'complete' as its current action.
The 718 index shows this (stack trace):

ElasticsearchException[Concurrent modification of alias [agl-api-ds] during rollover]
	at org.elasticsearch.action.admin.indices.rollover.TransportRolloverAction$1$1.execute(TransportRolloverAction.java:143)
	at org.elasticsearch.cluster.ClusterStateUpdateTask.execute(ClusterStateUpdateTask.java:47)
	at org.elasticsearch.cluster.service.MasterService.executeTasks(MasterService.java:702)
	at org.elasticsearch.cluster.service.MasterService.calculateTaskOutputs(MasterService.java:324)
	at org.elasticsearch.cluster.service.MasterService.runTasks(MasterService.java:219)
	at org.elasticsearch.cluster.service.MasterService.access$000(MasterService.java:73)
	at org.elasticsearch.cluster.service.MasterService$Batcher.run(MasterService.java:151)
	at org.elasticsearch.cluster.service.TaskBatcher.runIfNotProcessed(TaskBatcher.java:150)
	at org.elasticsearch.cluster.service.TaskBatcher$BatchedTask.run(TaskBatcher.java:188)
	at org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingRunnable.run(ThreadContext.java:678)
	at org.elasticsearch.common.util.concurrent.PrioritizedEsThreadPoolExecutor$TieBreakingPrioritizedRunnable.runAndClean(PrioritizedEsThreadPoolExecutor.java:252)
	at org.elasticsearch.common.util.concurrent.PrioritizedEsThreadPoolExecutor$TieBreakingPrioritizedRunnable.run(PrioritizedEsThreadPoolExecutor.java:215)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1130)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:630)
	at java.base/java.lang.Thread.run(Thread.java:832)

After a few minutes it turns into this:


Looking at the index list I see this:

Zooming in on the latest 2 indices:

Phase definition:

{
  "policy": "agl-data-stream-policy",
  "phase_definition": {
    "min_age": "0ms",
    "actions": {
      "rollover": {
        "max_size": "100gb"
      }
    }
  },
  "version": 5,
  "modified_date_in_millis": 1607950970645
}

After a manual rollover, no small index is created; the new index just fills up like the previous one did.
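(To double-check which backing index is currently being written to, the get data stream API lists them; the highest generation listed is the write index. The agl-api-ds name comes from the exception above.)

GET /_data_stream/agl-api-ds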

@Tuckson thank you for all the information. I can now confirm that you are hitting this bug: https://github.com/elastic/elasticsearch/issues/67777.
A manual data stream rollover at any stage can trigger a situation where multiple backing indices attempt to trigger the rollover via ILM.

The fix for this will be available in the 7.11 release line.

In the meantime, for 7.10.x this situation can show up if a data stream's non-write index is in the check-rollover-ready step (as you pointed out: "Before the rollover, both the current index (718) and the previous index (717) had 'rollover' as their current action").
If that's the case, a manual move-to-step that skips the check-rollover-ready and attempt-rollover steps on the older-generation backing index (moving it to wait-for-active-shards, which waits for the data stream write index to be allocated) would avoid the odd-sized rollover indices.

e.g. (illustrative; please check which previous-generation index is in the rollover action, likely in the check-rollover-ready step):

POST /_ilm/move/.ds-agl-api-ds-000719
{
  "current_step": { 
    "phase": "hot",
    "action": "rollover",
    "name": "check-rollover-ready"
  },
  "next_step": { 
    "phase": "hot",
    "action": "rollover
    "name": "wait-for-active-shards"
  }
}

Thanks for your responses. Let's hope that release is delivered soon, then. I was already looking forward to it for the runtime fields :slight_smile:

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.