Strange index size

Hi,

I have 2 clusters running 7.10 in different locations. They index the same kind of logging, so they are set up identically. I search via Kibana on local as well as remote indices in those 2 clusters.

Both clusters have this data stream. It has a simple lifecycle policy: max size = 100 GB and delete after 30 days.

In Index Management I notice the following: every time, after (or before?) an index of normal size (1 replica), there is a very small one.
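For reference, the per-index sizes can also be listed in Dev Tools with a _cat request along these lines (the backing-index pattern matches this data stream; expand_wildcards=all is there because the backing indices are hidden, and the column selection is just a suggestion):

GET _cat/indices/.ds-agl-api-ds-*?v&h=index,pri,rep,docs.count,pri.store.size,store.size&s=index&expand_wildcards=all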

Where is this coming from?

Settings of one (the settings of the big and the small ones are the same, except for provided_name):

{
  "settings": {
    "index": {
      "lifecycle": {
        "name": "agl-data-stream-policy",
        "indexing_complete": "true"
      },
      "routing": {
        "allocation": {
          "include": {
            "_tier_preference": "data_hot"
          }
        }
      },
      "refresh_interval": "30s",
      "hidden": "true",
      "number_of_shards": "5",
      "provided_name": ".ds-agl-api-ds-000710",
      "creation_date": "1611145397585",
      "number_of_replicas": "1",
      "uuid": "udaRS-DNSBCDLcu1HW1-Aw",
      "version": {
        "created": "7100099"
      }
    }
  },
  "defaults": {
    "index": {
      "flush_after_merge": "512mb",
      "final_pipeline": "_none",
      "max_inner_result_window": "100",
      "unassigned": {
        "node_left": {
          "delayed_timeout": "1m"
        }
      },
      "max_terms_count": "65536",
      "lifecycle": {
        "parse_origination_date": "false",
        "rollover_alias": "",
        "origination_date": "-1"
      },
      "routing_partition_size": "1",
      "force_memory_term_dictionary": "false",
      "max_docvalue_fields_search": "100",
      "merge": {
        "scheduler": {
          "max_thread_count": "3",
          "auto_throttle": "true",
          "max_merge_count": "8"
        },
        "policy": {
          "reclaim_deletes_weight": "2.0",
          "floor_segment": "2mb",
          "max_merge_at_once_explicit": "30",
          "max_merge_at_once": "10",
          "max_merged_segment": "5gb",
          "expunge_deletes_allowed": "10.0",
          "segments_per_tier": "10.0",
          "deletes_pct_allowed": "33.0"
        }
      },
      "max_refresh_listeners": "1000",
      "max_regex_length": "1000",
      "load_fixed_bitset_filters_eagerly": "true",
      "number_of_routing_shards": "1",
      "write": {
        "wait_for_active_shards": "1"
      },
      "verified_before_close": "false",
      "mapping": {
        "coerce": "false",
        "nested_fields": {
          "limit": "50"
        },
        "depth": {
          "limit": "20"
        },
        "field_name_length": {
          "limit": "9223372036854775807"
        },
        "total_fields": {
          "limit": "1000"
        },
        "nested_objects": {
          "limit": "10000"
        },
        "ignore_malformed": "false"
      },
      "source_only": "false",
      "soft_deletes": {
        "enabled": "false",
        "retention": {
          "operations": "0"
        },
        "retention_lease": {
          "period": "12h"
        }
      },
      "max_script_fields": "32",
      "query": {
        "default_field": [
          "*"
        ],
        "parse": {
          "allow_unmapped_fields": "true"
        }
      },
      "format": "0",
      "frozen": "false",
      "sort": {
        "missing": [],
        "mode": [],
        "field": [],
        "order": []
      },
      "priority": "1",
      "codec": "default",
      "max_rescore_window": "10000",
      "max_adjacency_matrix_filters": "100",
      "analyze": {
        "max_token_count": "10000"
      },
      "gc_deletes": "60s",
      "top_metrics_max_size": "10",
      "optimize_auto_generated_id": "true",
      "max_ngram_diff": "1",
      "translog": {
        "generation_threshold_size": "64mb",
        "flush_threshold_size": "512mb",
        "sync_interval": "5s",
        "retention": {
          "size": "512MB",
          "age": "12h"
        },
        "durability": "REQUEST"
      },
      "auto_expand_replicas": "false",
      "mapper": {
        "dynamic": "true"
      },
      "recovery": {
        "type": ""
      },
      "requests": {
        "cache": {
          "enable": "true"
        }
      },
      "data_path": "",
      "highlight": {
        "max_analyzed_offset": "1000000"
      },
      "routing": {
        "rebalance": {
          "enable": "all"
        },
        "allocation": {
          "include": {
            "_tier": ""
          },
          "exclude": {
            "_tier": ""
          },
          "require": {
            "_tier": ""
          },
          "enable": "all",
          "total_shards_per_node": "-1"
        }
      },
      "search": {
        "slowlog": {
          "level": "TRACE",
          "threshold": {
            "fetch": {
              "warn": "-1",
              "trace": "-1",
              "debug": "-1",
              "info": "-1"
            },
            "query": {
              "warn": "-1",
              "trace": "-1",
              "debug": "-1",
              "info": "-1"
            }
          }
        },
        "idle": {
          "after": "30s"
        },
        "throttled": "false"
      },
      "fielddata": {
        "cache": "node"
      },
      "default_pipeline": "_none",
      "max_slices_per_scroll": "1024",
      "shard": {
        "check_on_startup": "false"
      },
      "xpack": {
        "watcher": {
          "template": {
            "version": ""
          }
        },
        "version": "",
        "ccr": {
          "following_index": "false"
        }
      },
      "percolator": {
        "map_unmapped_fields_as_text": "false"
      },
      "allocation": {
        "max_retries": "5",
        "existing_shards_allocator": "gateway_allocator"
      },
      "indexing": {
        "slowlog": {
          "reformat": "true",
          "threshold": {
            "index": {
              "warn": "-1",
              "trace": "-1",
              "debug": "-1",
              "info": "-1"
            }
          },
          "source": "1000",
          "level": "TRACE"
        }
      },
      "compound_format": "0.1",
      "blocks": {
        "metadata": "false",
        "read": "false",
        "read_only_allow_delete": "false",
        "read_only": "false",
        "write": "false"
      },
      "max_result_window": "10000",
      "store": {
        "stats_refresh_interval": "10s",
        "type": "",
        "fs": {
          "fs_lock": "native"
        },
        "preload": [],
        "snapshot": {
          "snapshot_name": "",
          "index_uuid": "",
          "cache": {
            "prewarm": {
              "enabled": "true"
            },
            "enabled": "true",
            "excluded_file_types": []
          },
          "uncached_chunk_size": "-1b",
          "index_name": "",
          "repository_name": "",
          "snapshot_uuid": ""
        }
      },
      "queries": {
        "cache": {
          "enabled": "true"
        }
      },
      "warmer": {
        "enabled": "true"
      },
      "max_shingle_diff": "3",
      "query_string": {
        "lenient": "false"
      }
    }
  }
}

The lifecycle policy:

PUT _ilm/policy/agl-data-stream-policy
{
  "policy": {
    "phases": {
      "hot": {
        "min_age": "0ms",
        "actions": {
          "rollover": {
            "max_size": "100gb"
          }
        }
      },
      "delete": {
        "min_age": "30d",
        "actions": {
          "delete": {
            "delete_searchable_snapshot": true
          }
        }
      }
    }
  }
}

What is happening, and why does it happen on both clusters?

I believe you may be hitting this bug: https://github.com/elastic/elasticsearch/issues/67777

@andreidan is this behavior consistent with what you saw (and fixed) regarding that bug?

Hmmm I wonder.

But I do not roll over manually?

Do you use Fleet? Fleet does manual rollovers during some of its processes.
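A manual rollover in this context means calling the rollover API on the data stream directly, with no conditions, something like this (the agl-api-ds name is inferred from the backing index names in this thread):

POST /agl-api-ds/_rollover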

Don't know what that is, so I guess not.

@dakrone I think the behaviour would be consistent with externally triggered rollovers (concurrent or not).

@Tuckson have you updated the ILM policy after it started being used (i.e. after the agl-* data stream was created)? If so, in what way? (Did you add and subsequently remove max_age/max_docs conditions to the rollover action?)
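For example, an update along these lines (the max_age and max_docs values are purely illustrative) would add such conditions; removing them again would be a similar PUT without them:

PUT _ilm/policy/agl-data-stream-policy
{
  "policy": {
    "phases": {
      "hot": {
        "min_age": "0ms",
        "actions": {
          "rollover": {
            "max_size": "100gb",
            "max_age": "7d",
            "max_docs": 200000000
          }
        }
      },
      "delete": {
        "min_age": "30d",
        "actions": {
          "delete": {
            "delete_searchable_snapshot": true
          }
        }
      }
    }
  }
}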

Can you post the result of querying the ILM history for a few consecutive indices (e.g. .ds-agl-api-ds-000706, .ds-agl-api-ds-000707, .ds-agl-api-ds-000708, .ds-agl-api-ds-000709)? You can use a query along the lines of the one below for every mentioned index:

GET ilm-history-*/_search
{
  "size": 50,
  "query": {
          "term": {
            "index": ".ds-agl-api-ds-000706"
  },
  "sort": [
    {
      "@timestamp": {
        "order": "desc"
      }
    }
  ]
}

Only to change the max index size. But never to just 5 GB, if I remember correctly.

Ehmmm... when I try to run this query (in Dev Tools) it says: "No request selected. Select a request by placing the cursor inside it."

Now I am pretty sure I DID select it.

Ah, I think it needs a leading / in Dev Tools:

GET /ilm-history-*/_search
{
  "size": 50,
  "query": {
          "term": {
            "index": ".ds-agl-api-ds-000706"
  },
  "sort": [
    {
      "@timestamp": {
        "order": "desc"
      }
    }
  ]
}

Can you also please post the _ilm/explain output for your latest 3-4 backing indices?

e.g.

GET /.ds-agl-api-ds-0007*/_ilm/explain

{
  "indices" : {
    ".ds-agl-api-ds-000718" : {
      "index" : ".ds-agl-api-ds-000718",
      "managed" : true,
      "policy" : "agl-data-stream-policy",
      "lifecycle_date_millis" : 1611282197497,
      "age" : "7.68h",
      "phase" : "hot",
      "phase_time_millis" : 1611282198516,
      "action" : "rollover",
      "action_time_millis" : 1611282797788,
      "step" : "check-rollover-ready",
      "step_time_millis" : 1611282797788,
      "phase_execution" : {
        "policy" : "agl-data-stream-policy",
        "phase_definition" : {
          "min_age" : "0ms",
          "actions" : {
            "rollover" : {
              "max_size" : "100gb"
            }
          }
        },
        "version" : 5,
        "modified_date_in_millis" : 1607950970645
      }
    }
  }
}
{
  "indices" : {
    ".ds-agl-api-ds-000717" : {
      "index" : ".ds-agl-api-ds-000717",
      "managed" : true,
      "policy" : "agl-data-stream-policy",
      "lifecycle_date_millis" : 1611281597415,
      "age" : "7.86h",
      "phase" : "hot",
      "phase_time_millis" : 1611281598526,
      "action" : "rollover",
      "action_time_millis" : 1611282199080,
      "step" : "check-rollover-ready",
      "step_time_millis" : 1611282199080,
      "phase_execution" : {
        "policy" : "agl-data-stream-policy",
        "phase_definition" : {
          "min_age" : "0ms",
          "actions" : {
            "rollover" : {
              "max_size" : "100gb"
            }
          }
        },
        "version" : 5,
        "modified_date_in_millis" : 1607950970645
      }
    }
  }
}
{
  "indices" : {
    ".ds-agl-api-ds-000716" : {
      "index" : ".ds-agl-api-ds-000716",
      "managed" : true,
      "policy" : "agl-data-stream-policy",
      "lifecycle_date_millis" : 1611281597426,
      "age" : "7.88h",
      "phase" : "hot",
      "phase_time_millis" : 1611238998079,
      "action" : "complete",
      "action_time_millis" : 1611281598924,
      "step" : "complete",
      "step_time_millis" : 1611281598924,
      "phase_execution" : {
        "policy" : "agl-data-stream-policy",
        "phase_definition" : {
          "min_age" : "0ms",
          "actions" : {
            "rollover" : {
              "max_size" : "100gb"
            }
          }
        },
        "version" : 5,
        "modified_date_in_millis" : 1607950970645
      }
    }
  }
}
{
  "indices" : {
    ".ds-agl-api-ds-000715" : {
      "index" : ".ds-agl-api-ds-000715",
      "managed" : true,
      "policy" : "agl-data-stream-policy",
      "lifecycle_date_millis" : 1611238997442,
      "age" : "19.72h",
      "phase" : "hot",
      "phase_time_millis" : 1611282197374,
      "action" : "complete",
      "action_time_millis" : 1611282199019,
      "step" : "complete",
      "step_time_millis" : 1611282199019,
      "phase_execution" : {
        "policy" : "agl-data-stream-policy",
        "phase_definition" : {
          "min_age" : "0ms",
          "actions" : {
            "rollover" : {
              "max_size" : "100gb"
            }
          }
        },
        "version" : 5,
        "modified_date_in_millis" : 1607950970645
      }
    }
  }
}

Also with the / I get the same error when trying to execute this request in Dev Tools.

lol, it was missing a curly bracket.

This request:

GET /ilm-history-*/_search
{
  "size": 50,
  "query": {
    "term": {
      "index": ".ds-agl-api-ds-000717"
    }
  },
  "sort": [
    {
      "@timestamp": {
        "order": "desc"
      }
    }
  ]
}

Gives this result for all 4 indices from the previous post:

{
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 0,
    "successful" : 0,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 0,
      "relation" : "eq"
    },
    "max_score" : 0.0,
    "hits" : [ ]
  }
}

OK,

Because I saw the latest index was about to roll over, I kept watching.
The storage size increased to over 211 GB.
Before the rollover, both the current index (718) and the previous index (717) had 'rollover' as their current action.
Now, after the rollover, I see this:


The 717 index now has 'complete' as its current action.
The 718 index shows this (stack trace):

ElasticsearchException[Concurrent modification of alias [agl-api-ds] during rollover]
	at org.elasticsearch.action.admin.indices.rollover.TransportRolloverAction$1$1.execute(TransportRolloverAction.java:143)
	at org.elasticsearch.cluster.ClusterStateUpdateTask.execute(ClusterStateUpdateTask.java:47)
	at org.elasticsearch.cluster.service.MasterService.executeTasks(MasterService.java:702)
	at org.elasticsearch.cluster.service.MasterService.calculateTaskOutputs(MasterService.java:324)
	at org.elasticsearch.cluster.service.MasterService.runTasks(MasterService.java:219)
	at org.elasticsearch.cluster.service.MasterService.access$000(MasterService.java:73)
	at org.elasticsearch.cluster.service.MasterService$Batcher.run(MasterService.java:151)
	at org.elasticsearch.cluster.service.TaskBatcher.runIfNotProcessed(TaskBatcher.java:150)
	at org.elasticsearch.cluster.service.TaskBatcher$BatchedTask.run(TaskBatcher.java:188)
	at org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingRunnable.run(ThreadContext.java:678)
	at org.elasticsearch.common.util.concurrent.PrioritizedEsThreadPoolExecutor$TieBreakingPrioritizedRunnable.runAndClean(PrioritizedEsThreadPoolExecutor.java:252)
	at org.elasticsearch.common.util.concurrent.PrioritizedEsThreadPoolExecutor$TieBreakingPrioritizedRunnable.run(PrioritizedEsThreadPoolExecutor.java:215)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1130)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:630)
	at java.base/java.lang.Thread.run(Thread.java:832)

After a few minutes it turns into this:


Looking at the index list I see this:

Zooming in on the latest 2 indices:

Phase definition:

{
  "policy": "agl-data-stream-policy",
  "phase_definition": {
    "min_age": "0ms",
    "actions": {
      "rollover": {
        "max_size": "100gb"
      }
    }
  },
  "version": 5,
  "modified_date_in_millis": 1607950970645
}

After a manual rollover, no small index is created; the new index just fills up like the previous one did.
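(To double-check which backing index is currently being written to, the get data stream API lists them; the highest generation listed is the write index. The agl-api-ds name comes from the exception above.)

GET /_data_stream/agl-api-ds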

@Tuckson thank you for all the information. I can now confirm that you are hitting this bug: https://github.com/elastic/elasticsearch/issues/67777.
A manual data stream rollover at any stage can trigger a situation where multiple backing indices attempt to trigger the rollover via ILM.

The fix for this will be available in the 7.11 release line.

In the meantime, for 7.10.x this situation can show up if a data stream's non-write index is in the check-rollover-ready step (as you pointed out: "Before the rollover, both the current index (718) and the previous index (717) had 'rollover' as their current action").
If that's the case, a manual move-to-step that skips the check-rollover-ready and attempt-rollover steps on the older-generation backing index (moving it to wait-for-active-shards, which waits for the data stream write index to be allocated) would avoid the odd-sized rollover indices.

e.g. (illustrative; please check which previous-generation index is in the rollover action, likely in the check-rollover-ready step):

POST /_ilm/move/.ds-agl-api-ds-000719
{
  "current_step": { 
    "phase": "hot",
    "action": "rollover",
    "name": "check-rollover-ready"
  },
  "next_step": { 
    "phase": "hot",
    "action": "rollover
    "name": "wait-for-active-shards"
  }
}

Thanks for your responses. Let's hope that release is delivered soon, then. I was already looking forward to it for the runtime fields :slight_smile:

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.