Slow refresh times for large indices

Hi,

We are experiencing a very high load on the cluster, which seems to be caused by a very high IOPS rate (>5000) when indexing into a few specific indices.

In this graph you can see our disk iops during indexing to those specific indices, and without it (a bit before 12PM)

--

When looking at Kibana at the times of indexing we see the following graphs:

The refresh times seem to be very high, which I suspect is the main cause of the load. (Our refresh interval is 60s.)

Usually we split our indices by time, creating monthly / daily indices. The specific indices here are not split by time; each is just one very large index. It is 2TB in size, contains 2.1B docs (700M with some nested docs), and is split into 12 shards (plus another 12 for replicas). We are indexing in batches of 1000 docs.

All other indices show much better performance.

I understand that this index might not be optimal in terms of sizing and number of docs, but still I wish to understand this situation better before I make any changes, to make sure no other thing is happening here.

We also tried to index some of the data from this index to a fresh new index and the refresh times were much smaller:

Can the high number of documents / size have such a dramatic effect on the refresh times? Am I missing something else?

The cluster consists of 8 data nodes, each with 8 CPUs and 64GB RAM (30GB heap).

A few more details:

  • The high iops that we see are reads, not writes.
  • We use our own IDs for the docs. Could this be related?

The index stats do not look too good to me in terms of merges and deletions. Can anyone shed more light on this?

 "primaries" : {
      "docs" : {
        "count" : 2114085279,
        "deleted" : 239169938
      },
      "store" : {
        "size_in_bytes" : 980900341018
      },
      "indexing" : {
        "index_total" : 139449476,
        "index_time_in_millis" : 371828274,
        "index_current" : 2,
        "index_failed" : 0,
        "delete_total" : 0,
        "delete_time_in_millis" : 0,
        "delete_current" : 0,
        "noop_update_total" : 0,
        "is_throttled" : false,
        "throttle_time_in_millis" : 0
      },
      "get" : {
        "total" : 0,
        "time_in_millis" : 0,
        "exists_total" : 0,
        "exists_time_in_millis" : 0,
        "missing_total" : 0,
        "missing_time_in_millis" : 0,
        "current" : 0
      },
      "search" : {
        "open_contexts" : 0,
        "query_total" : 33854,
        "query_time_in_millis" : 94515641,
        "query_current" : 0,
        "fetch_total" : 3501,
        "fetch_time_in_millis" : 161486,
        "fetch_current" : 0,
        "scroll_total" : 0,
        "scroll_time_in_millis" : 0,
        "scroll_current" : 0,
        "suggest_total" : 0,
        "suggest_time_in_millis" : 0,
        "suggest_current" : 0
      },
      "merges" : {
        "current" : 0,
        "current_docs" : 0,
        "current_size_in_bytes" : 0,
        "total" : 4196,
        "total_time_in_millis" : 201105329,
        "total_docs" : 1190123805,
        "total_size_in_bytes" : 547729871639,
        "total_stopped_time_in_millis" : 0,
        "total_throttled_time_in_millis" : 148310724,
        "total_auto_throttle_in_bytes" : 62914560
      },
      "refresh" : {
        "total" : 22955,
        "total_time_in_millis" : 57620593,
        "external_total" : 21486,
        "external_total_time_in_millis" : 57862703,
        "listeners" : 0
      },
      "flush" : {
        "total" : 1417,
        "periodic" : 481,
        "total_time_in_millis" : 2256616
      },
      "warmer" : {
        "current" : 0,
        "total" : 21438,
        "total_time_in_millis" : 193820
      },
      "query_cache" : {
        "memory_size_in_bytes" : 1334382201,
        "total_count" : 2433865,
        "hit_count" : 394945,
        "miss_count" : 2038920,
        "cache_size" : 17232,
        "cache_count" : 22214,
        "evictions" : 4982
      },
      "fielddata" : {
        "memory_size_in_bytes" : 0,
        "evictions" : 0
      },
      "completion" : {
        "size_in_bytes" : 0
      },
      "segments" : {
        "count" : 513,
        "memory_in_bytes" : 15122900,
        "terms_memory_in_bytes" : 5758896,
        "stored_fields_memory_in_bytes" : 8073576,
        "term_vectors_memory_in_bytes" : 0,
        "norms_memory_in_bytes" : 0,
        "points_memory_in_bytes" : 0,
        "doc_values_memory_in_bytes" : 1290428,
        "index_writer_memory_in_bytes" : 313299326,
        "version_map_memory_in_bytes" : 12766670,
        "fixed_bit_set_memory_in_bytes" : 294179424,
        "max_unsafe_auto_id_timestamp" : -1,
        "file_sizes" : { }
      },
      "translog" : {
        "operations" : 1442040,
        "size_in_bytes" : 3078067308,
        "uncommitted_operations" : 1442040,
        "uncommitted_size_in_bytes" : 3078067308,
        "earliest_last_modified_age" : 0
      },
      "request_cache" : {
        "memory_size_in_bytes" : 0,
        "evictions" : 0,
        "hit_count" : 10026,
        "miss_count" : 13098
      },
      "recovery" : {
        "current_as_source" : 0,
        "current_as_target" : 0,
        "throttle_time_in_millis" : 0
      }
    },
    "total" : {
      "docs" : {
        "count" : 4228175156,
        "deleted" : 474745694
      },
      "store" : {
        "size_in_bytes" : 1959565987467
      },
      "indexing" : {
        "index_total" : 278898733,
        "index_time_in_millis" : 500201863,
        "index_current" : 2,
        "index_failed" : 0,
        "delete_total" : 0,
        "delete_time_in_millis" : 0,
        "delete_current" : 0,
        "noop_update_total" : 0,
        "is_throttled" : false,
        "throttle_time_in_millis" : 0
      },
      "get" : {
        "total" : 0,
        "time_in_millis" : 0,
        "exists_total" : 0,
        "exists_time_in_millis" : 0,
        "missing_total" : 0,
        "missing_time_in_millis" : 0,
        "current" : 0
      },
      "search" : {
        "open_contexts" : 0,
        "query_total" : 69267,
        "query_time_in_millis" : 178840435,
        "query_current" : 0,
        "fetch_total" : 7313,
        "fetch_time_in_millis" : 287245,
        "fetch_current" : 0,
        "scroll_total" : 0,
        "scroll_time_in_millis" : 0,
        "scroll_current" : 0,
        "suggest_total" : 0,
        "suggest_time_in_millis" : 0,
        "suggest_current" : 0
      },
      "merges" : {
        "current" : 0,
        "current_docs" : 0,
        "current_size_in_bytes" : 0,
        "total" : 8590,
        "total_time_in_millis" : 391761859,
        "total_docs" : 2396782270,
        "total_size_in_bytes" : 1102953638942,
        "total_stopped_time_in_millis" : 0,
        "total_throttled_time_in_millis" : 300188326,
        "total_auto_throttle_in_bytes" : 125829120
      },
      "refresh" : {
        "total" : 48330,
        "total_time_in_millis" : 93493532,
        "external_total" : 44096,
        "external_total_time_in_millis" : 93537344,
        "listeners" : 0
      },
      "flush" : {
        "total" : 2854,
        "periodic" : 982,
        "total_time_in_millis" : 3355827
      },
      "warmer" : {
        "current" : 0,
        "total" : 44000,
        "total_time_in_millis" : 246839
      },
      "query_cache" : {
        "memory_size_in_bytes" : 2877436127,
        "total_count" : 5101079,
        "hit_count" : 823008,
        "miss_count" : 4278071,
        "cache_size" : 38979,
        "cache_count" : 44324,
        "evictions" : 5345
      },
      "fielddata" : {
        "memory_size_in_bytes" : 0,
        "evictions" : 0
      },
      "completion" : {
        "size_in_bytes" : 0
      },
      "segments" : {
        "count" : 999,
        "memory_in_bytes" : 29950350,
        "terms_memory_in_bytes" : 11218912,
        "stored_fields_memory_in_bytes" : 16123352,
        "term_vectors_memory_in_bytes" : 0,
        "norms_memory_in_bytes" : 0,
        "points_memory_in_bytes" : 0,
        "doc_values_memory_in_bytes" : 2608086,
        "index_writer_memory_in_bytes" : 599935328,
        "version_map_memory_in_bytes" : 23638770,
        "fixed_bit_set_memory_in_bytes" : 587908928,
        "max_unsafe_auto_id_timestamp" : -1,
        "file_sizes" : { }
      },
      "translog" : {
        "operations" : 2726124,
        "size_in_bytes" : 5820881529,
        "uncommitted_operations" : 2726124,
        "uncommitted_size_in_bytes" : 5820881529,
        "earliest_last_modified_age" : 0
      },
      "request_cache" : {
        "memory_size_in_bytes" : 0,
        "evictions" : 0,
        "hit_count" : 19891,
        "miss_count" : 27123
      },
      "recovery" : {
        "current_as_source" : 0,
        "current_as_target" : 0,
        "throttle_time_in_millis" : 0
      }
    }

If you are using your own document IDs, each insert or update requires Elasticsearch to first perform a read to identify whether the document already exists. This results in more IOPS than a pure insert with auto-generated IDs.

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.