Tuning SSD Performance

Hi,
Is there an aggregated and detailed source of information on optimising NVMe SSD configuration for indexing-heavy use cases with Elasticsearch?

Thx
D

How did you determine that the storage is the bottleneck? Have you followed these guidelines?

I haven't @Christian_Dahlqvist. I've taken ownership of a set of clusters which haven't been well configured or managed. I'm evaluating all levels of the stack, hence the question.

Looking into this some more, I'm seeing hot_threads output which seems to point to I/O issues:

::: {generic-node}{uMrBwihLR6K1Af91obRTAg}{pCCp2d4rQmKOS9Teyy4X_w}{generic-node}{generic-node:9300}{aws_availability_zone=us-east-1a, ml.machine_memory=64384024576, ml.max_open_jobs=20, xpack.installed=true}
   Hot threads at 2020-08-13T07:44:01.157Z, interval=500ms, busiestThreads=3, ignoreIdleThreads=true:

   89.5% (447.5ms out of 500ms) cpu usage by thread 'elasticsearch[generic-node][write][T#3]'
     7/10 snapshots sharing following 52 elements
       app//org.apache.lucene.codecs.lucene80.IndexedDISI$Method$1.advanceExactWithinBlock(IndexedDISI.java:507)
       app//org.apache.lucene.codecs.lucene80.IndexedDISI.advanceExact(IndexedDISI.java:399)
       app//org.apache.lucene.codecs.lucene80.Lucene80NormsProducer$SparseNormsIterator.advanceExact(Lucene80NormsProducer.java:186)
       app//org.apache.lucene.codecs.lucene50.Lucene50PostingsWriter.startDoc(Lucene50PostingsWriter.java:264)
       app//org.apache.lucene.codecs.PushPostingsWriterBase.writeTerm(PushPostingsWriterBase.java:148)
       app//org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter$TermsWriter.write(BlockTreeTermsWriter.java:865)
       app//org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter.write(BlockTreeTermsWriter.java:344)
       app//org.apache.lucene.codecs.perfield.PerFieldPostingsFormat$FieldsWriter.write(PerFieldPostingsFormat.java:142)
       app//org.apache.lucene.index.FreqProxTermsWriter.flush(FreqProxTermsWriter.java:97)
       app//org.apache.lucene.index.DefaultIndexingChain.flush(DefaultIndexingChain.java:176)

These are AWS i3.2xlarge instances. Each node is general-purpose. Indexing rates are around 40k/sec (replicated), but can fall to around 5k/sec, which is when the snapshot above was taken.

How many indices and shards are you indexing into? What are your index settings? Is your data immutable? Which version are you using?

Use Case: logging
Version: 7.2.0
Number of Nodes: 13
Number of Active Indices: 1
Number of Shards: 39 (primary)

Index settings:

{
  "settings": {
    "index": {
      "mapping": {
        "total_fields": {
          "limit": "10000"
        },
        "ignore_malformed": "true"
      },
      "refresh_interval": "30s",
      "indexing": {
        "slowlog": {
          "level": "debug",
          "threshold": {
            "index": {
              "warn": "10s",
              "trace": "500ms",
              "debug": "2s",
              "info": "5s"
            }
          },
          "source": "500"
        }
      },
      "translog": {
        "flush_threshold_size": "1024MB"
      },
      "provided_name": "aws-logstash-log-2020.08.13",
      "query": {
        "default_field": "message"
      },
      "creation_date": "1597276800761",
      "number_of_replicas": "1",
      "uuid": "O0vK_wU8SwyaBMnKiruzCA",
      "version": {
        "created": "7020099"
      },
      "codec": "best_compression",
      "search": {
        "slowlog": {
          "level": "info",
          "threshold": {
            "fetch": {
              "warn": "1s",
              "trace": "200ms",
              "debug": "500ms",
              "info": "800ms"
            },
            "query": {
              "warn": "10s",
              "trace": "500ms",
              "debug": "2s",
              "info": "5s"
            }
          }
        }
      },
      "number_of_shards": "39"
    }
  },
  "defaults": {
    "index": {
      "max_inner_result_window": "100",
      "unassigned": {
        "node_left": {
          "delayed_timeout": "1m"
        }
      },
      "max_terms_count": "65536",
      "lifecycle": {
        "name": "",
        "rollover_alias": "",
        "indexing_complete": "false"
      },
      "routing_partition_size": "1",
      "force_memory_term_dictionary": "false",
      "max_docvalue_fields_search": "100",
      "merge": {
        "scheduler": {
          "max_thread_count": "4",
          "auto_throttle": "true",
          "max_merge_count": "9"
        },
        "policy": {
          "reclaim_deletes_weight": "2.0",
          "floor_segment": "2mb",
          "max_merge_at_once_explicit": "30",
          "max_merge_at_once": "10",
          "max_merged_segment": "5gb",
          "expunge_deletes_allowed": "10.0",
          "segments_per_tier": "10.0",
          "deletes_pct_allowed": "33.0"
        }
      },
      "max_refresh_listeners": "1000",
      "max_regex_length": "1000",
      "load_fixed_bitset_filters_eagerly": "true",
      "number_of_routing_shards": "1",
      "write": {
        "wait_for_active_shards": "1"
      },
      "verified_before_close": "false",
      "mapping": {
        "coerce": "false",
        "nested_fields": {
          "limit": "50"
        },
        "nested_objects": {
          "limit": "10000"
        },
        "depth": {
          "limit": "20"
        },
        "field_name_length": {
          "limit": "9223372036854775807"
        }
      },
      "source_only": "false",
      "soft_deletes": {
        "enabled": "false",
        "retention": {
          "operations": "0"
        },
        "retention_lease": {
          "period": "12h"
        }
      },
      "max_script_fields": "32",
      "query": {
        "parse": {
          "allow_unmapped_fields": "true"
        }
      },
      "format": "0",
      "frozen": "false",
      "sort": {
        "missing": [],
        "mode": [],
        "field": [],
        "order": []
      },
      "priority": "1",
      "max_rescore_window": "10000",
      "max_adjacency_matrix_filters": "100",
      "analyze": {
        "max_token_count": "10000"
      },
      "gc_deletes": "60s",
      "optimize_auto_generated_id": "true",
      "max_ngram_diff": "1",
      "translog": {
        "generation_threshold_size": "64mb",
        "sync_interval": "5s",
        "retention": {
          "size": "512mb",
          "age": "12h"
        },
        "durability": "REQUEST"
      },
      "auto_expand_replicas": "false",
      "mapper": {
        "dynamic": "true"
      },
      "requests": {
        "cache": {
          "enable": "true"
        }
      },
      "data_path": "",
      "highlight": {
        "max_analyzed_offset": "1000000"
      },
      "routing": {
        "rebalance": {
          "enable": "all"
        },
        "allocation": {
          "enable": "all",
          "total_shards_per_node": "-1"
        }
      },
      "search": {
        "idle": {
          "after": "30s"
        },
        "throttled": "false"
      },
      "fielddata": {
        "cache": "node"
      },
      "default_pipeline": "_none",
      "max_slices_per_scroll": "1024",
      "shard": {
        "check_on_startup": "false"
      },
      "xpack": {
        "watcher": {
          "template": {
            "version": ""
          }
        },
        "version": "",
        "ccr": {
          "following_index": "false"
        }
      },
      "percolator": {
        "map_unmapped_fields_as_text": "false"
      },
      "allocation": {
        "max_retries": "5"
      },
      "indexing": {
        "slowlog": {
          "reformat": "true"
        }
      },
      "compound_format": "0.1",
      "blocks": {
        "metadata": "false",
        "read": "false",
        "read_only_allow_delete": "false",
        "read_only": "false",
        "write": "false"
      },
      "max_result_window": "10000",
      "store": {
        "stats_refresh_interval": "10s",
        "type": "",
        "fs": {
          "fs_lock": "native"
        },
        "preload": []
      },
      "queries": {
        "cache": {
          "enabled": "true"
        }
      },
      "warmer": {
        "enabled": "true"
      },
      "max_shingle_diff": "3",
      "query_string": {
        "lenient": "false"
      }
    }
  }
}

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.