First of all, thanks for the quick response!
In my current ES cluster I get about 16 QPS in production with good results. I want to migrate to a cluster that can sustain 100 QPS, so I am load testing a new one with Rally to see how to get there. The current cluster has only 5 nodes and 5 GPUs per data node, with no replicas. The test cluster has the configuration mentioned above.
Here is my track:
{
  "short-description": "Max QPS load test",
  "description": "Progressive load test to find maximum sustainable throughput",
  "indices": [
    {
      "name": "data-v0.0.7_2025_11_06",
      "auto-managed": false
    }
  ],
  "operations": [
    {
      "name": "hybrid-search",
      "operation-type": "search",
      "param-source": "query-file-source",
      "index": "data-v0.0.7_2025_11_06",
      "cache": false
    }
  ],
  "schedule": [
    {
      "name": "hybrid-search-1-client",
      "operation": "hybrid-search",
      "warmup-time-period": 10,
      "time-period": 60,
      "clients": 1
    },
    {
      "name": "hybrid-search-2-clients",
      "operation": "hybrid-search",
      "warmup-time-period": 10,
      "time-period": 60,
      "clients": 2
    },
    {
      "name": "hybrid-search-4-clients",
      "operation": "hybrid-search",
      "warmup-time-period": 10,
      "time-period": 60,
      "clients": 4
    },
    {
      "name": "hybrid-search-8-clients",
      "operation": "hybrid-search",
      "warmup-time-period": 10,
      "time-period": 60,
      "clients": 8
    },
    {
      "name": "hybrid-search-16-clients",
      "operation": "hybrid-search",
      "warmup-time-period": 10,
      "time-period": 60,
      "clients": 16
    },
    {
      "name": "hybrid-search-32-clients",
      "operation": "hybrid-search",
      "warmup-time-period": 10,
      "time-period": 60,
      "clients": 32
    },
    {
      "name": "hybrid-search-64-clients",
      "operation": "hybrid-search",
      "warmup-time-period": 10,
      "time-period": 60,
      "clients": 64
    },
    {
      "name": "hybrid-search-128-clients",
      "operation": "hybrid-search",
      "warmup-time-period": 10,
      "time-period": 60,
      "clients": 128
    }
  ]
}
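By the way, my plan is that once I find the saturation point from the ramp above, I re-run a single fixed-rate task against it. If I read the Rally docs right, the "target-throughput" schedule property (ops/s across all clients) is the way to do that; a rough sketch of the extra schedule entry (the numbers are placeholders):
{
  "name": "hybrid-search-fixed-rate",
  "operation": "hybrid-search",
  "warmup-time-period": 60,
  "time-period": 300,
  "clients": 16,
  "target-throughput": 100
}
Without "target-throughput", Rally just goes as fast as the clients allow, which is what the ramp above is measuring.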
An example query is this one, with the 4096-dimensional vector truncated:
{"body": {"size": 100, "explain": false, "_source": {"includes": ["chunk_id", "document_id", "datastore_id", "date_created", "document_name", "chunk", "metadata.chunk_size", "metadata.file_format", "metadata.page", "metadata.coordinates", "metadata.document_date_humanized", "metadata.extras.custom_metadata", "metadata.custom_metadata_config", "metadata.extras.section_id", "metadata.section_id", "metadata.extras.document_title", "metadata.extras.section_title", "metadata.extras.is_figure", "metadata.extras.file_name", "metadata.link", "metadata.extras.link", "metadata.type", "metadata.extras.type", "metadata.extras.next_page_chunk_locations"], "excludes": }, "query": {"bool": {"must": [{"nested": {"path": "metadata", "query": {"knn": {"field": "metadata.extras.embeddings_model_1", "query_vector": [0.013909942,...]], "num_candidates": 100}}, "score_mode": "max", "boost": 1.0}}], "filter": [{"term": {"store_id": "9784afae-4af4-44c2-a5d7-c24f51728b2c"}}], "boost": 1.0}}}}
and the track.py is:
import json
import os


def register(registry):
    registry.register_param_source("query-file-source", QueryFileParamSource)


class QueryFileParamSource:
    def __init__(self, track, params, **kwargs):
        self._index = params.get("index", "data-v0.0.7_2025_11_06")
        queries_file = os.path.join(track.root, "queries.json")
        self._queries = []
        with open(queries_file, "r") as f:
            for line in f:
                line = line.strip()
                if line:
                    self._queries.append(json.loads(line))
        self._index_pos = 0
        self._cache = params.get("cache", False)

    def partition(self, partition_index, total_partitions):
        return self

    def params(self):
        query = self._queries[self._index_pos % len(self._queries)]
        self._index_pos += 1
        return {"index": self._index, "cache": self._cache, "body": query["body"]}
and my index config, on ES 8.19, is:
"settings" : {
"index" : {
"routing" : {
"allocation" : {
"include" : {
"_tier_preference" : "data_content"
}
}
},
"refresh_interval" : null,
"number_of_shards" : "35",
"provided_name" : "data-2025_11_06",
"creation_date" : "1762919828038",
"analysis" : {
"filter" : {
"synonym_filter" : {
"format" : "wordnet",
"updateable" : "true",
"type" : "synonym",
"synonyms_path" : "/usr/share/elasticsearch/config/synonyms.txt",
"lenient" : "true"
},
"custom_synonym_filter_ffd7095b-7ceb-47a8-80d9-fa35cf03627e" : {
"type" : "synonym_graph",
"updateable" : "true",
"synonyms_set" : "custom_synonym_set_ffd7095b-7ceb-47a8-80d9-fa35cf03627e"
},
"shingles_filter" : {
"max_shingle_size" : "4",
"min_shingle_size" : "2",
"output_unigrams" : "true",
"type" : "shingle"
},
"possessive_english_filter" : {
"name" : "possessive_english",
"type" : "stemmer"
},
"stopwords_english_filter" : {
"type" : "stop",
"stopwords" : "_english_"
},
"custom_synonym_filter_d26b3403-f35a-48e1-a7ce-ed1f1352149d" : {
"type" : "synonym_graph",
"updateable" : "true",
"synonyms_set" : "custom_synonym_set_d26b3403-f35a-48e1-a7ce-ed1f1352149d"
},
"custom_synonym_filter_08d04d9e-8933-4784-b17a-ce8accd5a5c3" : {
"type" : "synonym_graph",
"updateable" : "true",
"synonyms_set" : "custom_synonym_set_08d04d9e-8933-4784-b17a-ce8accd5a5c3"
}
},
"analyzer" : {
"custom_analyzer" : {
"filter" : [
"lowercase",
"stopwords_english_filter",
"possessive_english_filter"
],
"type" : "custom",
"tokenizer" : "standard"
},
"custom_synonyms_analyzer_d26b3403-f35a-48e1-a7ce-ed1f1352149d" : {
"filter" : [
"lowercase",
"custom_synonym_filter_d26b3403-f35a-48e1-a7ce-ed1f1352149d",
"possessive_english_filter",
"porter_stem"
],
"type" : "custom",
"tokenizer" : "standard"
},
"custom_search_analyzer" : {
"filter" : [
"lowercase",
"stopwords_english_filter",
"possessive_english_filter",
"synonym_filter"
],
"type" : "custom",
"tokenizer" : "standard"
},
"custom_synonyms_analyzer_ffd7095b-7ceb-47a8-80d9-fa35cf03627e" : {
"filter" : [
"lowercase",
"custom_synonym_filter_ffd7095b-7ceb-47a8-80d9-fa35cf03627e",
"possessive_english_filter",
"porter_stem"
],
"type" : "custom",
"tokenizer" : "standard"
},
"edge_ngram_analyzer" : {
"filter" : [
"lowercase",
"possessive_english_filter"
],
"type" : "custom",
"tokenizer" : "edge_ngram_tokenizer"
},
"custom_analyzer_with_shingles" : {
"filter" : [
"lowercase",
"stopwords_english_filter",
"possessive_english_filter",
"shingle"
],
"type" : "custom",
"tokenizer" : "standard"
},
"custom_porter_stem_analyzer" : {
"filter" : [
"lowercase",
"stopwords_english_filter",
"possessive_english_filter",
"porter_stem"
],
"type" : "custom",
"tokenizer" : "standard"
},
"custom_synonyms_analyzer_08d04d9e-8933-4784-b17a-ce8accd5a5c3" : {
"filter" : [
"lowercase",
"custom_synonym_filter_08d04d9e-8933-4784-b17a-ce8accd5a5c3",
"possessive_english_filter",
"porter_stem"
],
"type" : "custom",
"tokenizer" : "standard"
}
},
"tokenizer" : {
"edge_ngram_tokenizer" : {
"token_chars" : [
"letter",
"digit"
],
"min_gram" : "3",
"type" : "edge_ngram",
"max_gram" : "6"
}
}
},
"number_of_replicas" : "2",
"uuid" : "JE5DpNYiQ5eFpirKNSWsIA",
"version" : {
"created" : "8503000"
}
}
}
}
}
I just finished a test with 20 nodes and got to 70 QPS, but that uses too many resources. I am thinking about simply running two separate ES clusters and load balancing between them myself (see the sketch below), since each one can reach 40 QPS with even less vertical resources. I know that reducing the number of _source fields I retrieve helps latency, as in the trimmed query sketch after the example query above, so maybe I will work on that, but if there are quicker wins they would be really helpful!
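For the two-cluster idea, the client-side balancing I have in mind is just a round robin between two Python Elasticsearch clients, along these lines (sketch only; the hostnames and the search_two_clusters helper are made up for illustration):
import itertools
from elasticsearch import Elasticsearch

# Hypothetical endpoints for the two smaller clusters.
CLUSTERS = [
    Elasticsearch("https://cluster-a.example.internal:9200"),
    Elasticsearch("https://cluster-b.example.internal:9200"),
]
_round_robin = itertools.cycle(CLUSTERS)

def search_two_clusters(index, query_body):
    # Alternate requests between the clusters; each request goes to exactly
    # one of them, so both clusters need a full, independently indexed copy
    # of the index.
    client = next(_round_robin)
    return client.search(index=index, body=query_body)
The obvious cost is that indexing has to be done twice, but reads should scale roughly with the number of clusters.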