In an index with no replicas and no ongoing writes, repeating the exact same kNN vector search request sometimes yields inconsistent results.
This issue is reproducible in versions 8.13.4, 8.15.1, and 8.17.0, but not in version 8.7.0, which suggests the regression was introduced somewhere after 8.7.0.
- create index
# Create a 1-shard, 0-replica index with a 1024-dim cosine-similarity
# dense_vector field indexed with HNSW (m=16, ef_construction=100).
curl --location --request PUT 'http://elasticsearch:9200/vector_test' \
--header 'Content-Type: application/json' \
--data '{
"mappings": {
"dynamic": "strict",
"properties": {
"vector": {
"type": "dense_vector",
"dims": 1024,
"index": true,
"similarity": "cosine",
"index_options": {
"type": "hnsw",
"m": 16,
"ef_construction": 100
}
}
}
},
"settings": {
"index": {
"routing": {
"allocation": {
"include": {
"_tier_preference": "data_content"
}
}
},
"refresh_interval": "30s",
"number_of_shards": "1",
"number_of_replicas": "0"
}
}
}'
- Write 10,000 documents with random vector values, then force a `_refresh`.
# -*- coding:utf-8 -*-
import json
import time
import numpy as np
import requests
# Endpoints of the test index; a shared Session reuses the HTTP connection
# across all bulk requests.
REFRESH_URL = 'http://elasticsearch:9200/vector_test/_refresh'
BULK_URL = 'http://elasticsearch:9200/vector_test/_bulk'
request = requests.session()
def float32_uniform(min_value, max_value):
    """Return one random value drawn uniformly from [min_value, max_value) as a plain Python float."""
    return float(np.random.uniform(min_value, max_value))
def write():
    """Bulk-index 10,000 random 1024-dim vectors in batches of 1,000, then refresh.

    Builds NDJSON bodies for the _bulk API, POSTs each full batch with a short
    pause between batches, flushes any trailing partial batch, and finally
    forces a _refresh so the documents become searchable.
    """
    lines = []  # pending NDJSON lines (one action + one source line per doc)
    count = 0   # documents accumulated in the current batch
    for id in range(10000):
        vector = [float32_uniform(-1, 1) for _ in range(1024)]
        data = {'vector': vector}
        lines.append('{"index":{"_id":"' + str(id) + '"}}\n' + json.dumps(data) + '\n')
        count += 1
        # Flush once the batch holds exactly 1,000 documents.
        # (The original checked `count == 1000` BEFORE incrementing, which
        # silently produced batches of 1,001 documents.)
        if count == 1000:
            res = request.post(url=BULK_URL, headers={"Content-Type": "application/x-ndjson"}, data=''.join(lines))
            print(res.text)
            lines = []
            count = 0
            time.sleep(0.2)  # brief pause so the cluster isn't hammered
    # Send any trailing partial batch.
    if lines:
        print(request.post(url=BULK_URL, headers={"Content-Type": "application/x-ndjson"}, data=''.join(lines)).json())
    request.post(REFRESH_URL)
    print("write success.")
# Run the bulk load only when executed as a script.
if __name__ == '__main__':
    write()
- Test to reproduce the issue. The experiment is repeated 100 times: each iteration constructs one random query vector and sends the identical search request 100 times.
# -*- coding:utf-8 -*-
import json
import numpy as np
import requests
# Search endpoint of the test index; the Session reuses the HTTP connection
# across the repeated identical requests.
SEARCH_URL = 'http://elasticsearch:9200/vector_test/_search'
request = requests.session()
def float32_uniform(min_value, max_value):
    """Draw a single uniform random number in [min_value, max_value) and return it as a Python float."""
    sample = np.random.uniform(low=min_value, high=max_value)
    return float(sample)
def request_test(loop_count, k, num_candidates):
    """Issue an identical kNN search `loop_count` times and measure result stability.

    One random 1024-dim query vector is generated, then the same request body
    is POSTed repeatedly; responses are bucketed by their serialized hit list.

    Args:
        loop_count: number of times the identical request is repeated.
        k: kNN `k` parameter.
        num_candidates: kNN `num_candidates` parameter.

    Returns:
        (base_count, error_count): occurrences of the most frequent hit list,
        and how many responses deviated from it.
    """
    vector = [float32_uniform(-1, 1) for _ in range(1024)]
    body = {"from": 0, "size": 10,
            "knn": {"field": "vector", "query_vector": vector, "k": k, "num_candidates": num_candidates},
            "_source": False}
    result_dict = {}
    for _ in range(loop_count):
        response = request.post(url=SEARCH_URL, json=body).json()
        hits = response['hits']['hits']
        # Serialize the hits so identical result lists bucket together.
        hits_str = json.dumps(hits, ensure_ascii=False)
        result_dict[hits_str] = result_dict.get(hits_str, 0) + 1
    # Take the most common response as the baseline; everything else counts as
    # an inconsistency. (Replaces the original sort-then-break idiom.)
    base_count = max(result_dict.values(), default=0)
    error_count = loop_count - base_count
    print('{}/{}'.format(base_count, error_count))
    return base_count, error_count
if __name__ == '__main__':
    # Repeat the experiment 100 times, accumulating how often the dominant
    # (assumed-correct) result was returned versus the total request count.
    success = total = 0
    for i in range(100):
        base, error_count = request_test(loop_count=100, k=10, num_candidates=20)
        success += base
        total += base + error_count
    print('{}/{}'.format(success, total))
Below are the test results from version 8.17.0, which show consistency issues; versions 8.13.4 and 8.15.1 also have the same problem.
The following are the test results from version 8.7.0, where the results are 100% consistent.
If the index is forcibly merged into a single segment with forcemerge, the results become stable again.