Hi, I am trying to reduce the disk space that the `_source` field uses for my dense vectors, but I haven't had any luck so far. I tried turning on synthetic source, but I then noticed a new `_recovery_source` field taking up about 4x the space of my vectors. I tried both shortening the soft-delete retention using `index.soft_deletes.retention_lease.period` and force-merging my index using
client.indices.forcemerge(index=my_index)
Neither of these methods helps - the `_recovery_source` field is still there even a few days after the index was force-merged. What should I look into here?
This is costing us a horrible amount of budget because Elasticsearch is storing about 5x what it is supposed to store, and so far there is no obvious documentation on how to optimize this. We are considering other vector DB providers if we can't solve this issue.
Appendix:
My server version: v8.17.2
Disk usage breakdown:
{
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"54.346_embedding_1740616609": {
"store_size": "11.9mb",
"store_size_in_bytes": 12546243,
"all_fields": {
"total": "11.9mb",
"total_in_bytes": 12527444,
"inverted_index": {
"total": "12.4kb",
"total_in_bytes": 12753
},
"stored_fields": "9mb",
"stored_fields_in_bytes": 9444229,
"doc_values": "446b",
"doc_values_in_bytes": 446,
"points": "613b",
"points_in_bytes": 613,
"norms": "0b",
"norms_in_bytes": 0,
"term_vectors": "0b",
"term_vectors_in_bytes": 0,
"knn_vectors": "2.9mb",
"knn_vectors_in_bytes": 3069403
},
"fields": {
"__soft_deletes": {
"total": "18b",
"total_in_bytes": 18,
"inverted_index": {
"total": "0b",
"total_in_bytes": 0
},
"stored_fields": "0b",
"stored_fields_in_bytes": 0,
"doc_values": "18b",
"doc_values_in_bytes": 18,
"points": "0b",
"points_in_bytes": 0,
"norms": "0b",
"norms_in_bytes": 0,
"term_vectors": "0b",
"term_vectors_in_bytes": 0,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"_id": {
"total": "20.5kb",
"total_in_bytes": 21025,
"inverted_index": {
"total": "10.4kb",
"total_in_bytes": 10667
},
"stored_fields": "10.1kb",
"stored_fields_in_bytes": 10358,
"doc_values": "0b",
"doc_values_in_bytes": 0,
"points": "0b",
"points_in_bytes": 0,
"norms": "0b",
"norms_in_bytes": 0,
"term_vectors": "0b",
"term_vectors_in_bytes": 0,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"_primary_term": {
"total": "0b",
"total_in_bytes": 0,
"inverted_index": {
"total": "0b",
"total_in_bytes": 0
},
"stored_fields": "0b",
"stored_fields_in_bytes": 0,
"doc_values": "0b",
"doc_values_in_bytes": 0,
"points": "0b",
"points_in_bytes": 0,
"norms": "0b",
"norms_in_bytes": 0,
"term_vectors": "0b",
"term_vectors_in_bytes": 0,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"_recovery_source": {
"total": "8.9mb",
"total_in_bytes": 9416321,
"inverted_index": {
"total": "0b",
"total_in_bytes": 0
},
"stored_fields": "8.9mb",
"stored_fields_in_bytes": 9416321,
"doc_values": "0b",
"doc_values_in_bytes": 0,
"points": "0b",
"points_in_bytes": 0,
"norms": "0b",
"norms_in_bytes": 0,
"term_vectors": "0b",
"term_vectors_in_bytes": 0,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"_seq_no": {
"total": "1012b",
"total_in_bytes": 1012,
"inverted_index": {
"total": "0b",
"total_in_bytes": 0
},
"stored_fields": "0b",
"stored_fields_in_bytes": 0,
"doc_values": "399b",
"doc_values_in_bytes": 399,
"points": "613b",
"points_in_bytes": 613,
"norms": "0b",
"norms_in_bytes": 0,
"term_vectors": "0b",
"term_vectors_in_bytes": 0,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"_source": {
"total": "17.1kb",
"total_in_bytes": 17550,
"inverted_index": {
"total": "0b",
"total_in_bytes": 0
},
"stored_fields": "17.1kb",
"stored_fields_in_bytes": 17550,
"doc_values": "0b",
"doc_values_in_bytes": 0,
"points": "0b",
"points_in_bytes": 0,
"norms": "0b",
"norms_in_bytes": 0,
"term_vectors": "0b",
"term_vectors_in_bytes": 0,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"_version": {
"total": "29b",
"total_in_bytes": 29,
"inverted_index": {
"total": "0b",
"total_in_bytes": 0
},
"stored_fields": "0b",
"stored_fields_in_bytes": 0,
"doc_values": "29b",
"doc_values_in_bytes": 29,
"points": "0b",
"points_in_bytes": 0,
"norms": "0b",
"norms_in_bytes": 0,
"term_vectors": "0b",
"term_vectors_in_bytes": 0,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"content_id": {
"total": "2kb",
"total_in_bytes": 2086,
"inverted_index": {
"total": "2kb",
"total_in_bytes": 2086
},
"stored_fields": "0b",
"stored_fields_in_bytes": 0,
"doc_values": "0b",
"doc_values_in_bytes": 0,
"points": "0b",
"points_in_bytes": 0,
"norms": "0b",
"norms_in_bytes": 0,
"term_vectors": "0b",
"term_vectors_in_bytes": 0,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"embedding": {
"total": "2.9mb",
"total_in_bytes": 3069403,
"inverted_index": {
"total": "0b",
"total_in_bytes": 0
},
"stored_fields": "0b",
"stored_fields_in_bytes": 0,
"doc_values": "0b",
"doc_values_in_bytes": 0,
"points": "0b",
"points_in_bytes": 0,
"norms": "0b",
"norms_in_bytes": 0,
"term_vectors": "0b",
"term_vectors_in_bytes": 0,
"knn_vectors": "2.9mb",
"knn_vectors_in_bytes": 3069403
}
}
}
}