I just upgraded our Elasticsearch cluster to 8.12.0.
I created indices with quantized vectors and started migrating data into them.
After a few hours of inserting data, the servers started reporting "marking and sending shard failed due to [shard failure, reason [merge failed]]"
errors for two of the indices with quantized vectors.
The cluster health degrades and then recovers within a few seconds.
So far, no merge errors have been reported for any other indices.
Index settings:
{
"index": {
"routing": {
"allocation": {
"include": {
"_tier_preference": "data_content"
},
"require": {
"data": "imgsearch"
}
}
},
"refresh_interval": "600s",
"number_of_shards": "10",
"translog": {
"flush_threshold_size": "1gb"
},
"provided_name": "INDEX",
"merge": {
"policy": {
"max_merge_at_once": "10",
"max_merged_segment": "20gb",
"segments_per_tier": "10",
"floor_segment": "10mb"
}
},
"creation_date": "1705578247309",
"number_of_replicas": "1",
"uuid": "mVXArByIT-aQgiwOEpA3_Q",
"version": {
"created": "8500008"
}
}
}
Mapping:
"vector": {
"type": "dense_vector",
"dims": 1024,
"index": true,
"similarity": "cosine",
"index_options": {
"type": "int8_hnsw",
"m": 16,
"ef_construction": 100
}
}
Log:
2024-01-19T13:44:29.987131191+08:00 stdout F {"@timestamp":"2024-01-19T05:44:29.986Z", "log.level": "WARN", "message":"failed engine [merge failed]", "ecs.version": "1.2.0","service.name":"ES_ECS","event.dataset":"elasticsearch.server","process.thread.name":"elasticsearch[es8-es-nodes-img-3][generic][T#2]","log.logger":"org.elasticsearch.index.engine.Engine","elasticsearch.cluster.uuid":"Plugwd1_QOKXhOHRjxHRcA","elasticsearch.node.id":"YKjgj5yoRH-k2XHSzlbxVg","elasticsearch.node.name":"es8-es-nodes-img-3","elasticsearch.cluster.name":"es8","tags":[" [INDEX][4]"],"error.type":"org.apache.lucene.index.MergePolicy$MergeException","error.message":"java.lang.IllegalStateException: this writer hit an unrecoverable error; cannot merge","error.stack_trace":"org.apache.lucene.index.MergePolicy$MergeException: java.lang.IllegalStateException: this writer hit an unrecoverable error; cannot merge\n\tat org.elasticsearch.server@8.12.0/org.elasticsearch.index.engine.InternalEngine$EngineMergeScheduler$2.doRun(InternalEngine.java:2853)\n\tat org.elasticsearch.server@8.12.0/org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:983)\n\tat org.elasticsearch.server@8.12.0/org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:26)\n\tat java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)\n\tat java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)\n\tat java.base/java.lang.Thread.run(Thread.java:1583)\nCaused by: java.lang.IllegalStateException: this writer hit an unrecoverable error; cannot merge\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.index.IndexWriter.hasPendingMerges(IndexWriter.java:2426)\n\tat org.elasticsearch.server@8.12.0/org.elasticsearch.index.engine.InternalEngine$EngineMergeScheduler.afterMerge(InternalEngine.java:2811)\n\tat 
org.elasticsearch.server@8.12.0/org.elasticsearch.index.engine.ElasticsearchConcurrentMergeScheduler.doMerge(ElasticsearchConcurrentMergeScheduler.java:123)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.index.ConcurrentMergeScheduler$MergeThread.run(ConcurrentMergeScheduler.java:700)\nCaused by: java.lang.NullPointerException: Cannot invoke \"org.apache.lucene.index.DocIDMerger$Sub.nextMappedDoc()\" because \"this.current\" is null\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.index.DocIDMerger$SequentialDocIDMerger.next(DocIDMerger.java:123)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.codecs.KnnVectorsWriter$MergedVectorValues$MergedFloat32VectorValues.nextDoc(KnnVectorsWriter.java:214)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.util.ScalarQuantizer.fromVectors(ScalarQuantizer.java:252)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsWriter.mergeAndRecalculateQuantiles(Lucene99ScalarQuantizedVectorsWriter.java:552)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsWriter.mergeQuantiles(Lucene99ScalarQuantizedVectorsWriter.java:381)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsWriter.mergeOneFieldToIndex(Lucene99ScalarQuantizedVectorsWriter.java:195)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsWriter.mergeOneField(Lucene99HnswVectorsWriter.java:341)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat$FieldsWriter.mergeOneField(PerFieldKnnVectorsFormat.java:122)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.codecs.KnnVectorsWriter.merge(KnnVectorsWriter.java:98)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.index.SegmentMerger.mergeVectorValues(SegmentMerger.java:255)\n\tat 
org.apache.lucene.core@9.9.1/org.apache.lucene.index.SegmentMerger.mergeWithLogging(SegmentMerger.java:298)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.index.SegmentMerger.merge(SegmentMerger.java:149)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.index.IndexWriter.mergeMiddle(IndexWriter.java:5245)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.index.IndexWriter.merge(IndexWriter.java:4733)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.index.IndexWriter$IndexWriterMergeSource.merge(IndexWriter.java:6534)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.index.ConcurrentMergeScheduler.doMerge(ConcurrentMergeScheduler.java:639)\n\tat org.elasticsearch.server@8.12.0/org.elasticsearch.index.engine.ElasticsearchConcurrentMergeScheduler.doMerge(ElasticsearchConcurrentMergeScheduler.java:118)\n\t... 1 more\n"}
2024-01-19T13:44:29.98749643+08:00 stdout F {"@timestamp":"2024-01-19T05:44:29.987Z", "log.level": "WARN", "message":"[INDEX][4] marking and sending shard failed due to [shard failure, reason [merge failed]]", "ecs.version": "1.2.0","service.name":"ES_ECS","event.dataset":"elasticsearch.server","process.thread.name":"elasticsearch[es8-es-nodes-img-3][generic][T#1]","log.logger":"org.elasticsearch.indices.cluster.IndicesClusterStateService","elasticsearch.cluster.uuid":"Plugwd1_QOKXhOHRjxHRcA","elasticsearch.node.id":"YKjgj5yoRH-k2XHSzlbxVg","elasticsearch.node.name":"es8-es-nodes-img-3","elasticsearch.cluster.name":"es8","error.type":"org.apache.lucene.index.MergePolicy$MergeException","error.message":"java.lang.IllegalStateException: this writer hit an unrecoverable error; cannot merge","error.stack_trace":"org.apache.lucene.index.MergePolicy$MergeException: java.lang.IllegalStateException: this writer hit an unrecoverable error; cannot merge\n\tat org.elasticsearch.server@8.12.0/org.elasticsearch.index.engine.InternalEngine$EngineMergeScheduler$2.doRun(InternalEngine.java:2853)\n\tat org.elasticsearch.server@8.12.0/org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:983)\n\tat org.elasticsearch.server@8.12.0/org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:26)\n\tat java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)\n\tat java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)\n\tat java.base/java.lang.Thread.run(Thread.java:1583)\nCaused by: java.lang.IllegalStateException: this writer hit an unrecoverable error; cannot merge\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.index.IndexWriter.hasPendingMerges(IndexWriter.java:2426)\n\tat org.elasticsearch.server@8.12.0/org.elasticsearch.index.engine.InternalEngine$EngineMergeScheduler.afterMerge(InternalEngine.java:2811)\n\tat 
org.elasticsearch.server@8.12.0/org.elasticsearch.index.engine.ElasticsearchConcurrentMergeScheduler.doMerge(ElasticsearchConcurrentMergeScheduler.java:123)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.index.ConcurrentMergeScheduler$MergeThread.run(ConcurrentMergeScheduler.java:700)\nCaused by: java.lang.NullPointerException: Cannot invoke \"org.apache.lucene.index.DocIDMerger$Sub.nextMappedDoc()\" because \"this.current\" is null\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.index.DocIDMerger$SequentialDocIDMerger.next(DocIDMerger.java:123)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.codecs.KnnVectorsWriter$MergedVectorValues$MergedFloat32VectorValues.nextDoc(KnnVectorsWriter.java:214)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.util.ScalarQuantizer.fromVectors(ScalarQuantizer.java:252)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsWriter.mergeAndRecalculateQuantiles(Lucene99ScalarQuantizedVectorsWriter.java:552)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsWriter.mergeQuantiles(Lucene99ScalarQuantizedVectorsWriter.java:381)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsWriter.mergeOneFieldToIndex(Lucene99ScalarQuantizedVectorsWriter.java:195)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsWriter.mergeOneField(Lucene99HnswVectorsWriter.java:341)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat$FieldsWriter.mergeOneField(PerFieldKnnVectorsFormat.java:122)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.codecs.KnnVectorsWriter.merge(KnnVectorsWriter.java:98)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.index.SegmentMerger.mergeVectorValues(SegmentMerger.java:255)\n\tat 
org.apache.lucene.core@9.9.1/org.apache.lucene.index.SegmentMerger.mergeWithLogging(SegmentMerger.java:298)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.index.SegmentMerger.merge(SegmentMerger.java:149)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.index.IndexWriter.mergeMiddle(IndexWriter.java:5245)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.index.IndexWriter.merge(IndexWriter.java:4733)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.index.IndexWriter$IndexWriterMergeSource.merge(IndexWriter.java:6534)\n\tat org.apache.lucene.core@9.9.1/org.apache.lucene.index.ConcurrentMergeScheduler.doMerge(ConcurrentMergeScheduler.java:639)\n\tat org.elasticsearch.server@8.12.0/org.elasticsearch.index.engine.ElasticsearchConcurrentMergeScheduler.doMerge(ElasticsearchConcurrentMergeScheduler.java:118)\n\t... 1 more\n"}
2024-01-19T13:49:19.557185254+08:00 stdout F {"@timestamp":"2024-01-19T05:49:19.556Z", "log.level": "INFO", "message":"primary-replica resync completed with 0 operations", "ecs.version": "1.2.0","service.name":"ES_ECS","event.dataset":"elasticsearch.server","process.thread.name":"elasticsearch[es8-es-nodes-img-3][generic][T#19]","log.logger":"org.elasticsearch.index.shard.IndexShard","elasticsearch.cluster.uuid":"Plugwd1_QOKXhOHRjxHRcA","elasticsearch.node.id":"YKjgj5yoRH-k2XHSzlbxVg","elasticsearch.node.name":"es8-es-nodes-img-3","elasticsearch.cluster.name":"es8","tags":[" [INDEX][4]"]}