SearchContextMissingException No search context found for id on bulk query

I'm experiencing an issue on 5.6.4 during a bulk search composed of several match queries, over an index that is not being written to.

Almost all queries in the batch end up timing out (after 1h), and I can't grasp the cause of this, the cluster has enough heap and RAM, CPU usage is below 25% and path.data points to a RAID0 mount point that iotop reports reading at 600MB/s during the search.

The log has several entries for RemoteTransportException (...) Caused by: SearchContextMissingException: No search context found for id..._

Here's a log excerpt:

[2019-01-08T22:39:00,784][INFO ][o.e.n.Node ] [esnode1] started [2019-01-08T22:39:00,999][INFO ][o.e.g.GatewayService ] [esnode1] recovered [1] indices into cluster_state [2019-01-08T22:39:10,035][INFO ][o.e.c.r.a.AllocationService] [esnode1] Cluster health status changed from [RED] to [GREEN] (reason: [shards started [[wiki1][3]] ...]). [2019-01-08T22:40:32,824][INFO ][o.e.m.j.JvmGcMonitorService] [esnode1] [gc][95] overhead, spent [253ms] collecting in the last [1s] [2019-01-08T23:04:20,700][DEBUG][o.e.a.s.TransportSearchAction] [esnode1] [919] Failed to execute fetch phase org.elasticsearch.transport.RemoteTransportException: [esnode1][127.0.0.1:9300][indices:data/read/search[phase/fetch/id]] Caused by: org.elasticsearch.search.SearchContextMissingException: No search context found for id [919] at org.elasticsearch.search.SearchService.findContext(SearchService.java:447) ~[elasticsearch-5.6.4.jar:5.6.4] at org.elasticsearch.search.SearchService.executeFetchPhase(SearchService.java:414) ~[elasticsearch-5.6.4.jar:5.6.4] at org.elasticsearch.action.search.SearchTransportService$12.messageReceived(SearchTransportService.java:407) ~[elasticsearch-5.6.4.jar:5.6.4] at org.elasticsearch.action.search.SearchTransportService$12.messageReceived(SearchTransportService.java:404) ~[elasticsearch-5.6.4.jar:5.6.4] at org.elasticsearch.transport.RequestHandlerRegistry.processMessageReceived(RequestHandlerRegistry.java:69) ~[elasticsearch-5.6.4.jar:5.6.4] at org.elasticsearch.transport.TransportService$7.doRun(TransportService.java:654) [elasticsearch-5.6.4.jar:5.6.4] at org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:638) [elasticsearch-5.6.4.jar:5.6.4] at org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) [elasticsearch-5.6.4.jar:5.6.4] at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_191] at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_191] at java.lang.Thread.run(Thread.java:748) [?:1.8.0_191] [2019-01-08T23:06:06,766][DEBUG][o.e.a.s.TransportSearchAction] [esnode1] [775] Failed to execute fetch phase org.elasticsearch.transport.RemoteTransportException: [esnode1][127.0.0.1:9300][indices:data/read/search[phase/fetch/id]] Caused by: org.elasticsearch.search.SearchContextMissingException: No search context found for id [775] at org.elasticsearch.search.SearchService.findContext(SearchService.java:447) ...

Cluster stats:

{
	"_nodes": {
		"total": 1,
		"successful": 1,
		"failed": 0
	},
	"cluster_name": "escluster1",
	"timestamp": 1546987424259,
	"status": "green",
	"indices": {
		"count": 1,
		"shards": {
			"total": 10,
			"primaries": 10,
			"replication": 0.0,
			"index": {
				"shards": {
					"min": 10,
					"max": 10,
					"avg": 10.0
				},
				"primaries": {
					"min": 10,
					"max": 10,
					"avg": 10.0
				},
				"replication": {
					"min": 0.0,
					"max": 0.0,
					"avg": 0.0
				}
			}
		},
		"docs": {
			"count": 5746677,
			"deleted": 0
		},
		"store": {
			"size": "181.5gb",
			"size_in_bytes": 194959969930,
			"throttle_time": "0s",
			"throttle_time_in_millis": 0
		},
		"fielddata": {
			"memory_size": "0b",
			"memory_size_in_bytes": 0,
			"evictions": 0
		},
		"query_cache": {
			"memory_size": "0b",
			"memory_size_in_bytes": 0,
			"total_count": 0,
			"hit_count": 0,
			"miss_count": 0,
			"cache_size": 0,
			"cache_count": 0,
			"evictions": 0
		},
		"completion": {
			"size": "0b",
			"size_in_bytes": 0
		},
		"segments": {
			"count": 293,
			"memory": "897.2mb",
			"memory_in_bytes": 940811637,
			"terms_memory": "893.8mb",
			"terms_memory_in_bytes": 937313273,
			"stored_fields_memory": "3.2mb",
			"stored_fields_memory_in_bytes": 3415152,
			"term_vectors_memory": "0b",
			"term_vectors_memory_in_bytes": 0,
			"norms_memory": "54.9kb",
			"norms_memory_in_bytes": 56256,
			"points_memory": "0b",
			"points_memory_in_bytes": 0,
			"doc_values_memory": "26.3kb",
			"doc_values_memory_in_bytes": 26956,
			"index_writer_memory": "0b",
			"index_writer_memory_in_bytes": 0,
			"version_map_memory": "0b",
			"version_map_memory_in_bytes": 0,
			"fixed_bit_set": "0b",
			"fixed_bit_set_memory_in_bytes": 0,
			"max_unsafe_auto_id_timestamp": -1,
			"file_sizes": {
				
			}
		}
	},
	"nodes": {
		"count": {
			"total": 1,
			"data": 1,
			"coordinating_only": 0,
			"master": 1,
			"ingest": 1
		},
		"versions": ["5.6.4"],
		"os": {
			"available_processors": 4,
			"allocated_processors": 4,
			"names": [{
				"name": "Linux",
				"count": 1
			}],
			"mem": {
				"total": "15.3gb",
				"total_in_bytes": 16430301184,
				"free": "171.7mb",
				"free_in_bytes": 180113408,
				"used": "15.1gb",
				"used_in_bytes": 16250187776,
				"free_percent": 1,
				"used_percent": 99
			}
		},
		"process": {
			"cpu": {
				"percent": 7
			},
			"open_file_descriptors": {
				"min": 214,
				"max": 214,
				"avg": 214
			}
		},
		"jvm": {
			"max_uptime": "1h",
			"max_uptime_in_millis": 3900907,
			"versions": [{
				"version": "1.8.0_191",
				"vm_name": "OpenJDK 64-Bit Server VM",
				"vm_version": "25.191-b12",
				"vm_vendor": "Oracle Corporation",
				"count": 1
			}],
			"mem": {
				"heap_used": "1gb",
				"heap_used_in_bytes": 1154171376,
				"heap_max": "7.9gb",
				"heap_max_in_bytes": 8555069440
			},
			"threads": 45
		},
		"fs": {
			"total": "294.9gb",
			"total_in_bytes": 316734971904,
			"free": "113.3gb",
			"free_in_bytes": 121704624128,
			"available": "98.3gb",
			"available_in_bytes": 105591787520,
			"spins": "true"
		},
		"plugins": [],
		"network_types": {
			"transport_types": {
				"netty4": 1
			},
			"http_types": {
				"netty4": 1
			}
		}
	}
}

After more testing, this issue seems to be due to index data being stored in a software (mdadm) RAID-0 array composed of cloud storage volumes (e.g. EBS).

The same queries run fine on the same server, with the same settings, over the same index data, as long as the data is stored in a single volume. When using an mdadm RAID-0 array, the log is filled with the errors I previously described. Tested on two different mdadm arrays that had been successfully benchmarked with FIO.

Is there a known limitation with using ES on storage volumes belonging to an mdadm array?

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.