Random slow queries, with most time spent in futex

System Info

  1. Debian GNU/Linux 8.11 (jessie)
  2. ElasticSearch 2.3.5
  3. 128G RAM, 1T SSD
  4. Elasticsearch cluster: 122 nodes, 100 shards, data size close to 100G per shard (may be too big)

Detail
We use Elasticsearch in production, but some queries are randomly slow (P99 -> 200ms, P999 -> 5000ms). I guess it is related to the cache or something similar?
Furthermore, I used strace to inspect the thread, and found that almost all of the time is spent in futex:

strace -c -p 11872
% time seconds usecs/call calls errors syscall
#------ ----------- ----------- --------- --------- ----------------
99.78 2.110971 5441 388 55 futex
0.22 0.004741 0 39512 pread
0.00 0.000000 0 35 write
#------ ----------- ----------- --------- --------- ----------------
100.00 2.115712 39935 55 total

Question:

  1. Why is so much time spent in futex? It seems ES rarely uses mutexes.
  2. I would like some advice on how to reduce the random slow queries.

Thanks

update information
I've discovered that the query_cache has a problem. My index has 1 replica.
I've noticed that the query_cache hit rate on the primaries is about 20% (44765221451/214657219339),
but the total hit rate is only about 10% (49212601996/461818150070). Subtracting the primaries from the total, the replica shards alone have a hit rate of only about 2% (4447380545/247160930731), which is really low.
What is the reason? Can anyone give me some advice? Thanks.

Below are my index stats:
{
"image_index":{
"primaries":{
"docs":{
"count":2086786254,
"deleted":70025697
},
"store":{
"size_in_bytes":18458523357144,
"throttle_time_in_millis":0
},
"indexing":{
"index_total":2075625388,
"index_time_in_millis":4400528868,
"index_current":1038176,
"index_failed":3466045,
"delete_total":2,
"delete_time_in_millis":0,
"delete_current":0,
"noop_update_total":5428790,
"is_throttled":false,
"throttle_time_in_millis":0
},
"search":{
"open_contexts":281,
"query_total":401787945,
"query_time_in_millis":6438147591,
"query_current":148,
"fetch_total":349453018,
"fetch_time_in_millis":1100806195,
"fetch_current":1,
"scroll_total":54588,
"scroll_time_in_millis":10107052529,
"scroll_current":0
},
"merges":{
"current":2,
"current_docs":24845,
"current_size_in_bytes":234600157,
"total":371718,
"total_time_in_millis":18541380533,
"total_docs":6158738869,
"total_size_in_bytes":55118571635584,
"total_stopped_time_in_millis":48077,
"total_throttled_time_in_millis":16850677176,
"total_auto_throttle_in_bytes":524288000
},
"refresh":{
"total":1446338,
"total_time_in_millis":304514067
},
"flush":{
"total":36480,
"total_time_in_millis":26856479
},
"warmer":{
"current":0,
"total":3329153,
"total_time_in_millis":199601
},
"query_cache":{
"memory_size_in_bytes":6019662792,
"total_count":214657219339,
"hit_count":44765221451,
"miss_count":169891997888,
"cache_size":160237,
"cache_count":163216,
"evictions":2979
},
"segments":{
"count":6195,
"memory_in_bytes":12727327254,
"terms_memory_in_bytes":10749641354,
"stored_fields_memory_in_bytes":1879112864,
"term_vectors_memory_in_bytes":0,
"norms_memory_in_bytes":13196096,
"doc_values_memory_in_bytes":85376940,
"index_writer_memory_in_bytes":118923993,
"index_writer_max_memory_in_bytes":53687091200,
"version_map_memory_in_bytes":282017,
"fixed_bit_set_memory_in_bytes":0
} ...
},
"total":{
"docs":{
"count":4173572512,
"deleted":140174964
},
"store":{
"size_in_bytes":36916810920129,
"throttle_time_in_millis":0
},
"indexing":{
"index_total":2148402539,
"index_time_in_millis":4585290546,
"index_current":1284700,
"index_failed":5420934,
"delete_total":2,
"delete_time_in_millis":0,
"delete_current":0,
"noop_update_total":5428790,
"is_throttled":false,
"throttle_time_in_millis":0
},
"search":{
"open_contexts":595,
"query_total":803304861,
"query_time_in_millis":14959068059,
"query_current":411,
"fetch_total":698641431,
"fetch_time_in_millis":2032548499,
"fetch_current":4,
"scroll_total":90797,
"scroll_time_in_millis":17631387614,
"scroll_current":0
},
"merges":{
"current":4,
"current_docs":238619,
"current_size_in_bytes":2063610536,
"total":490320,
"total_time_in_millis":19115685770,
"total_docs":6419529805,
"total_size_in_bytes":57346589363548,
"total_stopped_time_in_millis":72365,
"total_throttled_time_in_millis":17333190273,
"total_auto_throttle_in_bytes":1048576000
},
"refresh":{
"total":2221358,
"total_time_in_millis":341537448
},
"flush":{
"total":47732,
"total_time_in_millis":28205924
},
"warmer":{
"current":0,
"total":5015007,
"total_time_in_millis":301897
},
"query_cache":{
"memory_size_in_bytes":6530819480,
"total_count":461818150070,
"hit_count":49212601996,
"miss_count":412605548074,
"cache_size":184909,
"cache_count":187904,
"evictions":2995
} ...
},

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.