Why elasticsearch's query time per shard take little while the total query time is long

Elasticserach cluster has 5 nodes with 16 cores and 64 gb memory each,boot heap size is 24GB. elasticsearch version 5.6.16. the index size is 150G(exclude replica), 300 million documents,5 primary shard, 2 replica.

  • index mapping
    {
"properties": {
        "EventId": {
          "type": "keyword"
        },
        "Time": {
          "type": "long"
        },
        "SourceId": {
          "type": "keyword"
        },
        "Severity": {
          "type": "integer"
        },
        "Message": {
          "type": "text",
          "norms": false,
          "index_options": "docs"
        },
        "Area1": {
          "type": "keyword"
        },
        "Site": {
          "type": "keyword"
        }
    }
}
  • query DSL:
    {
    "profile":true,
    "from": 0,
    "size": 2000,
    "query": {
        "bool": {
            "filter": [{
                "range": {
                    "Time": {
                        "gte": 1564934400000,
                        "lte": 1565020800000
                    }
                }
            },
            {
                "term": {
                    "Area1": "ns=9310;s=POW.arAlHjOD"
                }
            },
            {
                "range": {
                    "Severity": {
                        "from": 0,
                        "to": 100
                    }
                }
            },
            {
                "match": {
                    "Message": "alarm"
                }
            }
            ],
            "must_not": [{
                "term": {
                    "Site": "ns=9490;s=PWE.iFjpfTWp"
                }
            }]
        }
    },
    "sort": [{
        "Time": {
            "order": "desc"
        }
    },
    {
        "EventId": {
            "order": "desc"
        }
    }]
}
  • profile result
    {
	"took": 5103,
	"timed_out": false,
	"_shards": {
		"total": 5,
		"successful": 5,
		"skipped": 0,
		"failed": 0
	},
	"hits": {
		"total": 1975,
		"max_score": null,
		"hits": []
	},
	"profile": {
		"shards": [{
			"id": "[PmkW-Sw-ThOLEmwpSeMMlA][events-history-201909][1]",
			"searches": [{
				"query": [{
					"type": "BooleanQuery",
					"description": "-Site:ns=9490;s=PWE.iFjpfTWp #Time:[1569345000000 TO 1569431400000] #Area1:ns=9310;s=POW.arAlHjOD #Severity:[0 TO 100] #Message:alarm",
					"time": "45.83703800ms",
					"time_in_nanos": 45837038,
					"children": [{
						"type": "TermQuery",
						"description": "Site:ns=9490;s=PWE.iFjpfTWp",
						"time": "0.7859000000ms",
						"time_in_nanos": 785900,
						
					},
					{
						"type": "IndexOrDocValuesQuery",
						"description": "Time:[1569345000000 TO 1569431400000]",
						"time": "64.51731000ms",
						"time_in_nanos": 64517310,
						
					},
					{
						"type": "TermQuery",
						"description": "Area1:ns=9310;s=POW.arAlHjOD",
						"time": "2.630864000ms",
						"time_in_nanos": 2630864,
						
					},
					{
						"type": "IndexOrDocValuesQuery",
						"description": "Severity:[0 TO 100]",
						"time": "2.076803000ms",
						"time_in_nanos": 2076803,
						
					},
					{
						"type": "TermQuery",
						"description": "Message:alarm",
						"time": "2.590450000ms",
						"time_in_nanos": 2590450,
						
					}]
				}],
				"rewrite_time": 10471,
				"collector": [{
					"name": "CancellableCollector",
					"reason": "search_cancelled",
					"time": "1.285638000ms",
					"time_in_nanos": 1285638,
					
				}]
			}],
			"aggregations": []
		},
		{
			"id": "[PmkW-Sw-ThOLEmwpSeMMlA][events-history-201909][2]",
			"searches": [{
				"query": [{
					"type": "BooleanQuery",
					"description": "-Site:ns=9490;s=PWE.iFjpfTWp #Time:[1569345000000 TO 1569431400000] #Area1:ns=9310;s=POW.arAlHjOD #Severity:[0 TO 100] #Message:alarm",
					"time": "18.19332900ms",
					"time_in_nanos": 18193329,
					"children": [{
						"type": "TermQuery",
						"description": "Site:ns=9490;s=PWE.iFjpfTWp",
						"time": "0.5694900000ms",
						"time_in_nanos": 569490,
						
					},
					{
						"type": "IndexOrDocValuesQuery",
						"description": "Time:[1569345000000 TO 1569431400000]",
						"time": "2.403989000ms",
						"time_in_nanos": 2403989,
						
					},
					{
						"type": "TermQuery",
						"description": "Area1:ns=9310;s=POW.arAlHjOD",
						"time": "12.52754700ms",
						"time_in_nanos": 12527547,
						
					},
					{
						"type": "IndexOrDocValuesQuery",
						"description": "Severity:[0 TO 100]",
						"time": "1.327001000ms",
						"time_in_nanos": 1327001,
						
					},
					{
						"type": "TermQuery",
						"description": "Message:alarm",
						"time": "1.660747000ms",
						"time_in_nanos": 1660747,
						
					}]
				}],
				"rewrite_time": 5156,
				"collector": [{
					"name": "CancellableCollector",
					"reason": "search_cancelled",
					"time": "1.262665000ms",
					"time_in_nanos": 1262665,
					
				}]
			}],
			"aggregations": []
		},
		{
			"id": "[cB5z4kUSTYaYam5XUGcPmA][events-history-201909][0]",
			"searches": [{
				"query": [{
					"type": "BooleanQuery",
					"description": "-Site:ns=9490;s=PWE.iFjpfTWp #Time:[1569345000000 TO 1569431400000] #Area1:ns=9310;s=POW.arAlHjOD #Severity:[0 TO 100] #Message:alarm",
					"time": "116.4847370ms",
					"time_in_nanos": 116484737,
					"children": [{
						"type": "TermQuery",
						"description": "Site:ns=9490;s=PWE.iFjpfTWp",
						"time": "0.5082780000ms",
						"time_in_nanos": 508278,
						
					},
					{
						"type": "IndexOrDocValuesQuery",
						"description": "Time:[1569345000000 TO 1569431400000]",
						"time": "35.07663200ms",
						"time_in_nanos": 35076632,
						
					},
					{
						"type": "TermQuery",
						"description": "Area1:ns=9310;s=POW.arAlHjOD",
						"time": "62.93598600ms",
						"time_in_nanos": 62935986,
						
					},
					{
						"type": "IndexOrDocValuesQuery",
						"description": "Severity:[0 TO 100]",
						"time": "1.492956000ms",
						"time_in_nanos": 1492956,
						
					},
					{
						"type": "TermQuery",
						"description": "Message:alarm",
						"time": "1.748266000ms",
						"time_in_nanos": 1748266,
						
					}]
				}],
				"rewrite_time": 5547,
				"collector": [{
					"name": "CancellableCollector",
					"reason": "search_cancelled",
					"time": "1.763664000ms",
					"time_in_nanos": 1763664,
					
				}]
			}],
			"aggregations": []
		},
		{
			"id": "[hJ6tl3mvSHiyJrI2H_foFQ][events-history-201909][3]",
			"searches": [{
				"query": [{
					"type": "BooleanQuery",
					"description": "-Site:ns=9490;s=PWE.iFjpfTWp #Time:[1569345000000 TO 1569431400000] #Area1:ns=9310;s=POW.arAlHjOD #Severity:[0 TO 100] #Message:alarm",
					"time": "43.90771200ms",
					"time_in_nanos": 43907712,
					"children": [{
						"type": "TermQuery",
						"description": "Site:ns=9490;s=PWE.iFjpfTWp",
						"time": "0.5759190000ms",
						"time_in_nanos": 575919,
						
					},
					{
						"type": "IndexOrDocValuesQuery",
						"description": "Time:[1569345000000 TO 1569431400000]",
						"time": "46.78477100ms",
						"time_in_nanos": 46784771,
						
					},
					{
						"type": "TermQuery",
						"description": "Area1:ns=9310;s=POW.arAlHjOD",
						"time": "1.522379000ms",
						"time_in_nanos": 1522379,
						
					},
					{
						"type": "IndexOrDocValuesQuery",
						"description": "Severity:[0 TO 100]",
						"time": "1.848593000ms",
						"time_in_nanos": 1848593,
						
					},
					{
						"type": "TermQuery",
						"description": "Message:alarm",
						"time": "2.170097000ms",
						"time_in_nanos": 2170097,
						
					}]
				}],
				"rewrite_time": 11487,
				"collector": [{
					"name": "CancellableCollector",
					"reason": "search_cancelled",
					"time": "4.127291000ms",
					"time_in_nanos": 4127291,
					
				}]
			}],
			"aggregations": []
		},
		{
			"id": "[uvVmQP6FTqubRRszz4ubIA][events-history-201909][4]",
			"searches": [{
				"query": [{
					"type": "BooleanQuery",
					"description": "-Site:ns=9490;s=PWE.iFjpfTWp #Time:[1569345000000 TO 1569431400000] #Area1:ns=9310;s=POW.arAlHjOD #Severity:[0 TO 100] #Message:alarm",
					"time": "59.86074200ms",
					"time_in_nanos": 59860742,
					"children": [{
						"type": "TermQuery",
						"description": "Site:ns=9490;s=PWE.iFjpfTWp",
						"time": "0.5899410000ms",
						"time_in_nanos": 589941,
						
					},
					{
						"type": "IndexOrDocValuesQuery",
						"description": "Time:[1569345000000 TO 1569431400000]",
						"time": "63.97123300ms",
						"time_in_nanos": 63971233,
						
					},
					{
						"type": "TermQuery",
						"description": "Area1:ns=9310;s=POW.arAlHjOD",
						"time": "1.385124000ms",
						"time_in_nanos": 1385124,
						
					},
					{
						"type": "IndexOrDocValuesQuery",
						"description": "Severity:[0 TO 100]",
						"time": "1.828161000ms",
						"time_in_nanos": 1828161,
						
					},
					{
						"type": "TermQuery",
						"description": "Message:alarm",
						"time": "1.892925000ms",
						"time_in_nanos": 1892925,
						
					}]
				}],
				"rewrite_time": 5311,
				"collector": [{
					"name": "CancellableCollector",
					"reason": "search_cancelled",
					"time": "1.171450000ms",
					"time_in_nanos": 1171450,
					
				}]
			}]
		}]
	}
}

I believe this is caused by:

"size": 2000,

You are asking basically to fetch 2000 documents per shard, send them back over the network to the coordinating node, rearrange the 10000 documents to take only the "best" 2000 and then send them to the client.

Depending on the size of your json documents, disk reads and network transfert might be the cost.
Are you using SSD drives?

What happens if you set size to 0 or size to 10? Can you see any difference?

yes ,use the ssd driver.
After changing the query size to 200, the query time will become 2600ms, the size of each document is 500 bytes, 2000 documents is almost 1MB, and the network bandwidth is 1000Mbps,I guess the network transfer should not be the bottleneck.

the first query is very slow, the second query starts to become very fast, I tried to execute the command echo 3> /proc/sys/vm/drop_caches to clear the operating system cache, but the query is still very fast

So what's wrong?

My question is when I cleared the operating system cache, shouldn't the query speed be slow as the first query,,why is it still very fast? thank s

Did you restart elasticsearch as well? All the nodes?

Did not restart all nodes, just cleared the operating system cache of each node and execute the command _cache/clear .

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.