Hello!
While investigating the performance of time range queries, I found some surprising things, and was wondering if anyone had insights to share as to what optimisations or heuristics happen behind the scenes that might explain them (and perhaps ideas about the best way to build such queries).
For a bit of context: we make heavy use of time-sorted indices and time range filters, and at least in some cases it seems that the range filter slows queries down significantly. While investigating whether the improvements in ES 7.6 would help us (with that and in general), I still found that range queries could be problematic and was puzzled by how they behaved.
I then worked out this contrived but minimal example highlighting the situation, in the hope that it can be reproduced and explained. I'm pasting a Go program further below.
Essentially, it sets up an index with a single @timestamp date field and index sorting turned on, then fills it with documents whose value is picked uniformly at random within some fixed time range.
Hopefully this table of results speaks for itself; I'm happy to provide any further clarification otherwise. Times are in milliseconds. The documents are all in the range [2020-01-01, 2020-02-01), and the date on each row is the upper limit set in the query. The columns "profiling: false/true" really do mean that profiling of the search is turned off/on! This is using ES 7.6.1.
The absolute performance of course varies with the number of documents, size of the ES node(s), etc. but the qualitative behaviour seems consistent.
testing query
{
"range": {
"@timestamp": {
"from": null,
"include_lower": true,
"include_upper": false,
"to": "2020-01-01"
}
}
}
date \ profiling false true
2019-09-23 0.0 0.0
2020-01-01 0.2 0.0
2020-01-04 4.3 6.9
2020-01-07 11.1 11.4
2020-01-10 15.2 15.1
2020-01-13 20.3 16.1
2020-01-16 20.2 19.6
2020-01-19 21.8 27.0
2020-01-22 16.1 15.8
2020-01-25 10.1 14.0
2020-01-28 6.5 7.0
2020-01-31 2.2 3.0
2020-02-01 105.1 0.1
2020-05-11 177.0 0.0
testing query
{
"bool": {
"filter": {
"range": {
"@timestamp": {
"from": null,
"include_lower": true,
"include_upper": false,
"to": "2020-01-01"
}
}
}
}
}
date \ profiling false true
2019-09-23 0.1 0.0
2020-01-01 0.1 0.9
2020-01-04 7.8 5.2
2020-01-07 15.1 10.4
2020-01-10 22.6 18.0
2020-01-13 31.7 18.3
2020-01-16 37.7 19.9
2020-01-19 37.0 20.0
2020-01-22 39.0 16.4
2020-01-25 36.2 13.3
2020-01-28 34.2 7.8
2020-01-31 33.2 4.8
2020-02-01 174.9 0.0
2020-05-11 175.7 0.0
And here's a go program for doing this:
package main
import (
"context"
"encoding/json"
"flag"
"fmt"
"log"
"math/rand"
"os"
"time"
"github.com/olivere/elastic/v7"
)
// Fixed parameters of the experiment.
const (
// indexName is the index that is deleted, recreated, filled and searched.
indexName = "testindex"
// timeField is the single date field of each document; it is also the
// field the index is sorted on.
timeField = "@timestamp"
// numDocs is the number of documents generateData indexes.
numDocs = 1e7
// progressUpdate is the interval, in documents, between progress log lines.
progressUpdate = 1e6
)
// Generated timestamps fall in the half-open range [dateMin, dateMax); the
// query test also derives its sampled upper bounds from these two dates.
var (
dateMin = newDate(2020, 1, 1)
dateMax = newDate(2020, 2, 1)
)
// main parses the command-line flags and runs either the data generation
// (-generate) or the query timing test (-query) against the Elasticsearch
// instance at -address. When neither mode is selected it prints the usage
// text and exits with status 2.
func main() {
	generate := flag.Bool("generate", false, "generate test data")
	query := flag.Bool("query", false, "run the query test")
	address := flag.String("address", "http://localhost:9200", "elasticsearch URL")
	flag.Parse()

	// The client is built before the mode is inspected; -generate takes
	// precedence when both modes are given.
	client := newClient(*address)
	switch {
	case *generate:
		generateData(client)
	case *query:
		queryTest(client)
	default:
		flag.Usage()
		os.Exit(2)
	}
}
// esClient bundles an Elasticsearch client with the context passed to every
// request issued through it.
type esClient struct {
	ctx context.Context
	*elastic.Client
}

// newClient builds an esClient for the server at address using
// elastic.NewSimpleClient with the SetURL and SetGzip(true) options. It panics
// if the underlying client cannot be created.
func newClient(address string) esClient {
	es, err := elastic.NewSimpleClient(elastic.SetURL(address), elastic.SetGzip(true))
	if err != nil {
		panic(err)
	}

	var wrapped esClient
	wrapped.ctx = context.Background()
	wrapped.Client = es
	return wrapped
}
// generateData recreates the test index and fills it with numDocs documents
// whose timeField value is drawn at random from [dateMin, dateMax). It panics
// if the index or the bulk processor cannot be created, or if closing the bulk
// processor reports an error.
func generateData(client esClient) {
	// Best effort: the outcome of the delete is deliberately discarded, so a
	// missing index does not stop the run.
	_, _ = client.DeleteIndex(indexName).Do(client.ctx)

	// Index definition: no replicas, sorted on the time field in descending
	// order, with the time field mapped as a date.
	body := map[string]interface{}{
		"settings": map[string]interface{}{
			"index.number_of_replicas": 0,
			"sort.field":               timeField,
			"sort.order":               "desc",
		},
		"mappings": map[string]map[string]map[string]string{
			"properties": {
				timeField: {"type": "date"},
			},
		},
	}
	if _, err := client.CreateIndex(indexName).BodyJson(body).Do(client.ctx); err != nil {
		panic(err)
	}
	log.Println("created index", indexName)

	// NOTE(review): BulkActions(-1) presumably disables flushing by action
	// count; confirm against the library documentation.
	processor, err := client.BulkProcessor().BulkActions(-1).Do(client.ctx)
	if err != nil {
		panic(err)
	}
	defer func() {
		if closeErr := processor.Close(); closeErr != nil {
			panic(closeErr)
		}
	}()

	// Width of the timestamp range in nanoseconds; constant for the whole run.
	spanNanos := dateMax.Sub(dateMin.Time).Nanoseconds()
	for count := 1; count <= numDocs; count++ {
		offset := time.Duration(rand.Int63n(spanNanos))
		processor.Add(elastic.NewBulkIndexRequest().
			Index(indexName).
			Doc(map[string]interface{}{timeField: dateMin.Add(offset)}))
		if count%progressUpdate == 0 {
			log.Printf("%10d docs", count)
		}
	}
}
// queryTest times the range query for a series of upper bounds and prints one
// results table per query form: first the bare range query, then the same
// query wrapped in a bool filter. Each row shows the upper bound followed by
// the mean query time without and with profiling.
//
// The sampled bounds are: 100 days before dateMin, then dateMin advancing in
// whole-day steps while strictly before dateMax, then dateMax itself, and
// finally 100 days after dateMax.
func queryTest(client esClient) {
	// Target number of steps across [dateMin, dateMax).
	const samples = 10

	// Step between sampled bounds in whole days. The division truncates, so a
	// range shorter than `samples` days would yield 0 and the sampling loop
	// below would never advance; clamp to one day to guarantee termination.
	stepDays := int(dateMax.Sub(dateMin.Time) / (samples * 24 * time.Hour))
	if stepDays < 1 {
		stepDays = 1
	}

	for _, asBoolQ := range []bool{false, true} {
		// printLine measures one upper bound and prints its table row.
		printLine := func(endDate date) {
			q := newQuery(endDate, asBoolQ)
			plain := client.meanQueryTime(q, false)
			profiled := client.meanQueryTime(q, true)
			fmt.Printf("%-20s%10.1f%10.1f\n", endDate, plain, profiled)
		}

		fmt.Printf("testing query\n%s\n", queryToString(newQuery(dateMin, asBoolQ)))
		fmt.Printf("%-20s%10s%10s\n", "date \\ profiling", "false", "true")

		printLine(dateMin.AddDays(-100))
		for dt := dateMin; dt.Before(dateMax.Time); dt = dt.AddDays(stepDays) {
			printLine(dt)
		}
		printLine(dateMax)
		printLine(dateMax.AddDays(100))
		fmt.Println()
	}
}
// meanQueryTime runs q ten times, clearing the index caches before each run,
// and returns the average of the reported query times in milliseconds.
// profile is forwarded to each search request.
func (client esClient) meanQueryTime(q elastic.Query, profile bool) float64 {
	const runs = 10

	var totalMillis int
	for run := 0; run < runs; run++ {
		client.clearCache()
		totalMillis += client.queryTook(q, profile)
	}
	return float64(totalMillis) / float64(runs)
}
// clearCache issues a clear-cache request for the test index and panics if
// the request fails.
func (client esClient) clearCache() {
	if _, err := client.ClearCache(indexName).Do(client.ctx); err != nil {
		panic(err)
	}
}
// queryTook runs q against the test index, requesting a single hit and
// setting the profile option as given, and returns the TookInMillis value
// from the response. It panics if the search fails.
func (client esClient) queryTook(q elastic.Query, profile bool) int {
	resp, err := client.Search(indexName).Query(q).Profile(profile).Size(1).Do(client.ctx)
	if err != nil {
		panic(err)
	}
	tookMillis := resp.TookInMillis
	return int(tookMillis)
}
// date is a time.Time that prints as a plain calendar date (YYYY-MM-DD).
type date struct {
	time.Time
}

// newDate returns midnight UTC on the given year, month and day.
func newDate(y, m, d int) date {
	midnight := time.Date(y, time.Month(m), d, 0, 0, 0, 0, time.UTC)
	return date{Time: midnight}
}

// String formats the date as YYYY-MM-DD, dropping the time of day.
func (dt date) String() string {
	const layout = "2006-01-02"
	return dt.Time.Format(layout)
}

// AddDays returns the date shifted by the given number of days; a negative
// count moves backwards.
func (dt date) AddDays(days int) date {
	shifted := dt.Time.AddDate(0, 0, days)
	return date{Time: shifted}
}
// newQuery builds a range query matching documents whose timeField is
// strictly less than endDate. When boolQ is true the range query is wrapped
// as the filter clause of a bool query; otherwise it is returned as is.
func newQuery(endDate date, boolQ bool) elastic.Query {
	rangeQ := elastic.NewRangeQuery(timeField).Lt(endDate.String())
	if !boolQ {
		return rangeQ
	}
	return elastic.NewBoolQuery().Filter(rangeQ)
}
// queryToString renders the source of q as indented JSON. It panics if the
// query source cannot be produced or marshalled.
func queryToString(q elastic.Query) string {
	source, srcErr := q.Source()
	if srcErr != nil {
		panic(srcErr)
	}

	pretty, jsonErr := json.MarshalIndent(source, "", " ")
	if jsonErr != nil {
		panic(jsonErr)
	}
	return string(pretty)
}