First-time Elasticsearch query latency is too high on an index with 400+ million documents. The problem basically is that "first-time query operations are pretty slow".
However, we want to understand the ROOT CAUSE and have been unable to identify it.
My setup consists of:
2 nodes with 8 cores and 16 GB RAM each and 5 GB allocated to Elasticsearch.
500 GB hard disk
Document size: 9 KB
Top Command screenshots
I'm using this kind of mapping
{
"event": {
"mappings": {
"doc": {
"properties": {
"@timestamp": {
"type": "date"
},
"@version": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"event": {
"properties": {
"contextProperties": {
"properties": {
"carId": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"eventName": {
"type": "keyword",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"projectId": {
"type": "keyword",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"time": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"ts": {
"type": "long"
},
"userId": {
"type": "keyword",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"value": {
"type": "long"
}
}
},
"id": {
"type": "keyword",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"timestamp": {
"type": "date"
},
"type": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
}
}
And the type of query I usually make:
{
"aggs": {
"catView": {
"date_histogram": {
"field": "timestamp",
"interval": "1d",
"time_zone": "Asia/Kolkata",
"min_doc_count": 1
},
"aggs": {
"sub_Data": {
"terms": {
"field": "event.contextProperties.carId.keyword",
"size": 13
},
"aggs": {
"mou_Count": {
"sum": {
"field": "event.value"
}
}
}
}
}
}
},
"size": 0,
"_source": {
"excludes": []
},
"stored_fields": [
"*"
],
"script_fields": {},
"docvalue_fields": [
{
"field": "timestamp",
"format": "date_time"
}
],
"query": {
"bool": {
"must": [
{
"bool": {
"minimum_should_match": 1,
"should": [
{
"match_phrase": {
"event.contextProperties.carId.keyword": "12345"
}
},
{
"match_phrase": {
"event.contextProperties.carId.keyword": "NA"
}
}
]
}
},
{
"match_phrase": {
"event.projectId.keyword": {
"query": "123456"
}
}
},
{
"range": {
"timestamp": {
"gte": 1569349800000,
"lte": 1569868200000,
"format": "epoch_millis"
}
}
},
{
"bool": {
"minimum_should_match": 1,
"should": [
{
"match_phrase": {
"event.contextProperties.carId.keyword": "45678"
}
},
{
"match_phrase": {
"event.contextProperties.carId.keyword": "NA"
}
}
]
}
},
{
"match_phrase": {
"event.projectId.keyword": {
"query": "8765"
}
}
}
],
"filter": [
{
"match_all": {}
},
{
"match_all": {}
}
],
"should": []
}
},
"timeout": "30000ms"
}
This query has to search on the fields carId, projectId, eventName and timestamp, while aggregating on event.value.
Any recommendation for speeding up or caching the first query is welcome. Thanks in advance.
I’m using Elasticsearch 6.8.3.