Hi elasticsearch,
We found that some search tasks have been running for a very long time without completing.
For example, the task below has been running for about 17 hours (see `running_time_in_nanos`):
{
"completed" : false,
"task" : {
"node" : "cpzJKvmdQM-lIj2d5wuSXA",
"id" : 53164505,
"type" : "transport",
"action" : "indices:data/read/search",
"description" : """The query is deleted here for desensitization, and the complete query is posted later""",
"start_time_in_millis" : 1686747521976,
"running_time_in_nanos" : 61862506791926,
"cancellable" : true,
"headers" : { }
}
}
Because these tasks do not complete for a long time, the CPU usage of the node stays very high, close to 100%:
100.2% (500.7ms out of 500ms) cpu usage by thread 'elasticsearch[esdssslnges-datarno-rno-8][search][T#2]'
9/10 snapshots sharing following 45 elements
app//org.elasticsearch.search.aggregations.bucket.terms.LongKeyedBucketOrds$FromMany$1.next(LongKeyedBucketOrds.java:264)
app//org.elasticsearch.search.aggregations.bucket.terms.GlobalOrdinalsStringTermsAggregator$RemapGlobalOrds.forEach(GlobalOrdinalsStringTermsAggregator.java:535)
app//org.elasticsearch.search.aggregations.bucket.terms.GlobalOrdinalsStringTermsAggregator$ResultStrategy.buildAggregations(GlobalOrdinalsStringTermsAggregator.java:580)
app//org.elasticsearch.search.aggregations.bucket.terms.GlobalOrdinalsStringTermsAggregator$ResultStrategy.access$200(GlobalOrdinalsStringTermsAggregator.java:553)
app//org.elasticsearch.search.aggregations.bucket.terms.GlobalOrdinalsStringTermsAggregator.buildAggregations(GlobalOrdinalsStringTermsAggregator.java:197)
app//org.elasticsearch.search.aggregations.bucket.BucketsAggregator.buildSubAggsForBuckets(BucketsAggregator.java:192)
app//org.elasticsearch.search.aggregations.bucket.BucketsAggregator.buildSubAggsForAllBuckets(BucketsAggregator.java:255)
app//org.elasticsearch.search.aggregations.bucket.terms.GlobalOrdinalsStringTermsAggregator.access$700(GlobalOrdinalsStringTermsAggregator.java:67)
app//org.elasticsearch.search.aggregations.bucket.terms.GlobalOrdinalsStringTermsAggregator$StandardTermsResults.buildSubAggs(GlobalOrdinalsStringTermsAggregator.java:738)
app//org.elasticsearch.search.aggregations.bucket.terms.GlobalOrdinalsStringTermsAggregator$StandardTermsResults.buildSubAggs(GlobalOrdinalsStringTermsAggregator.java:688)
app//org.elasticsearch.search.aggregations.bucket.terms.GlobalOrdinalsStringTermsAggregator$ResultStrategy.buildAggregations(GlobalOrdinalsStringTermsAggregator.java:604)
app//org.elasticsearch.search.aggregations.bucket.terms.GlobalOrdinalsStringTermsAggregator$ResultStrategy.access$200(GlobalOrdinalsStringTermsAggregator.java:553)
app//org.elasticsearch.search.aggregations.bucket.terms.GlobalOrdinalsStringTermsAggregator.buildAggregations(GlobalOrdinalsStringTermsAggregator.java:197)
app//org.elasticsearch.search.aggregations.bucket.BucketsAggregator.buildSubAggsForBuckets(BucketsAggregator.java:192)
app//org.elasticsearch.search.aggregations.bucket.BucketsAggregator.buildAggregationsForVariableBuckets(BucketsAggregator.java:349)
app//org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramAggregator.buildAggregations(DateHistogramAggregator.java:150)
app//org.elasticsearch.search.aggregations.Aggregator.buildTopLevel(Aggregator.java:160)
app//org.elasticsearch.search.aggregations.AggregationPhase.execute(AggregationPhase.java:123)
app//org.elasticsearch.search.query.QueryPhase.execute(QueryPhase.java:154)
app//org.elasticsearch.indices.IndicesService.lambda$loadIntoContext$21(IndicesService.java:1391)
app//org.elasticsearch.indices.IndicesService$$Lambda$5925/0x0000000801a36f08.accept(Unknown Source)
app//org.elasticsearch.indices.IndicesService.lambda$cacheShardLevelResult$22(IndicesService.java:1443)
app//org.elasticsearch.indices.IndicesService$$Lambda$5926/0x0000000801a37668.get(Unknown Source)
app//org.elasticsearch.indices.IndicesRequestCache$Loader.load(IndicesRequestCache.java:165)
app//org.elasticsearch.indices.IndicesRequestCache$Loader.load(IndicesRequestCache.java:148)
app//org.elasticsearch.common.cache.Cache.computeIfAbsent(Cache.java:433)
app//org.elasticsearch.indices.IndicesRequestCache.getOrCompute(IndicesRequestCache.java:120)
app//org.elasticsearch.indices.IndicesService.cacheShardLevelResult(IndicesService.java:1449)
app//org.elasticsearch.indices.IndicesService.loadIntoContext(IndicesService.java:1389)
app//org.elasticsearch.search.SearchService.loadOrExecuteQueryPhase(SearchService.java:370)
app//org.elasticsearch.search.SearchService.executeQueryPhase(SearchService.java:431)
app//org.elasticsearch.search.SearchService.access$500(SearchService.java:141)
app//org.elasticsearch.search.SearchService$2.lambda$onResponse$0(SearchService.java:401)
app//org.elasticsearch.search.SearchService$2$$Lambda$5912/0x0000000801a31f20.get(Unknown Source)
app//org.elasticsearch.search.SearchService$$Lambda$5913/0x0000000801a32148.get(Unknown Source)
app//org.elasticsearch.action.ActionRunnable.lambda$supply$0(ActionRunnable.java:58)
app//org.elasticsearch.action.ActionRunnable$$Lambda$5433/0x00000008019b9060.accept(Unknown Source)
app//org.elasticsearch.action.ActionRunnable$2.doRun(ActionRunnable.java:73)
app//org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37)
app//org.elasticsearch.common.util.concurrent.TimedRunnable.doRun(TimedRunnable.java:44)
app//org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:743)
app//org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37)
The above is the hot threads output from the node.
Our query was as follows:
{
"size": 0,
"query": {
"match_all": {
"boost": 1.0
}
},
"aggregations": {
"time_buckets": {
"date_histogram": {
"field": "field2",
"format": "yyyy-MM-dd H:M:S",
"fixed_interval": "1h",
"offset": 0,
"order": {
"_key": "asc"
},
"keyed": false,
"min_doc_count": 0
},
"aggregations": {
"session_count": {
"value_count": {
"field": "field4"
}
},
"field4_group": {
"terms": {
"field": "field4",
"size": 100000,
"min_doc_count": 1,
"shard_min_doc_count": 0,
"show_term_doc_count_error": false,
"order": [
{
"_count": "desc"
},
{
"_key": "asc"
}
]
},
"aggregations": {
"field7_group": {
"terms": {
"field": "field7",
"size": 100000,
"min_doc_count": 2,
"shard_min_doc_count": 0,
"show_term_doc_count_error": false,
"order": [
{
"_count": "desc"
},
{
"_key": "asc"
}
]
}
},
"sumOfRefield7Data": {
"sum_bucket": {
"buckets_path": [
"field7_group._count"
],
"gap_policy": "skip"
}
},
"distinct_field4_refield7": {
"bucket_script": {
"buckets_path": {
"refield7Count": "sumOfRefield7Data"
},
"script": {
"source": "if(params.refield7Count != 0) {return 1;} else { return 0;}",
"lang": "painless"
},
"gap_policy": "skip"
}
}
}
},
"sumOfDistinctRefield7field4": {
"sum_bucket": {
"buckets_path": [
"field4_group>distinct_field4_refield7"
],
"gap_policy": "skip"
}
},
"Share_of_Refield7": {
"bucket_script": {
"buckets_path": {
"Refield7field4Count": "sumOfDistinctRefield7field4",
"sessionCount": "session_count"
},
"script": {
"source": "if(params.sessionCount != 0) {return params.Refield7field4Count / params.sessionCount;} else { return 0;}",
"lang": "painless"
},
"gap_policy": "skip"
}
}
}
}
}
}
Output of `_cat/indices` for the index:
health status index uuid pri rep docs.count docs.deleted store.size pri.store.size
green open index_name index_uuid 16 1 89681789 0 51.9gb 26gb
Index Mappings
{
"date_detection" : true,
"numeric_detection" : true,
"properties" : {
"field1" : {
"type" : "text"
},
"field2" : {
"type" : "date"
},
"field3" : {
"type" : "keyword"
},
"field4" : {
"type" : "keyword"
},
"field5" : {
"type" : "keyword"
},
"field6" : {
"type" : "integer"
},
"field7" : {
"type" : "keyword"
},
"field8" : {
"type" : "keyword"
},
"field9" : {
"type" : "keyword"
},
"field10" : {
"type" : "date"
},
"field11" : {
"type" : "integer",
"ignore_malformed" : false,
"coerce" : true
},
"field12" : {
"type" : "keyword"
}
}
}
ES Version: 7.10.2
There is no other heavy search load or traffic in the cluster, yet this single query keeps the CPU busy on several nodes. So we are confused about the following:
- Why can the task keep running for such a long time? Are there any invalid index settings/mappings, or is the query badly designed?
- Are there any protective cluster settings that prevent tasks from running for such a long time?
Thanks,
Qq