1. Function Description
- Use terms to generate multiple buckets.
- Sort the buckets using the aggregated result.
2. My Previous Solution
- At first, I used terms->order to sort the buckets.
- Then, part of the aggregated result had to be computed with bucket_script, and paging also had to be supported. terms->order cannot be used for this, because it cannot sort on a pipeline aggregation such as bucket_script, so I used bucket_sort to implement both sorting and paging.
- But I soon discovered a problem:
The bucket_sort aggregation, like all pipeline aggregations, is executed after all other non-pipeline aggregations. This means the sorting only applies to whatever buckets are already returned from the parent aggregation.
- According to the above description, I need to set terms->size to a sufficiently large number. This puts a lot of pressure on the cluster and, under a large number of concurrent requests, may bring the cluster down.
- It is also impractical to return all the data and do the calculation on the client side. If the data set is large enough, transferring it can take a long time.
3. Please Help
- I would appreciate any suggestions for solving the above problem. Thanks a lot.
4. Code
"pv3": {
  "terms": {
    "field": "scene.cutpid",
    "size": 20,
    "shard_size": 1000
  },
  "aggs": {
    "vi": {
      "reverse_nested": {},
      "aggs": {
        "visits": {
          "filter": {
            "exists": {
              "field": "vid"
            }
          },
          "aggs": {
            "visits_value": {
              "value_count": {
                "field": "vid"
              }
            }
          }
        },
        "new_visits": {
          "filter": {
            "term": {
              "nv": true
            }
          },
          "aggs": {
            "new_visits_value": {
              "value_count": {
                "field": "vid"
              }
            }
          }
        },
        "bounce": {
          "filter": {
            "term": {
              "bounce": true
            }
          },
          "aggs": {
            "bounce_value": {
              "value_count": {
                "field": "vid"
              }
            }
          }
        }
      }
    },
    "url_title_pid_cutpath": {
      "top_hits": {
        "from": 0,
        "size": 1,
        "_source": {
          "includes": [
            "scene.fullpath",
            "scene.title",
            "scene.pid",
            "scene.cutpath"
          ]
        }
      }
    },
    "st": {
      "filter": {
        "range": {
          "scene.staytm": {
            "gte": 0
          }
        }
      },
      "aggs": {
        "stay_time": {
          "sum": {
            "field": "scene.staytm"
          }
        }
      }
    },
    "page_visits": {
      "filter": {
        "exists": {
          "field": "scene.pid"
        }
      },
      "aggs": {
        "page_visits_value": {
          "value_count": {
            "field": "scene.pid"
          }
        }
      }
    },
    "land": {
      "filter": {
        "term": {
          "scene.is_land": true
        }
      },
      "aggs": {
        "land_value": {
          "value_count": {
            "field": "scene.pid"
          }
        }
      }
    },
    "exit": {
      "filter": {
        "term": {
          "scene.is_exit": true
        }
      },
      "aggs": {
        "exit_value": {
          "value_count": {
            "field": "scene.pid"
          }
        }
      }
    },
    "r_bucket_sort": {
      "bucket_sort": {
        "sort": [
          {
            "probability_exit": {
              "order": "desc"
            }
          }
        ],
        "from": 10,
        "size": 10
      }
    },
    "probability_new_visits": {
      "bucket_script": {
        "buckets_path": {
          "visits": "vi>visits>visits_value",
          "new_visits": "vi>new_visits>new_visits_value"
        },
        "script": "(params.visits==0)?0:(params.new_visits/params.visits)"
      }
    },
    "probability_exit": {
      "bucket_script": {
        "buckets_path": {
          "exit": "exit>exit_value",
          "page_visits": "page_visits>page_visits_value"
        },
        "script": "(params.page_visits==0)?0:(params.exit/params.page_visits)"
      }
    },
    "probability_bounce": {
      "bucket_script": {
        "buckets_path": {
          "bounce": "vi>bounce>bounce_value",
          "land": "land>land_value"
        },
        "script": "(params.land==0)?0:(params.bounce/params.land)"
      }
    },
    "avg_staytime": {
      "bucket_script": {
        "buckets_path": {
          "page_visits": "page_visits>page_visits_value",
          "staytime": "st>stay_time"
        },
        "script": "(params.page_visits==0)?0:(params.staytime/(params.page_visits*1000))"
      }
    },
    "cutpid_count": {
      "cardinality": {
        "field": "scene.cutpid",
        "precision_threshold": 10000
      }
    }
  }
}