Hello,
I am struggling with a transform that performed well when it was originally deployed, but after a seemingly minor change it has become slow and can no longer keep up. The transform queries an index for the past 3 hours, buckets the records into 15-minute chunks (per mmsi), takes the latest record from each chunk, and writes the results to an index. Here is the JSON:
{
  "description": "This transform will run continuously to generate our 2 year aggs for data in 15 minute intervals",
  "source": {
    "index": "agg-*",
    "query": {
      "range": {
        "event_ts": {
          "gte": "now-3h"
        }
      }
    }
  },
  "dest": {
    "index": "pol"
  },
  "sync": {
    "time": {
      "field": "event_ts",
      "delay": "5m"
    }
  },
  "frequency": "15m",
  "pivot": {
    "group_by": {
      "event_ts": {
        "date_histogram": {
          "field": "event_ts",
          "fixed_interval": "15m"
        }
      },
      "mmsi": { "terms": { "field": "mmsi" } }
    },
    "aggregations": {
      "last": {
        "scripted_metric": {
          "init_script": "state.latest_timestamp = 0L; state.last_doc = ''",
          "map_script": "def current_timestamp = doc['event_ts'].getValue().toInstant().toEpochMilli(); if (current_timestamp > state.latest_timestamp) { state.latest_timestamp = current_timestamp; state.last_doc = new HashMap(params['_source']); }",
          "combine_script": "return state",
          "reduce_script": "def last_doc = ''; def latest_timestamp = 0L; for (s in states) { if (s.latest_timestamp > latest_timestamp) { latest_timestamp = s.latest_timestamp; last_doc = s.last_doc; } } return last_doc"
        }
      }
    }
  }
}
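For completeness, the transform was created and started in the usual way (sketched from memory, with the config above as the request body):

PUT _transform/pol
{ ... config above ... }

POST _transform/pol/_start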
Typical record count for the 3-hour window is around 4-5 million documents, with around 150k unique mmsi values. The transform ran as expected for a while; then we changed "index.refresh_interval" to "10s", and the transform started to fall behind. I don't see how that change could have affected it, though.
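For reference, the settings change looked roughly like this (shown against the agg-* source pattern for illustration; I am going from memory on the exact call):

PUT agg-*/_settings
{
  "index": {
    "refresh_interval": "10s"
  }
}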
Here are the stats for the transform (from GET _transform/pol/_stats):
{
  "count" : 1,
  "transforms" : [
    {
      "id" : "pol",
      "state" : "started",
      "node" : {
        "id" : "_nK2Y3EzRaeciO9cskXF7A",
        "name" : "instance-0000000013",
        "ephemeral_id" : "xxxxxxx",
        "transport_address" : "x.x.x.x:19775",
        "attributes" : { }
      },
      "stats" : {
        "pages_processed" : 1496661,
        "documents_processed" : 1237047649,
        "documents_indexed" : 262989809,
        "trigger_count" : 2582,
        "index_time_in_ms" : 46010613,
        "index_total" : 743598,
        "index_failures" : 0,
        "search_time_in_ms" : 143844792,
        "search_total" : 1496661,
        "search_failures" : 0,
        "processing_time_in_ms" : 4575100,
        "processing_total" : 1496661,
        "exponential_avg_checkpoint_duration_ms" : 24918.603757424262,
        "exponential_avg_documents_indexed" : 32201.281076275238,
        "exponential_avg_documents_processed" : 214533.2860309495
      },
      "checkpointing" : {
        "last" : {
          "checkpoint" : 2207,
          "timestamp_millis" : 1610653373630,
          "time_upper_bound_millis" : 1610653073630
        },
        "operations_behind" : 772679,
        "changes_last_detected_at" : 1610653373626
      }
    }
  ]
}
I am running v7.10.1 with two hot nodes (58 GB RAM each) and two warm nodes (15 GB RAM each).
I am trying to work out whether this transform is simply doing too much, or whether this is a bad use case for transforms. Any advice would be helpful. I have already increased max_page_search_size to 10000.
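For reference, I applied that setting roughly like this (assuming the settings-based form of the transform update API on 7.10):

POST _transform/pol/_update
{
  "settings": {
    "max_page_search_size": 10000
  }
}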
Cheers,