Elastic Stack v8.13.3
I use the Elastic APM monitoring service with APM Machine Learning (ML) anomaly detection enabled. My service metrics follow a regular weekly pattern: the number of visits is higher on weekdays and significantly lower on weekends. Consequently, every Monday, when the number of service visits rises again, anomaly detection assigns a high anomaly score, even though this increase is normal behavior for that time period. Could you provide guidance on how to debug or adjust the APM anomaly detection settings so that this weekly pattern is not flagged as anomalous? The current job configuration, as returned by the request below, follows.
GET _ml/anomaly_detectors/apm-prod-f59e-apm_tx_metrics?pretty
{
"count": 1,
"jobs": [
{
"job_id": "apm-prod-f59e-apm_tx_metrics",
"job_type": "anomaly_detector",
"job_version": "12.0.0",
"create_time": 1742416278745,
"model_snapshot_id": "1743470520",
"custom_settings": {
"managed": true,
"job_tags": {
"environment": "prod",
"apm_ml_version": 3
},
"custom_urls": []
},
"datafeed_config": {
"datafeed_id": "datafeed-apm-prod-f59e-apm_tx_metrics",
"job_id": "apm-prod-f59e-apm_tx_metrics",
"authorization": {
"roles": [
"superuser"
]
},
"query_delay": "120s",
"chunking_config": {
"mode": "off"
},
"indices_options": {
"ignore_unavailable": true,
"expand_wildcards": [
"open"
],
"allow_no_indices": true,
"ignore_throttled": true
},
"query": {
"bool": {
"filter": [
{
"term": {
"processor.event": "metric"
}
},
{
"term": {
"metricset.name": "transaction"
}
},
{
"term": {
"service.environment": "prod"
}
}
]
}
},
"indices": [
"metrics-apm*",
"apm-*"
],
"aggregations": {
"buckets": {
"composite": {
"size": 5000,
"sources": [
{
"date": {
"date_histogram": {
"field": "@timestamp",
"fixed_interval": "60s"
}
}
},
{
"transaction.type": {
"terms": {
"field": "transaction.type"
}
}
},
{
"service.name": {
"terms": {
"field": "service.name"
}
}
}
]
},
"aggs": {
"@timestamp": {
"max": {
"field": "@timestamp"
}
},
"transaction_throughput": {
"rate": {
"unit": "minute"
}
},
"transaction_latency": {
"avg": {
"field": "transaction.duration.histogram"
}
},
"error_count": {
"filter": {
"term": {
"event.outcome": "failure"
}
},
"aggs": {
"actual_error_count": {
"value_count": {
"field": "event.outcome"
}
}
}
},
"success_count": {
"filter": {
"term": {
"event.outcome": "success"
}
}
},
"failed_transaction_rate": {
"bucket_script": {
"buckets_path": {
"failure_count": "error_count>_count",
"success_count": "success_count>_count"
},
"script": "if ((params.failure_count + params.success_count)==0){return 0;}else{return 100 * (params.failure_count/(params.failure_count + params.success_count));}"
}
}
}
}
},
"scroll_size": 5000,
"delayed_data_check_config": {
"enabled": true
}
},
"groups": [
"apm"
],
"description": "Detects anomalies in transaction latency, throughput and error percentage for metric data.",
"analysis_config": {
"bucket_span": "15m",
"summary_count_field_name": "doc_count",
"detectors": [
{
"detector_description": "high latency by transaction type for an APM service",
"function": "high_mean",
"field_name": "transaction_latency",
"by_field_name": "transaction.type",
"partition_field_name": "service.name",
"detector_index": 0
},
{
"detector_description": "transaction throughput for an APM service",
"function": "mean",
"field_name": "transaction_throughput",
"by_field_name": "transaction.type",
"partition_field_name": "service.name",
"detector_index": 1
},
{
"detector_description": "failed transaction rate for an APM service",
"function": "high_mean",
"field_name": "failed_transaction_rate",
"by_field_name": "transaction.type",
"partition_field_name": "service.name",
"detector_index": 2
}
],
"influencers": [
"transaction.type",
"service.name"
],
"model_prune_window": "30d"
},
"analysis_limits": {
"model_memory_limit": "512mb",
"categorization_examples_limit": 4
},
"data_description": {
"time_field": "@timestamp",
"time_format": "epoch_ms"
},
"model_plot_config": {
"enabled": true,
"annotations_enabled": true
},
"model_snapshot_retention_days": 10,
"daily_model_snapshot_retention_after_days": 1,
"results_index_name": "custom-apm",
"allow_lazy_open": false
}
]
}