How can APM anomaly detection be fine-tuned or adjusted to effectively address periodic fluctuations in service metrics?

arT1 · April 1, 2025, 2:09am

Elastic Stack v8.13.3
I use the Elastic APM monitoring service with APM Machine Learning (ML) enabled. My service metrics exhibit a regular pattern: the number of visits is higher on weekdays and significantly lower on weekends. Consequently, every Monday, when the number of service visits increases, anomaly detection assigns a high anomaly score, even though this increase is actually normal behavior for that time period. Could you provide guidance on how to debug or adjust the APM anomaly detection settings?

GET _ml/anomaly_detectors/apm-prod-f59e-apm_tx_metrics?pretty
{
  "count": 1,
  "jobs": [
    {
      "job_id": "apm-prod-f59e-apm_tx_metrics",
      "job_type": "anomaly_detector",
      "job_version": "12.0.0",
      "create_time": 1742416278745,
      "model_snapshot_id": "1743470520",
      "custom_settings": {
        "managed": true,
        "job_tags": {
          "environment": "prod",
          "apm_ml_version": 3
        },
        "custom_urls": []
      },
      "datafeed_config": {
        "datafeed_id": "datafeed-apm-prod-f59e-apm_tx_metrics",
        "job_id": "apm-prod-f59e-apm_tx_metrics",
        "authorization": {
          "roles": [
            "superuser"
          ]
        },
        "query_delay": "120s",
        "chunking_config": {
          "mode": "off"
        },
        "indices_options": {
          "ignore_unavailable": true,
          "expand_wildcards": [
            "open"
          ],
          "allow_no_indices": true,
          "ignore_throttled": true
        },
        "query": {
          "bool": {
            "filter": [
              {
                "term": {
                  "processor.event": "metric"
                }
              },
              {
                "term": {
                  "metricset.name": "transaction"
                }
              },
              {
                "term": {
                  "service.environment": "prod"
                }
              }
            ]
          }
        },
        "indices": [
          "metrics-apm*",
          "apm-*"
        ],
        "aggregations": {
          "buckets": {
            "composite": {
              "size": 5000,
              "sources": [
                {
                  "date": {
                    "date_histogram": {
                      "field": "@timestamp",
                      "fixed_interval": "60s"
                    }
                  }
                },
                {
                  "transaction.type": {
                    "terms": {
                      "field": "transaction.type"
                    }
                  }
                },
                {
                  "service.name": {
                    "terms": {
                      "field": "service.name"
                    }
                  }
                }
              ]
            },
            "aggs": {
              "@timestamp": {
                "max": {
                  "field": "@timestamp"
                }
              },
              "transaction_throughput": {
                "rate": {
                  "unit": "minute"
                }
              },
              "transaction_latency": {
                "avg": {
                  "field": "transaction.duration.histogram"
                }
              },
              "error_count": {
                "filter": {
                  "term": {
                    "event.outcome": "failure"
                  }
                },
                "aggs": {
                  "actual_error_count": {
                    "value_count": {
                      "field": "event.outcome"
                    }
                  }
                }
              },
              "success_count": {
                "filter": {
                  "term": {
                    "event.outcome": "success"
                  }
                }
              },
              "failed_transaction_rate": {
                "bucket_script": {
                  "buckets_path": {
                    "failure_count": "error_count>_count",
                    "success_count": "success_count>_count"
                  },
                  "script": "if ((params.failure_count + params.success_count)==0){return 0;}else{return 100 * (params.failure_count/(params.failure_count + params.success_count));}"
                }
              }
            }
          }
        },
        "scroll_size": 5000,
        "delayed_data_check_config": {
          "enabled": true
        }
      },
      "groups": [
        "apm"
      ],
      "description": "Detects anomalies in transaction latency, throughput and error percentage for metric data.",
      "analysis_config": {
        "bucket_span": "15m",
        "summary_count_field_name": "doc_count",
        "detectors": [
          {
            "detector_description": "high latency by transaction type for an APM service",
            "function": "high_mean",
            "field_name": "transaction_latency",
            "by_field_name": "transaction.type",
            "partition_field_name": "service.name",
            "detector_index": 0
          },
          {
            "detector_description": "transaction throughput for an APM service",
            "function": "mean",
            "field_name": "transaction_throughput",
            "by_field_name": "transaction.type",
            "partition_field_name": "service.name",
            "detector_index": 1
          },
          {
            "detector_description": "failed transaction rate for an APM service",
            "function": "high_mean",
            "field_name": "failed_transaction_rate",
            "by_field_name": "transaction.type",
            "partition_field_name": "service.name",
            "detector_index": 2
          }
        ],
        "influencers": [
          "transaction.type",
          "service.name"
        ],
        "model_prune_window": "30d"
      },
      "analysis_limits": {
        "model_memory_limit": "512mb",
        "categorization_examples_limit": 4
      },
      "data_description": {
        "time_field": "@timestamp",
        "time_format": "epoch_ms"
      },
      "model_plot_config": {
        "enabled": true,
        "annotations_enabled": true
      },
      "model_snapshot_retention_days": 10,
      "daily_model_snapshot_retention_after_days": 1,
      "results_index_name": "custom-apm",
      "allow_lazy_open": false
    }
  ]
}

valeriy42 · April 1, 2025, 7:37am

Hello @arT1 ,

The anomaly detection job will automatically recognize weekday/weekend patterns after seeing enough examples. It usually picks up this pattern after 30 days of observation.

arT1 · April 1, 2025, 8:49am

Hello @valeriy42,
Thank you for your reply!
The APM anomaly detection job has only been running for 10 days, so I'll keep watching.

Topic		Replies	Views
Anomaly Detection Assistance APM elastic-stack-machine-learning , open-telemetry	7	606	March 15, 2023
Error Machine Learning in APM with Basic License APM docker , server , ui	11	1234	December 1, 2021
APM dropping metrics APM python	5	628	June 24, 2019
Unknown health status APM elastic-stack-machine-learning , ui	4	1630	April 5, 2021
How do I modify queue.mem.events property when the APM server is managed through Fleet integration? APM fleet , server	6	2347	February 23, 2022

How can APM anomaly detection be fine-tuned or adjusted to effectively address periodic fluctuations in service metrics?

Related topics