ML job behavior changed

Hello,
I have an ML job that has been running for a long time, but since a few months ago there has been a change in its behavior (is it a bug?)
Explanations:

Here is what I have

As you can see, it seems that the ML job runs OK and detects some anomalies.

However, when I go to single view, here is the display:

The raw data are not displayed for the last few months. Only anomalies are displayed.
But my main concern is the model bounds, which no longer seem to be dynamic.
Has the behavior of the ML engine changed?

Do you have an explanation? Is it just a display problem, or is there a real issue in my setup? Here is the JSON of the job:

{
  "job_id": "egt21amb_4_aps5000",
  "job_type": "anomaly_detector",
  "job_version": "7.6.1",
  "create_time": 1587137050366,
  "finished_time": 1610094429213,
  "model_snapshot_id": "1638828000",
  "custom_settings": {
    "custom_urls": []
  },
  "groups": [
    "b787",
    "aps5000"
  ],
  "description": "B787 EGT 21 amb",
  "analysis_config": {
    "bucket_span": "23h",
    "detectors": [
      {
        "detector_description": "high_mean(egt21Amb)",
        "function": "high_mean",
        "field_name": "egt21Amb",
        "partition_field_name": "APU_SN_APS5000.keyword",
        "detector_index": 0
      }
    ],
    "influencers": [
      "APU_SN_APS5000.keyword",
      "fwot.keyword",
      "deident.keyword"
    ]
  },
  "analysis_limits": {
    "model_memory_limit": "50mb",
    "categorization_examples_limit": 4
  },
  "data_description": {
    "time_field": "@timestamp",
    "time_format": "epoch_ms"
  },
  "model_plot_config": {
    "enabled": true,
    "annotations_enabled": true
  },
  "model_snapshot_retention_days": 5,
  "results_index_name": "shared",
  "allow_lazy_open": false,
  "data_counts": {
    "job_id": "egt21amb_4_aps5000",
    "processed_record_count": 125139,
    "processed_field_count": 500219,
    "input_bytes": 18361936,
    "input_field_count": 500219,
    "invalid_date_count": 0,
    "missing_field_count": 337,
    "out_of_order_timestamp_count": 0,
    "empty_bucket_count": 6291,
    "sparse_bucket_count": 2,
    "bucket_count": 8113,
    "earliest_record_timestamp": 967246309000,
    "latest_record_timestamp": 1638823412000,
    "last_data_time": 1638871200122,
    "latest_empty_bucket_timestamp": 1597460400000,
    "latest_sparse_bucket_timestamp": 1630497600000,
    "input_record_count": 125139,
    "log_time": 1638871200122,
    "latest_bucket_timestamp": 1638777600000
  },
  "model_size_stats": {
    "job_id": "egt21amb_4_aps5000",
    "result_type": "model_size_stats",
    "model_bytes": 7740730,
    "peak_model_bytes": 7803798,
    "model_bytes_exceeded": 0,
    "model_bytes_memory_limit": 52428800,
    "total_by_field_count": 205,
    "total_over_field_count": 0,
    "total_partition_field_count": 204,
    "bucket_allocation_failures_count": 0,
    "memory_status": "ok",
    "assignment_memory_basis": "current_model_bytes",
    "categorized_doc_count": 0,
    "total_category_count": 0,
    "frequent_category_count": 0,
    "rare_category_count": 0,
    "dead_category_count": 0,
    "failed_category_count": 0,
    "categorization_status": "ok",
    "log_time": 1638838800899,
    "timestamp": 1638777600000
  },
  "forecasts_stats": {
    "total": 0,
    "forecasted_jobs": 0
  },
  "state": "opened",
  "node": {
    "id": "As76KsQqTNyBf4paYkfD1A",
    "name": "instance-0000000089",
    "ephemeral_id": "amzwEQ3RRw6RXS8t4F-MaQ",
    "transport_address": "172.27.137.171:19637",
    "attributes": {
      "logical_availability_zone": "zone-0",
      "server_name": "instance-0000000089.44ddb9be1e2b2933c4dd3c012baa1dc4",
      "availability_zone": "eu-west-1c",
      "ml.machine_memory": "2147483648",
      "xpack.installed": "true",
      "instance_configuration": "aws.ml.m5",
      "transform.node": "false",
      "ml.max_open_jobs": "512",
      "ml.max_jvm_size": "536870912",
      "region": "eu-west-1"
    }
  },
  "assignment_explanation": "",
  "open_time": "8215868s",
  "timing_stats": {
    "job_id": "egt21amb_4_aps5000",
    "bucket_count": 12403,
    "total_bucket_processing_time_ms": 30639.999999999996,
    "minimum_bucket_processing_time_ms": 0,
    "maximum_bucket_processing_time_ms": 88,
    "average_bucket_processing_time_ms": 2.470370071756833,
    "exponential_average_bucket_processing_time_ms": 5.694205394078761,
    "exponential_average_bucket_processing_time_per_hour_ms": 0
  },
  "datafeed_config": {
    "datafeed_id": "datafeed-egt21amb_4_aps5000",
    "job_id": "egt21amb_4_aps5000",
    "query_delay": "13h",
    "chunking_config": {
      "mode": "auto"
    },
    "indices_options": {
      "expand_wildcards": [
        "open"
      ],
      "ignore_unavailable": false,
      "allow_no_indices": true,
      "ignore_throttled": true
    },
    "query": {
      "bool": {
        "must": [
          {
            "exists": {
              "field": "egt21Amb",
              "boost": 1
            }
          },
          {
            "exists": {
              "field": "APU_SN_APS5000",
              "boost": 1
            }
          }
        ],
        "adjust_pure_negative": true,
        "boost": 1
      }
    },
    "frequency": "3h",
    "indices": [
      "wilco__revima*__b787__*"
    ],
    "scroll_size": 1000,
    "delayed_data_check_config": {
      "enabled": true
    },
    "state": "started",
    "node": {
      "id": "As76KsQqTNyBf4paYkfD1A",
      "name": "instance-0000000089",
      "ephemeral_id": "amzwEQ3RRw6RXS8t4F-MaQ",
      "transport_address": "172.27.137.171:19637",
      "attributes": {
        "ml.machine_memory": "2147483648",
        "ml.max_open_jobs": "512",
        "ml.max_jvm_size": "536870912"
      }
    },
    "assignment_explanation": "",
    "timing_stats": {
      "job_id": "egt21amb_4_aps5000",
      "search_count": 13966,
      "bucket_count": 8113,
      "total_search_time_ms": 2678682,
      "average_search_time_per_bucket_ms": 330.1715764821891,
      "exponential_average_search_time_per_hour_ms": 147.8951139393392
    },
    "running_state": {
      "real_time_configured": true,
      "real_time_running": true
    }
  },
  "alerting_rules": [
    {
      "id": "5c70ec80-c2ac-11eb-996e-a7d1a9bee3db",
      "notifyWhen": "onActiveAlert",
      "params": {
        "severity": 24,
        "resultType": "record",
        "includeInterim": false,
        "jobSelection": {
          "jobIds": [
            "egt21amb_4_aps5000"
          ],
          "groupIds": []
        },
        "lookbackInterval": null,
        "topNBuckets": null
      },
      "consumer": "alerts",
      "alertTypeId": "xpack.ml.anomaly_detection_alert",
      "schedule": {
        "interval": "1h"
      },
      "actions": [
        {
          "group": "anomaly_score_match",
          "params": {
            "message": """Elastic Stack Machine Learning Alert:
- Job IDs: {{context.jobIds}}
- Time: {{context.timestampIso8601}}
- Anomaly score: {{context.score}}

{{context.message}}

{{#context.topInfluencers.length}}
  Top influencers:
  {{#context.topInfluencers}}
    {{influencer_field_name}} = {{influencer_field_value}} [{{score}}]
  {{/context.topInfluencers}}
{{/context.topInfluencers.length}}

{{#context.topRecords.length}}
  Top records:
  {{#context.topRecords}}
    {{function}}({{field_name}}) {{by_field_value}} {{over_field_value}} {{partition_field_value}} [{{score}}]
  {{/context.topRecords}}
{{/context.topRecords.length}}

{{! Replace kibanaBaseUrl if not configured in Kibana }}
[Open in Anomaly Explorer]({{{kibanaBaseUrl}}}{{{context.anomalyExplorerUrl}}})
""",
            "subject": "{{alert.actionGroupName}}",
            "to": [
              "monmail@mail.com"
            ]
          },
          "actionTypeId": ".email",
          "id": "07f834ad-8fb8-4b53-9e09-dbef6d041662"
        }
      ],
      "tags": [],
      "name": "APS5000_EGT21Amb",
      "enabled": true,
      "throttle": null,
      "apiKeyOwner": "monmail@mail.com",
      "createdBy": "monmail@mail.com",
      "updatedBy": "monmail@mail.com",
      "muteAll": false,
      "mutedInstanceIds": [],
      "updatedAt": "2021-09-16T06:14:42.206Z",
      "createdAt": "2021-06-01T07:38:31.555Z",
      "scheduledTaskId": "5d9e21e0-c2ac-11eb-996e-a7d1a9bee3db",
      "executionStatus": {
        "lastExecutionDate": "2021-09-03T08:51:51.499Z",
        "status": "active"
      }
    }
  ]
}

The difference is that the anomaly explorer is using the original source data to plot its chart whereas the single metric viewer is using model_plot results.

So it seems that there are large gaps in the model_plot results that exist in the .ml-anomalies-shared index for this job.

I think the next troubleshooting step should be to look at how many model_plot documents exist for this job over time. You can do this in Discover. If you don't already have an index pattern for .ml-anomalies-shared then create one in the Kibana management settings. Then, in Discover, search .ml-anomalies-shared for job_id:egt21amb_4_aps5000 and result_type:model_plot. The chart in Discover that shows count of documents by time should reveal if there are periods when lots of model plot documents don't exist.

We should also rule out simple explanations. I know the job config you posted shows "model_plot_config": { "enabled": true, but has it definitely been like that all year? Is it possible that you updated the value of that setting between true and false and back again?

Hello, thanks for your help.

First: the model_plot_config has always been set to true. No back and forth.

Second, here is the Discover view you mentioned. It seems that the rate of model_plot documents did not change.

This is for all the partition fields. I also filtered on the partition_field_value I am concerned about, and the rate appears to remain stable.

The mystery remains ...

I add a piece of evidence: with the same discover filters, I plot the model_lower/median/upper.
It shows that the single view maps exactly the .ml-anomalies-shared documents. It seems that the new data do not feed anymore the model

If I look on the model_upper graph for all the partition_field_values, I get the following. It shows that the model is not fed for almost all of the partitions. except the new ones???

It shows that the model is not fed for almost all of the partitions. except the new ones???

Thanks for posting that screenshot.

I think the next step is to determine whether data literally doesn't get fed to the model, or whether it does get fed but the model doesn't update for some reason.

In your original post the job was showing these stats:

  "data_counts": {
    "job_id": "egt21amb_4_aps5000",
    "processed_record_count": 125139,
    "processed_field_count": 500219,
    "input_bytes": 18361936,
    "input_field_count": 500219,

What are the equivalent numbers now? Have those particular stats increased? If they have then it suggests that data is being fed to the model.

You also have:

    "delayed_data_check_config": {
      "enabled": true
    },

That should mean that if the datafeed feeds data and then later new data appears for the period that has already been fed then you get an annotation warning you that your query_delay is too low. Do you see any annotations warning you about delayed data if you look at the job's annotations?

If this evidence shows that data is getting through to the model but the model is ignoring it then I think the most likely reason is a bug with 23 hour bucket spans. Is there a good reason why you chose a 23 hour bucket span rather than a 24 hour bucket span? Our code that detects periodicity is tuned to detect it better when there are a whole number of bucket spans in 1 day and 1 week. It's more likely that you will be affected by a strange bug that we didn't catch in testing if your bucket span doesn't divide evenly into 1 day and 1 week.

The counts seem to have increased:

    "data_counts": {
        "job_id": "egt21amb_4_aps5000",
        "processed_record_count": 125374,
        "processed_field_count": 501159,
        "input_bytes": 18396454,
        "input_field_count": 501159,
        "invalid_date_count": 0,
        "missing_field_count": 337,
        "out_of_order_timestamp_count": 0,
        "empty_bucket_count": 6291,
        "sparse_bucket_count": 2,
        "bucket_count": 8115,
        "earliest_record_timestamp": 967246309000,
        "latest_record_timestamp": 1639006525000,
        "last_data_time": 1639054800535,
        "latest_empty_bucket_timestamp": 1597460400000,
        "latest_sparse_bucket_timestamp": 1630497600000,
        "input_record_count": 125374,
        "log_time": 1639054800535,
        "latest_bucket_timestamp": 1638943200000
    },

Yes, I have some occurrences. Not a lot, but some. Do you consider that significant?

No, there is no particular reason; the choice was empirical. I can try 1d instead (I will clone the ML job and change only this parameter).

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.