I have been using the new machine learning feature in 5.4 with great success on Apache access logs and system metrics, and I wanted to try it out on my Tomcat logs. I have about 50 Tomcat applications, all logging in the same format and indexing to the same indices. The "message" field of each event is overwritten during grok parsing so that it contains only the actual message left over after the timestamp, log level, Java class, etc. have been extracted.
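For context, the parsing step amounts to something like the following (sketched here as an ingest pipeline with a grok processor; the pattern and field names are illustrative rather than my exact config):
PUT _ingest/pipeline/tomcat-log-example
{
  "description": "Illustrative only: pull out timestamp, level and logger, then overwrite message with the remainder",
  "processors": [
    {
      "grok": {
        "field": "message",
        "patterns": [
          "%{TIMESTAMP_ISO8601:timestamp} %{LOGLEVEL:level} %{JAVACLASS:logger} - %{GREEDYDATA:message}"
        ]
      }
    }
  ]
}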
I originally couldn't start the job until I added "_source": true to the datafeed. After that I was able to start the job, but no matter what I do, it freezes after only a few buckets. Worse, once it has frozen it cannot be stopped, closed, or deleted until I restart Elasticsearch on the ML node it was assigned to.
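To be concrete, once the job is stuck, the usual calls (whether through Kibana or directly against the API) hang or fail until the node is bounced, e.g.:
POST _xpack/ml/datafeeds/datafeed-log-tomcat/_stop
POST _xpack/ml/anomaly_detectors/log-tomcat/_close
DELETE _xpack/ml/anomaly_detectors/log-tomcat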
Here is my job config:
{
  "job_id": "log-tomcat",
  "job_type": "anomaly_detector",
  "description": "Abnormal log events per tomcat container",
  "analysis_config": {
    "bucket_span": "30m",
    "categorization_field_name": "message",
    "detectors": [
      {
        "detector_description": "ML Categories (log-tomcat)",
        "function": "count",
        "by_field_name": "mlcategory",
        "partition_field_name": "container",
        "detector_rules": []
      }
    ],
    "influencers": [
      "host",
      "level",
      "logger"
    ]
  },
  "data_description": {
    "time_field": "@timestamp"
  },
  "model_snapshot_retention_days": 1,
  "datafeed_config": {
    "query_delay": "150s",
    "frequency": "60s",
    "indexes": [
      "message"
    ],
    "types": [
      "tomcat"
    ],
    "query": {
      "bool": {
        "must": [
          {
            "term": {
              "environment": {
                "value": "production",
                "boost": 1
              }
            }
          },
          {
            "term": {
              "service": {
                "value": "tomcat",
                "boost": 1
              }
            }
          }
        ],
        "disable_coord": false,
        "adjust_pure_negative": true,
        "boost": 1
      }
    },
    "scroll_size": 1000,
    "_source": true,
    "chunking_config": {
      "mode": "auto"
    }
  }
}
Here is the JSON output of the job while it is frozen:
{
  "job_id": "log-tomcat",
  "job_type": "anomaly_detector",
  "description": "Abnormal log events per tomcat container",
  "create_time": 1494974221746,
  "analysis_config": {
    "bucket_span": "30m",
    "categorization_field_name": "message",
    "detectors": [
      {
        "detector_description": "ML Categories (log-tomcat)",
        "function": "count",
        "by_field_name": "mlcategory",
        "partition_field_name": "container",
        "detector_rules": []
      }
    ],
    "influencers": [
      "host",
      "level",
      "logger"
    ]
  },
  "data_description": {
    "time_field": "@timestamp",
    "time_format": "epoch_ms"
  },
  "model_snapshot_retention_days": 1,
  "results_index_name": "shared",
  "data_counts": {
    "job_id": "log-tomcat",
    "processed_record_count": 7728,
    "processed_field_count": 38639,
    "input_bytes": 2106764,
    "input_field_count": 38640,
    "invalid_date_count": 0,
    "missing_field_count": 7729,
    "out_of_order_timestamp_count": 0,
    "empty_bucket_count": 0,
    "sparse_bucket_count": 0,
    "bucket_count": 14,
    "earliest_record_timestamp": 1494734400400,
    "latest_record_timestamp": 1494748795323,
    "last_data_time": 1494974263892,
    "input_record_count": 7728
  },
  "model_size_stats": {
    "job_id": "log-tomcat",
    "result_type": "model_size_stats",
    "model_bytes": 1516800,
    "total_by_field_count": 74,
    "total_over_field_count": 0,
    "total_partition_field_count": 12,
    "bucket_allocation_failures_count": 0,
    "memory_status": "ok",
    "log_time": 1494974263000,
    "timestamp": 1494745200000
  },
  "datafeed_config": {
    "datafeed_id": "datafeed-log-tomcat",
    "job_id": "log-tomcat",
    "query_delay": "150s",
    "frequency": "60s",
    "indexes": [
      "message"
    ],
    "types": [
      "tomcat"
    ],
    "query": {
      "bool": {
        "must": [
          {
            "term": {
              "environment": {
                "value": "production",
                "boost": 1
              }
            }
          },
          {
            "term": {
              "service": {
                "value": "tomcat",
                "boost": 1
              }
            }
          }
        ],
        "disable_coord": false,
        "adjust_pure_negative": true,
        "boost": 1
      }
    },
    "scroll_size": 1000,
    "_source": true,
    "chunking_config": {
      "mode": "auto"
    },
    "state": "started"
  },
  "state": "opened",
  "node": {
    "id": "syyOmfkhQ5qZ0BTuFoPWPA",
    "name": "elasticsearch01",
    "ephemeral_id": "utQPqCCdTayloJTBG5avcA",
    "transport_address": "10.255.72.128:9300",
    "attributes": {
      "ml.enabled": "true"
    }
  },
  "open_time": "378s"
}
Not sure what I'm doing wrong, as I haven't heard of anyone else having this issue with ML jobs on unstructured logs. Thoughts?