I'm currently trying to develop a machine learning job to detect when a user identity is performing suspicious actions. I'm having a hard time wrapping my head around ML, so I may need a bit of help determining the best way to structure my data.
Currently I'm using our Cisco Umbrella dataset and running a freq_rare detector by identities over internal IP addresses, which doesn't seem to be giving me anything. Now that I think about it, does the top value (by_field_name) mean that I am currently looking for rare identities? If so, I'm completely missing the point.
Here's a dump of the current filtered JSON:
{
"job_id": "detect_rare_based_on_internal_ip_and_identity",
"job_type": "anomaly_detector",
"job_version": "7.1.1",
"groups": [
"suspicious_behavior_detection"
],
"description": "Detects suspicious behavior of users within the population of internal IPs.",
"create_time": 1565117280576,
"analysis_config": {
"bucket_span": "15m",
"detectors": [
{
"detector_description": "freq_rare by identities over internal_ip",
"function": "freq_rare",
"by_field_name": "identities",
"over_field_name": "internal_ip",
"detector_index": 0
}
],
"influencers": [
"most_granular_identity",
"action"
]
},
"analysis_limits": {
"model_memory_limit": "1024mb",
"categorization_examples_limit": 4
},
"data_description": {
"time_field": "@timestamp",
"time_format": "epoch_ms"
},
"model_snapshot_retention_days": 1,
"custom_settings": {
"custom_urls": []
},
"results_index_name": "shared",
"data_counts": {
"job_id": "detect_rare_based_on_internal_ip_and_identity",
"processed_record_count": 25325977,
"processed_field_count": 101303908,
"input_bytes": 4366538046,
"input_field_count": 101303908,
"invalid_date_count": 0,
"missing_field_count": 0,
"out_of_order_timestamp_count": 0,
"empty_bucket_count": 8,
"sparse_bucket_count": 0,
"bucket_count": 780,
"earliest_record_timestamp": 1556213692000,
"latest_record_timestamp": 1556915689000,
"last_data_time": 1565118610064,
"latest_empty_bucket_timestamp": 1556425800000,
"input_record_count": 25325977
},
"model_size_stats": {
"job_id": "detect_rare_based_on_internal_ip_and_identity",
"result_type": "model_size_stats",
"model_bytes": 38678638,
"total_by_field_count": 14084,
"total_over_field_count": 3255,
"total_partition_field_count": 2,
"bucket_allocation_failures_count": 0,
"memory_status": "ok",
"log_time": 1565118549000,
"timestamp": 1556896500000
},
"datafeed_config": {
"datafeed_id": "datafeed-detect_rare_based_on_internal_ip_and_identity",
"job_id": "detect_rare_based_on_internal_ip_and_identity",
"query_delay": "108588ms",
"indices": [
"umbrella-*"
],
"query": {
"match_all": {}
},
"scroll_size": 1000,
"chunking_config": {
"mode": "auto"
},
"delayed_data_check_config": {
"enabled": true
},
"state": "started"
},
"state": "opened",
"node": {
"id": "msWt_FOPQIqRu-OwzKG8HQ",
"name": "elastic-data03",
"ephemeral_id": "J1mp6JE0RB6ppLdTE4E78A",
"transport_address": "10.254.10.229:9300",
"attributes": {
"ml.machine_memory": "33566892032",
"ml.max_open_jobs": "20",
"xpack.installed": "true"
}
},
"open_time": "1322s"
}