Master node with 100% CPU

We are seeing 100% CPU usage on a master node.
The cluster is hosted in Elastic Cloud.

Any help would be appreciated!


The tasks are attached below (it seems to run /admin/snapshot/delete constantly).

image

I'm not sure about this snapshot task (I don't know whether it refers to an actual snapshot, the same kind you'll find in Kibana, or to a memory snapshot or something similar), but after changing the retention settings of the snapshot policy in Kibana to delete after 10000 snapshots / 24 days, instead of 100 snapshots / 3 days, the load on the master node stopped.

It would be great to understand this issue, as it may be a bug in the cloud / Elastic, etc.

And the load is back. I'm not sure why I always see something related to snapshots in the running tasks, even though the interval is 1 hour (for example, /admin/snapshot/status with a 45-minute duration!).

What is the output from the _cluster/stats?pretty&human API?

Attached. Thanks a lot, @warkolm.

I'm still not sure why I see in the node's tasks the following actions all the time:

"action" : "cluster:admin/snapshot/status"
 "action" : "cluster:admin/snapshot/get",
 "action" : "cluster:admin/snapshot/delete",

while the (only) snapshot policy says

Schedule
0 0 * * * ?

 {
    "Mdnq4RAeRtqs004FxxQRgg:1340468" : {
      "node" : "Mdnq4RAeRtqs004FxxQRgg",
      "id" : 1340468,
      "type" : "transport",
      "action" : "cluster:monitor/xpack/ml/job/stats/get",
      "description" : "",
      "start_time_in_millis" : 1612349554476,
      "running_time_in_nanos" : 1308251137,
      "cancellable" : false,
      "headers" : { }
    },
    "Mdnq4RAeRtqs004FxxQRgg:466567" : {
      "node" : "Mdnq4RAeRtqs004FxxQRgg",
      "id" : 466567,
      "type" : "transport",
      "action" : "cluster:admin/snapshot/status",
      "description" : "",
      "start_time_in_millis" : 1612316979079,
      "running_time_in_nanos" : 32576705363709,
      "cancellable" : false,
      "parent_task_id" : "Ysr-3BHkT9aKvQXSQdtr2A:1225810",
      "headers" : { }
    },
    "Mdnq4RAeRtqs004FxxQRgg:1340470" : {
      "node" : "Mdnq4RAeRtqs004FxxQRgg",
      "id" : 1340470,
      "type" : "transport",
      "action" : "indices:monitor/stats",
      "description" : "",
      "start_time_in_millis" : 1612349554477,
      "running_time_in_nanos" : 1307514234,
      "cancellable" : false,
      "headers" : { }
    },
    "Mdnq4RAeRtqs004FxxQRgg:1340496" : {
      "node" : "Mdnq4RAeRtqs004FxxQRgg",
      "id" : 1340496,
      "type" : "transport",
      "action" : "cluster:monitor/tasks/lists[n]",
      "description" : "",
      "start_time_in_millis" : 1612349555784,
      "running_time_in_nanos" : 321901,
      "cancellable" : false,
      "parent_task_id" : "xLr4tF2ZSSKNscRDwYLllA:1954171",
      "headers" : { }
    },
    "Mdnq4RAeRtqs004FxxQRgg:357473" : {
      "node" : "Mdnq4RAeRtqs004FxxQRgg",
      "id" : 357473,
      "type" : "transport",
      "action" : "cluster:admin/snapshot/status",
      "description" : "",
      "start_time_in_millis" : 1612313393273,
      "running_time_in_nanos" : 36162511051383,
      "cancellable" : false,
      "parent_task_id" : "Ysr-3BHkT9aKvQXSQdtr2A:959206",
      "headers" : { }
    },
    "Mdnq4RAeRtqs004FxxQRgg:962091" : {
      "node" : "Mdnq4RAeRtqs004FxxQRgg",
      "id" : 962091,
      "type" : "transport",
      "action" : "cluster:admin/snapshot/status",
      "description" : "",
      "start_time_in_millis" : 1612334913474,
      "running_time_in_nanos" : 14642310541533,
      "cancellable" : false,
      "parent_task_id" : "Ysr-3BHkT9aKvQXSQdtr2A:2495902",
      "headers" : { }
    },
    "Mdnq4RAeRtqs004FxxQRgg:1340476" : {
      "node" : "Mdnq4RAeRtqs004FxxQRgg",
      "id" : 1340476,
      "type" : "transport",
      "action" : "indices:data/read/search",
      "description" : """indices[.security], types[], search_type[QUERY_THEN_FETCH], scroll[5m], source[{"size":1000,"query":{"term":{"doc_type":{"value":"role-mapping","boost":1.0}}},"_source":{"includes":[],"excludes":[]}}]""",
      "start_time_in_millis" : 1612349554873,
      "running_time_in_nanos" : 910793110,
      "cancellable" : true,
      "headers" : { }
    },
    "Mdnq4RAeRtqs004FxxQRgg:1340477" : {
      "node" : "Mdnq4RAeRtqs004FxxQRgg",
      "id" : 1340477,
      "type" : "transport",
      "action" : "indices:data/read/search",
      "description" : """indices[.security], types[], search_type[QUERY_THEN_FETCH], source[{"size":0,"query":{"term":{"type":{"value":"user","boost":1.0}}},"track_total_hits":2147483647}]""",
      "start_time_in_millis" : 1612349554874,
      "running_time_in_nanos" : 910431309,
      "cancellable" : true,
      "headers" : { }
    },
    "Mdnq4RAeRtqs004FxxQRgg:1328664" : {
      "node" : "Mdnq4RAeRtqs004FxxQRgg",
      "id" : 1328664,
      "type" : "transport",
      "action" : "cluster:admin/snapshot/delete",
      "description" : "",
      "start_time_in_millis" : 1612349116182,
      "running_time_in_nanos" : 439601908076,
      "cancellable" : false,
      "headers" : { }
    },
    "Mdnq4RAeRtqs004FxxQRgg:1340472" : {
      "node" : "Mdnq4RAeRtqs004FxxQRgg",
      "id" : 1340472,
      "type" : "transport",
      "action" : "indices:data/read/msearch",
      "description" : """,indices[.security], types[], search_type[QUERY_THEN_FETCH], source[{"size":0,"query":{"term":{"type":{"value":"role","boost":1.0}}},"track_total_hits":2147483647}]indices:data/read/msearch[indices[.security], types[], search_type[QUERY_THEN_FETCH], source[{"size":0,"terminate_after":1,"query":{"bool":{"must":[{"term":{"type":{"value":"role","boost":1.0}}},{"bool":{"should":[{"exists":{"field":"indices.field_security.grant","boost":1.0}},{"exists":{"field":"indices.field_security.except","boost":1.0}},{"exists":{"field":"indices.fields","boost":1.0}}],"adjust_pure_negative":true,"boost":1.0}}],"adjust_pure_negative":true,"boost":1.0}},"track_total_hits":2147483647}]indices:data/read/msearch[indices[.security], types[], search_type[QUERY_THEN_FETCH], source[{"size":0,"terminate_after":1,"query":{"bool":{"must":[{"term":{"type":{"value":"role","boost":1.0}}}],"filter":[{"exists":{"field":"indices.query","boost":1.0}}],"adjust_pure_negative":true,"boost":1.0}},"track_total_hits":2147483647}]]""",
      "start_time_in_millis" : 1612349554780,
      "running_time_in_nanos" : 1005474302,
      "cancellable" : true,
      "headers" : { }
    },
    "Mdnq4RAeRtqs004FxxQRgg:1340473" : {
      "node" : "Mdnq4RAeRtqs004FxxQRgg",
      "id" : 1340473,
      "type" : "transport",
      "action" : "indices:data/read/search",
      "description" : """indices[.security], types[], search_type[QUERY_THEN_FETCH], source[{"size":0,"query":{"term":{"type":{"value":"role","boost":1.0}}},"track_total_hits":2147483647}]""",
      "start_time_in_millis" : 1612349554780,
      "running_time_in_nanos" : 1005439702,
      "cancellable" : true,
      "parent_task_id" : "Mdnq4RAeRtqs004FxxQRgg:1340472",
      "headers" : { }
    },
    "Mdnq4RAeRtqs004FxxQRgg:1340474" : {
      "node" : "Mdnq4RAeRtqs004FxxQRgg",
      "id" : 1340474,
      "type" : "transport",
      "action" : "indices:data/read/search",
      "description" : """indices[.security], types[], search_type[QUERY_THEN_FETCH], source[{"size":0,"terminate_after":1,"query":{"bool":{"must":[{"term":{"type":{"value":"role","boost":1.0}}},{"bool":{"should":[{"exists":{"field":"indices.field_security.grant","boost":1.0}},{"exists":{"field":"indices.field_security.except","boost":1.0}},{"exists":{"field":"indices.fields","boost":1.0}}],"adjust_pure_negative":true,"boost":1.0}}],"adjust_pure_negative":true,"boost":1.0}},"track_total_hits":2147483647}]""",
      "start_time_in_millis" : 1612349554780,
      "running_time_in_nanos" : 1004923500,
      "cancellable" : true,
      "parent_task_id" : "Mdnq4RAeRtqs004FxxQRgg:1340472",
      "headers" : { }
    },
    "Mdnq4RAeRtqs004FxxQRgg:1340475" : {
      "node" : "Mdnq4RAeRtqs004FxxQRgg",
      "id" : 1340475,
      "type" : "transport",
      "action" : "indices:data/read/search",
      "description" : """indices[.security], types[], search_type[QUERY_THEN_FETCH], source[{"size":0,"terminate_after":1,"query":{"bool":{"must":[{"term":{"type":{"value":"role","boost":1.0}}}],"filter":[{"exists":{"field":"indices.query","boost":1.0}}],"adjust_pure_negative":true,"boost":1.0}},"track_total_hits":2147483647}]""",
      "start_time_in_millis" : 1612349554781,
      "running_time_in_nanos" : 1004689399,
      "cancellable" : true,
      "parent_task_id" : "Mdnq4RAeRtqs004FxxQRgg:1340472",
      "headers" : { }
    },
    "Mdnq4RAeRtqs004FxxQRgg:873372" : {
      "node" : "Mdnq4RAeRtqs004FxxQRgg",
      "id" : 873372,
      "type" : "transport",
      "action" : "cluster:admin/snapshot/status",
      "description" : "",
      "start_time_in_millis" : 1612331322773,
      "running_time_in_nanos" : 18233011841285,
      "cancellable" : false,
      "parent_task_id" : "Ysr-3BHkT9aKvQXSQdtr2A:2250652",
      "headers" : { }
    },
    "Mdnq4RAeRtqs004FxxQRgg:1340455" : {
      "node" : "Mdnq4RAeRtqs004FxxQRgg",
      "id" : 1340455,
      "type" : "direct",
      "action" : "internal:cluster/coordination/publish_state",
      "description" : "",
      "start_time_in_millis" : 1612349554277,
      "running_time_in_nanos" : 1508437753,
      "cancellable" : false,
      "headers" : { }
    },
    "Mdnq4RAeRtqs004FxxQRgg:1340419" : {
      "node" : "Mdnq4RAeRtqs004FxxQRgg",
      "id" : 1340419,
      "type" : "transport",
      "action" : "indices:admin/template/put",
      "description" : "",
      "start_time_in_millis" : 1612349553374,
      "running_time_in_nanos" : 2411770240,
      "cancellable" : false,
      "parent_task_id" : "n86rCLPyT-ebf6e4uaMNew:2893588",
      "headers" : { }
    },
    "Mdnq4RAeRtqs004FxxQRgg:777628" : {
      "node" : "Mdnq4RAeRtqs004FxxQRgg",
      "id" : 777628,
      "type" : "transport",
      "action" : "cluster:admin/snapshot/status",
      "description" : "",
      "start_time_in_millis" : 1612327726073,
      "running_time_in_nanos" : 21829712341262,
      "cancellable" : false,
      "parent_task_id" : "Ysr-3BHkT9aKvQXSQdtr2A:1984186",
      "headers" : { }
    },
    "Mdnq4RAeRtqs004FxxQRgg:1330988" : {
      "node" : "Mdnq4RAeRtqs004FxxQRgg",
      "id" : 1330988,
      "type" : "transport",
      "action" : "cluster:admin/snapshot/status",
      "description" : "",
      "start_time_in_millis" : 1612349202279,
      "running_time_in_nanos" : 353506669293,
      "cancellable" : false,
      "parent_task_id" : "Ysr-3BHkT9aKvQXSQdtr2A:3515287",
      "headers" : { }
    },
    "Mdnq4RAeRtqs004FxxQRgg:1339885" : {
      "node" : "Mdnq4RAeRtqs004FxxQRgg",
      "id" : 1339885,
      "type" : "transport",
      "action" : "cluster:monitor/xpack/usage",
      "description" : "",
      "start_time_in_millis" : 1612349532976,
      "running_time_in_nanos" : 22809192767,
      "cancellable" : false,
      "headers" : { }
    },
    "Mdnq4RAeRtqs004FxxQRgg:1340463" : {
      "node" : "Mdnq4RAeRtqs004FxxQRgg",
      "id" : 1340463,
      "type" : "transport",
      "action" : "indices:admin/template/put",
      "description" : "",
      "start_time_in_millis" : 1612349554374,
      "running_time_in_nanos" : 1411500354,
      "cancellable" : false,
      "parent_task_id" : "xLr4tF2ZSSKNscRDwYLllA:1954036",
      "headers" : { }
    },
    "Mdnq4RAeRtqs004FxxQRgg:1339688" : {
      "node" : "Mdnq4RAeRtqs004FxxQRgg",
      "id" : 1339688,
      "type" : "transport",
      "action" : "cluster:admin/snapshot/get",
      "description" : "",
      "start_time_in_millis" : 1612349527879,
      "running_time_in_nanos" : 27906168691,
      "cancellable" : false,
      "parent_task_id" : "aIsgB69pTfCzDRhxS2qqEw:2724639",
      "headers" : { }
    },
    "Mdnq4RAeRtqs004FxxQRgg:1340490" : {
      "node" : "Mdnq4RAeRtqs004FxxQRgg",
      "id" : 1340490,
      "type" : "transport",
      "action" : "cluster:monitor/xpack/usage",
      "description" : "",
      "start_time_in_millis" : 1612349555776,
      "running_time_in_nanos" : 9515129,
      "cancellable" : false,
      "headers" : { }
    },
    "Mdnq4RAeRtqs004FxxQRgg:1340491" : {
      "node" : "Mdnq4RAeRtqs004FxxQRgg",
      "id" : 1340491,
      "type" : "transport",
      "action" : "cluster:monitor/nodes/stats",
      "description" : "",
      "start_time_in_millis" : 1612349555776,
      "running_time_in_nanos" : 9364729,
      "cancellable" : false,
      "headers" : { }
    }
  }

Thanks! Can you post the hot threads from the master as well please?

The above is from the master node. This is the node I was referring to, the one that has 100% CPU. The others are more or less idle.

current state:

tasks:

Thanks for looking

** I'm now trying to upgrade the masters, and also to freeze old indices.

There's not a tonne jumping out at me. Have you tried raising an issue with the Support team?

It was due to an increase in the indexing rate. An upgrade of the master nodes resolved it.
Thanks a lot.

1 Like

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.