Elasticsearch 7.13 - monitoring execution failed

Hi Folks ,

I have a lot of issues with my ELK Stack. This stack is composed of 3 nodes (Master, Data, Data). The logs are filled with the following errors. When I try to run a transform on an index, it fails, and I also see some monitoring indices shown in red in Kibana.

[2021-06-28T10:20:48,267][WARN ][o.e.x.m.e.l.LocalExporter] [es-node-02] unexpected error while indexing monitoring document
org.elasticsearch.xpack.monitoring.exporter.ExportException: RemoteTransportException[[es-node-03][10.205.224.98:9300][indices:data/write/bulk[s]]]; nested: UnavailableShardsException[[.monitoring-es-7-2021.06.28][0] primary shard is not active Timeout: [1m], request: [BulkShardRequest [[.monitoring-es-7-2021.06.28][0]] containing [index {[.monitoring-es-7-2021.06.28][_doc][lqEhUnoBC2Hhzjs8S-go], source[{"cluster_uuid":"eysukA7-R62aU8RgRgA6uQ","timestamp":"2021-06-28T10:19:11.452Z","interval_ms":10000,"type":"node_stats","source_node":{"uuid":"AZCoXzJVRqilE_-BXdmUnw","host":"10.205.224.114","transport_address":"10.205.224.114:9300","ip":"10.205.224.114","name":"es-node-02","timestamp":"2021-06-28T10:19:11.447Z"},"node_stats":{"node_id":"AZCoXzJVRqilE_-BXdmUnw","node_master":false,"mlockall":false,"indices":{"docs":{"count":2015863},"store":{"size_in_bytes":648557047},"indexing":{"index_total":13,"index_time_in_millis":63,"throttle_time_in_millis":0},"search":{"query_total":38,"query_time_in_millis":258},"query_cache":{"memory_size_in_bytes":0,"hit_count":0,"miss_count":36,"evictions":0},"fielddata":{"memory_size_in_bytes":2328,"evictions":0},"segments":{"count":88,"memory_in_bytes":1559424,"terms_memory_in_bytes":659408,"stored_fields_memory_in_bytes":59504,"term_vectors_memory_in_bytes":0,"norms_memory_in_bytes":26944,"points_memory_in_bytes":0,"doc_values_memory_in_bytes":813568,"index_writer_memory_in_bytes":0,"version_map_memory_in_bytes":219,"fixed_bit_set_memory_in_bytes":113472},"request_cache":{"memory_size_in_bytes":4736,"evictions":0,"hit_count":0,"miss_count":3}},"os":{"cpu":{"load_average":{"1m":6.64,"5m":5.95,"15m":5.96}},"cgroup":{"cpuacct":{"control_group":"/system.slice/elasticsearch.service","usage_nanos":77567141859},"cpu":{"control_group":"/system.slice/elasticsearch.service","cfs_period_micros":100000,"cfs_quota_micros":-1,"stat":{"number_of_elapsed_periods":0,"number_of_times_throttled":0,"time_throttled_nanos":0}},"memory":{"control_group":"/sy
stem.slice/elasticsearch.service","limit_in_bytes":"9223372036854771712","usage_in_bytes":"36011626496"}}},"process":{"open_file_descriptors":494,"max_file_descriptors":65535,"cpu":{"percent":37}},"jvm":{"mem":{"heap_used_in_bytes":353281920,"heap_used_percent":1,"heap_max_in_bytes":34359738368},"gc":{"collectors":{"young":{"collection_count":5,"collection_time_in_millis":201},"old":{"collection_count":0,"collection_time_in_millis":0}}}},"thread_pool":{"generic":{"threads":4,"queue":0,"rejected":0},"get":{"threads":0,"queue":0,"rejected":0},"management":{"threads":4,"queue":0,"rejected":0},"search":{"threads":13,"queue":0,"rejected":0},"watcher":{"threads":0,"queue":0,"rejected":0},"write":{"threads":0,"queue":0,"rejected":0}},"fs":{"total":{"total_in_bytes":214643507200,"free_in_bytes":40383897600,"available_in_bytes":40383897600},"io_stats":{"total":{"operations":889,"read_operations":100,"write_operations":789,"read_kilobytes":50356,"write_kilobytes":5291}}}}}]}]]];
        at org.elasticsearch.xpack.monitoring.exporter.local.LocalBulk.lambda$throwExportException$2(LocalBulk.java:126) ~[x-pack-monitoring-7.13.2.jar:7.13.2]
        at java.util.stream.ReferencePipeline$3$1.accept(ReferencePipeline.java:197) ~[?:?]
        at java.util.stream.ReferencePipeline$2$1.accept(ReferencePipeline.java:179) ~[?:?]
        at java.util.Spliterators$ArraySpliterator.forEachRemaining(Spliterators.java:948) ~[?:?]
        at java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:484) ~[?:?]
        at java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:474) ~[?:?]
        at java.util.stream.ForEachOps$ForEachOp.evaluateSequential(ForEachOps.java:150) ~[?:?]
        at java.util.stream.ForEachOps$ForEachOp$OfRef.evaluateSequential(ForEachOps.java:173) ~[?:?]
        at java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234) ~[?:?]
        at java.util.stream.ReferencePipeline.forEach(ReferencePipeline.java:596) ~[?:?]

Caused by: org.elasticsearch.transport.RemoteTransportException: [es-node-03][10.205.224.98:9300][indices:data/write/bulk[s]]
Caused by: org.elasticsearch.action.UnavailableShardsException: [.monitoring-es-7-2021.06.28][0] primary shard is not active Timeout: [1m], request: [BulkShardRequest [[.monitoring-es-7-2021.06.28][0]] containing [index {[.monitoring-es-7-2021.06.28][_doc][lqEhUnoBC2Hhzjs8S-go], source[{"cluster_uuid":"eysukA7-R62aU8RgRgA6uQ","timestamp":"2021-06-28T10:19:11.452Z","interval_ms":10000,"type":"node_stats","source_node":{"uuid":"AZCoXzJVRqilE_-BXdmUnw","host":"10.205.224.114","transport_address":"10.205.224.114:9300","ip":"10.205.224.114","name":"es-node-02","timestamp":"2021-06-28T10:19:11.447Z"},"node_stats":{"node_id":"AZCoXzJVRqilE_-BXdmUnw","node_master":false,"mlockall":false,"indices":{"docs":{"count":2015863},"store":{"size_in_bytes":648557047},"indexing":{"index_total":13,"index_time_in_millis":63,"throttle_time_in_millis":0},"search":{"query_total":38,"query_time_in_millis":258},"query_cache":{"memory_size_in_bytes":0,"hit_count":0,"miss_count":36,"evictions":0},"fielddata":{"memory_size_in_bytes":2328,"evictions":0},"segments":{"count":88,"memory_in_bytes":1559424,"terms_memory_in_bytes":659408,"stored_fields_memory_in_bytes":59504,"term_vectors_memory_in_bytes":0,"norms_memory_in_bytes":26944,"points_memory_in_bytes":0,"doc_values_memory_in_bytes":813568,"index_writer_memory_in_bytes":0,"version_map_memory_in_bytes":219,"fixed_bit_set_memory_in_bytes":113472},"request_cache":{"memory_size_in_bytes":4736,"evictions":0,"hit_count":0,"miss_count":3}},"os":{"cpu":{"load_average":{"1m":6.64,"5m":5.95,"15m":5.96}},"cgroup":{"cpuacct":{"control_group":"/system.slice/elasticsearch.service","usage_nanos":77567141859},"cpu":{"control_group":"/system.slice/elasticsearch.service","cfs_period_micros":100000,"cfs_quota_micros":-1,"stat":{"number_of_elapsed_periods":0,"number_of_times_throttled":0,"time_throttled_nanos":0}},"memory":{"control_group":"/system.slice/elasticsearch.service","limit_in_bytes":"9223372036854771712","usage_in_bytes":"36011626496"}}},"process":{"o
pen_file_descriptors":494,"max_file_descriptors":65535,"cpu":{"percent":37}},"jvm":{"mem":{"heap_used_in_bytes":353281920,"heap_used_percent":1,"heap_max_in_bytes":34359738368},"gc":{"collectors":{"young":{"collection_count":5,"collection_time_in_millis":201},"old":{"collection_count":0,"collection_time_in_millis":0}}}},"thread_pool":{"generic":{"threads":4,"queue":0,"rejected":0},"get":{"threads":0,"queue":0,"rejected":0},"management":{"threads":4,"queue":0,"rejected":0},"search":{"threads":13,"queue":0,"rejected":0},"watcher":{"threads":0,"queue":0,"rejected":0},"write":{"threads":0,"queue":0,"rejected":0}},"fs":{"total":{"total_in_bytes":214643507200,"free_in_bytes":40383897600,"available_in_bytes":40383897600},"io_stats":{"total":{"operations":889,"read_operations":100,"write_operations":789,"read_kilobytes":50356,"write_kilobytes":5291}}}}}]}]]
        at org.elasticsearch.action.support.replication.TransportReplicationAction$ReroutePhase.retryBecauseUnavailable(TransportReplicationAction.java:884) ~[elasticsearch-7.13.2.jar:7.13.2]
        at org.elasticsearch.action.support.replication.TransportReplicationAction$ReroutePhase.doRun(TransportReplicationAction.java:734) ~[elasticsearch-7.13.2.jar:7.13.2]
        at org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:26) ~[elasticsearch-7.13.2.jar:7.13.2]
        at org.elasticsearch.action.support.replication.TransportReplicationAction$ReroutePhase$2.onTimeout(TransportReplicationAction.java:844) ~[elasticsearch-7.13.2.jar:7.13.2]
        at org.elasticsearch.cluster.ClusterStateObserver$ContextPreservingListener.onTimeout(ClusterStateObserver.java:324) ~[elasticsearch-7.13.2.jar:7.13.2]
        at org.elasticsearch.cluster.ClusterStateObserver$ObserverClusterStateListener.onTimeout(ClusterStateObserver.java:241) ~[elasticsearch-7.13.2.jar:7.13.2]
        at org.elasticsearch.cluster.service.ClusterApplierService$NotifyTimeout.run(ClusterApplierService.java:590) ~[elasticsearch-7.13.2.jar:7.13.2]
        at org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingRunnable.run(ThreadContext.java:673) ~[elasticsearch-7.13.2.jar:7.13.2]
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1130) ~[?:?]
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:630) ~[?:?]
        at java.lang.Thread.run(Thread.java:831) ~[?:?]
[2021-06-28T10:20:48,277][WARN ][o.e.x.m.MonitoringService] [es-node-02] monitoring execution failed
org.elasticsearch.xpack.monitoring.exporter.ExportException: failed to flush export bulks
        at org.elasticsearch.xpack.monitoring.exporter.ExportBulk$Compound.lambda$doFlush$0(ExportBulk.java:110) [x-pack-monitoring-7.13.2.jar:7.13.2]
        at org.elasticsearch.action.ActionListener$1.onFailure(ActionListener.java:142) [elasticsearch-7.13.2.jar:7.13.2]
        at org.elasticsearch.xpack.monitoring.exporter.local.LocalBulk.throwExportException(LocalBulk.java:133) [x-pack-monitoring-7.13.2.jar:7.13.2]
        at org.elasticsearch.xpack.monitoring.exporter.local.LocalBulk.lambda$doFlush$0(LocalBulk.java:109) [x-pack-monitoring-7.13.2.jar:7.13.2]

Can you please help me resolve this issue?

Monitoring and transform are separate features and should not have any interaction. It is best if we tackle the problems separately. The stack trace relates only to monitoring, which is why I will remove the transforms tag.

Regarding your transform problem, please open a new thread. In order to help you, we need essential information like:

  • version of the stack
  • the transform configuration, or at least the essential parts that cause problems
  • the error message (if transform failed, the output of _transform/{id}/_stats contains the reason of the failure)

Some sample data that shows the problem might be helpful as well.

I am afraid I can't help with the monitoring problem; I hope someone else picks this up. If you have a support contract or are a cloud customer, I suggest contacting support. Especially for cluster instability problems, support can help gather diagnostics from your cluster and therefore better analyze your problems.

Hi ,

Please find below the output of _transform/{id}/_stats. I'm currently logged in as a superuser via Kibana.

{
  "node_failures" : [
    {
      "type" : "failed_node_exception",
      "reason" : "Failed to retrieve checkpointing info",
      "node_id" : "7ZFlsLhaTIu4qh01by-fgA",
      "caused_by" : {
        "type" : "checkpoint_exception",
        "reason" : "checkpoint_exception: Failed to retrieve configuration",
        "caused_by" : {
          "type" : "search_phase_execution_exception",
          "reason" : "all shards failed",
          "phase" : "query",
          "grouped" : true,
          "failed_shards" : [
            {
              "shard" : 0,
              "index" : ".transform-internal-007",
              "node" : "AZCoXzJVRqilE_-BXdmUnw",
              "reason" : {
                "type" : "security_exception",
                "reason" : "action [indices:data/read/search[phase/query]] is unauthorized for user [_xpack] with roles [_xpack], this action is granted by the index privileges [read,all]",
                "caused_by" : {
                  "type" : "illegal_state_exception",
                  "reason" : "There are no external requests known to support wildcards that don't support replacing their indices"
                }
              }
            }
          ],
          "caused_by" : {
            "type" : "security_exception",
            "reason" : "action [indices:data/read/search[phase/query]] is unauthorized for user [_xpack] with roles [_xpack], this action is granted by the index privileges [read,all]",
            "caused_by" : {
              "type" : "illegal_state_exception",
              "reason" : "There are no external requests known to support wildcards that don't support replacing their indices"
            }
          }
        }
      }
    }
  ],
  "count" : 1,
  "transforms" : [
    {
      "id" : "tr300test",
      "state" : "failed",
      "reason" : "Failed to load transform configuration for transform [tr300test]",
      "node" : {
        "id" : "7ZFlsLhaTIu4qh01by-fgA",
        "name" : "es-node-03",
        "ephemeral_id" : "BTnd2cvoQ6ya0nEWpt2HOg",
        "transport_address" : "xxxx:9300",
        "attributes" : { }
      },
      "stats" : {
        "pages_processed" : 0,
        "documents_processed" : 0,
        "documents_indexed" : 0,
        "documents_deleted" : 0,
        "trigger_count" : 0,
        "index_time_in_ms" : 0,
        "index_total" : 0,
        "index_failures" : 0,
        "search_time_in_ms" : 0,
        "search_total" : 0,
        "search_failures" : 0,
        "processing_time_in_ms" : 0,
        "processing_total" : 0,
        "delete_time_in_ms" : 0,
        "exponential_avg_checkpoint_duration_ms" : 0.0,
        "exponential_avg_documents_indexed" : 0.0,
        "exponential_avg_documents_processed" : 0.0
      },
      "checkpointing" : {
        "last" : {
          "checkpoint" : 0
        }
      }
    }
  ]
}

{
  "cluster_name" : "01-cluster",
  "status" : "green",
  "timed_out" : false,
  "number_of_nodes" : 3,
  "number_of_data_nodes" : 3,
  "active_primary_shards" : 34,
  "active_shards" : 68,
  "relocating_shards" : 0,
  "initializing_shards" : 0,
  "unassigned_shards" : 0,
  "delayed_unassigned_shards" : 0,
  "number_of_pending_tasks" : 0,
  "number_of_in_flight_fetch" : 0,
  "task_max_waiting_in_queue_millis" : 0,
  "active_shards_percent_as_number" : 100.0
}

{
  "username" : "elastic",
  "roles" : [
    "superuser"
  ],
  "full_name" : null,
  "email" : null,
  "metadata" : {
    "_reserved" : true
  },
  "enabled" : true,
  "authentication_realm" : {
    "name" : "reserved",
    "type" : "reserved"
  },
  "lookup_realm" : {
    "name" : "reserved",
    "type" : "reserved"
  },
  "authentication_type" : "realm"
}

Thanks for the update.

It seems to me that your cluster is in bad shape and we need to fix this first before looking into your transform issue. The permission problem in the _stats output should not happen. The output tells me that you are using 7.13.

  • Are you coming from an older version? In other words: is this an upgraded cluster or a fresh install?
  • Are all nodes using the same version?
  • As you are using security, did you enable security on all nodes?

I'm using security on all nodes. It's a fresh install on a Red Hat server. All nodes are on the same version: I used the same RPM (7.13) on the 3 servers. This means that it's not an upgraded cluster.

Any idea about this issue?