[ERROR][o.e.x.m.c.n.NodeStatsCollector] collector [node_stats] timed out when collecting data

Hi guys, we have an issue where part of our logs has not been indexed.
In the logs of almost all Elasticsearch nodes I see this
error:

[ERROR][o.e.x.m.c.n.NodeStatsCollector]  collector [node_stats] timed out when collecting data

As far as I know, the node_stats collector has nothing to do with index content (the indexed data).

Moreover, this error started appearing a long time before the data went missing (a few days ago).

Am I right?

What is the full output of the cluster stats API?

{
  "cluster_name": "es",
  "status": "green",
  "timed_out": false,
  "number_of_nodes": 9,
  "number_of_data_nodes": 9,
  "active_primary_shards": 14330,
  "active_shards": 28660,
  "relocating_shards": 0,
  "initializing_shards": 0,
  "unassigned_shards": 0,
  "delayed_unassigned_shards": 0,
  "number_of_pending_tasks": 0,
  "number_of_in_flight_fetch": 0,
  "task_max_waiting_in_queue_millis": 0,
  "active_shards_percent_as_number": 100
}
{
  "_nodes": {
    "total": 9,
    "successful": 9,
    "failed": 0
  },
  "cluster_name": "es",
  "timestamp": 1596953800396,
  "status": "green",
  "indices": {
    "count": 3915,
    "shards": {
      "total": 28710,
      "primaries": 14355,
      "replication": 1,
      "index": {
        "shards": {
          "min": 2,
          "max": 10,
          "avg": 7.333333333333333
        },
        "primaries": {
          "min": 1,
          "max": 5,
          "avg": 3.6666666666666665
        },
        "replication": {
          "min": 1,
          "max": 1,
          "avg": 1
        }
      }
    },
    "docs": {
      "count": 8547917092,
      "deleted": 5658175
    },
    "store": {
      "size_in_bytes": 6949711276970
    },
    "fielddata": {
      "memory_size_in_bytes": 1190315264,
      "evictions": 0
    },
    "query_cache": {
      "memory_size_in_bytes": 0,
      "total_count": 0,
      "hit_count": 0,
      "miss_count": 0,
      "cache_size": 0,
      "cache_count": 0,
      "evictions": 0
    },
    "completion": {
      "size_in_bytes": 0
    },
    "segments": {
      "count": 254703,
      "memory_in_bytes": 18128415488,
      "terms_memory_in_bytes": 13752476790,
      "stored_fields_memory_in_bytes": 3213846712,
      "term_vectors_memory_in_bytes": 0,
      "norms_memory_in_bytes": 369479488,
      "points_memory_in_bytes": 395877454,
      "doc_values_memory_in_bytes": 396735044,
      "index_writer_memory_in_bytes": 167550394,
      "version_map_memory_in_bytes": 1534854,
      "fixed_bit_set_memory_in_bytes": 0,
      "max_unsafe_auto_id_timestamp": 1596948769499,
      "file_sizes": {}
    }
  },
  "nodes": {
    "count": {
      "total": 9,
      "data": 9,
      "coordinating_only": 0,
      "master": 3,
      "ingest": 9
    },
    "versions": [
      "6.3.1"
    ],
    "os": {
      "available_processors": 36,
      "allocated_processors": 36,
      "names": [
        {
          "name": "Linux",
          "count": 9
        }
      ],
      "mem": {
        "total_in_bytes": 606504185856,
        "free_in_bytes": 7980625920,
        "used_in_bytes": 598523559936,
        "free_percent": 1,
        "used_percent": 99
      }
    },
    "process": {
      "cpu": {
        "percent": 263
      },
      "open_file_descriptors": {
        "min": 11319,
        "max": 17443,
        "avg": 14510
      }
    },
    "jvm": {
      "max_uptime_in_millis": 33153082551,
      "versions": [
        {
          "version": "1.8.0_102",
          "vm_name": "Java HotSpot(TM) 64-Bit Server VM",
          "vm_version": "25.102-b14",
          "vm_vendor": "Oracle Corporation",
          "count": 9
        }
      ],
      "mem": {
        "heap_used_in_bytes": 103098245904,
        "heap_max_in_bytes": 169337421824
      },
      "threads": 1746
    },
    "fs": {
      "total_in_bytes": 9739008036864,
      "free_in_bytes": 2711540731904,
      "available_in_bytes": 2313655341056
    },
    "plugins": [],
    "network_types": {
      "transport_types": {
        "security4": 9
      },
      "http_types": {
        "security4": 9
      }
    }
  }
}

Your problem is that you have far too many shards, which makes gathering cluster information slow. Having lots of small shards is very inefficient, so you will need to reduce the shard count dramatically. Please read this blog post for guidance.

But can that be the reason for this specific timeout ERROR, and can it cause data to be lost on almost all nodes? I mean index data, not information about the nodes.

If gathering node statistics times out, it is possible that other cluster operations, like creating new indices or relocating shards, are slow too, which could cause problems.

Thank you.

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.