Node responding slowly to master, management threads spiked

We're in the process of expanding our cluster (running 5.6.3), which includes upgrading and adding new dedicated masters (5.6.9). We've had to suspend that work after adding the masters, although this may be unrelated, since the active master is still a data node. The problem we're running into is that one node (not always the same one) becomes slow to respond to node stats queries from the master, which makes our monitoring and Kibana time out until the "bad" node is restarted. Things we've observed while this is happening:

  • Five cores are at 100% (the machines have 16 or 32 cores); the load appears to come from the management thread pool
  • The hot threads output shows a lot of "completion stats" operations running
  • It seems to happen only on machines that are over the low disk watermark (this is why we're expanding)

Any ideas or suggestions would be appreciated. I've exhausted what I can find on the forums and GitHub, and our JVM knowledge is limited, so debugging the "bad" node is difficult.

Cluster stats:

{
  "_nodes" : {
    "total" : 20,
    "successful" : 20,
    "failed" : 0
  },
  "cluster_name" : "lsprod",
  "timestamp" : 1528898724083,
  "status" : "green",
  "indices" : {
    "count" : 1058,
    "shards" : {
      "total" : 3984,
      "primaries" : 1992,
      "replication" : 1.0,
      "index" : {
        "shards" : {
          "min" : 2,
          "max" : 10,
          "avg" : 3.765595463137996
        },
        "primaries" : {
          "min" : 1,
          "max" : 5,
          "avg" : 1.882797731568998
        },
        "replication" : {
          "min" : 1.0,
          "max" : 1.0,
          "avg" : 1.0
        }
      }
    },
    "docs" : {
      "count" : 5729902598,
      "deleted" : 1436753
    },
    "store" : {
      "size" : "8.9tb",
      "size_in_bytes" : 9790158306584,
      "throttle_time" : "0s",
      "throttle_time_in_millis" : 0
    },
    "fielddata" : {
      "memory_size" : "12.5mb",
      "memory_size_in_bytes" : 13165864,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size" : "3.2gb",
      "memory_size_in_bytes" : 3536164652,
      "total_count" : 434506102,
      "hit_count" : 316315062,
      "miss_count" : 118191040,
      "cache_size" : 368167,
      "cache_count" : 398960,
      "evictions" : 30793
    },
    "completion" : {
      "size" : "0b",
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 73508,
      "memory" : "18.5gb",
      "memory_in_bytes" : 19916389816,
      "terms_memory" : "14.6gb",
      "terms_memory_in_bytes" : 15697350626,
      "stored_fields_memory" : "2.7gb",
      "stored_fields_memory_in_bytes" : 2966700800,
      "term_vectors_memory" : "0b",
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory" : "10.3mb",
      "norms_memory_in_bytes" : 10872704,
      "points_memory" : "430.9mb",
      "points_memory_in_bytes" : 451883754,
      "doc_values_memory" : "753mb",
      "doc_values_memory_in_bytes" : 789581932,
      "index_writer_memory" : "264.1mb",
      "index_writer_memory_in_bytes" : 276950308,
      "version_map_memory" : "3.2mb",
      "version_map_memory_in_bytes" : 3381304,
      "fixed_bit_set" : "0b",
      "fixed_bit_set_memory_in_bytes" : 0,
      "max_unsafe_auto_id_timestamp" : 1528871745560,
      "file_sizes" : { }
    }
  },
  "nodes" : {
    "count" : {
      "total" : 20,
      "data" : 15,
      "coordinating_only" : 0,
      "master" : 18,
      "ingest" : 20
    },
    "versions" : [
      "5.6.3",
      "5.6.9"
    ],
    "os" : {
      "available_processors" : 352,
      "allocated_processors" : 352,
      "names" : [
        {
          "name" : "Linux",
          "count" : 20
        }
      ],
      "mem" : {
        "total" : "719.6gb",
        "total_in_bytes" : 772697792512,
        "free" : "109.7gb",
        "free_in_bytes" : 117813415936,
        "used" : "609.9gb",
        "used_in_bytes" : 654884376576,
        "free_percent" : 15,
        "used_percent" : 85
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 213
      },
      "open_file_descriptors" : {
        "min" : 844,
        "max" : 1525,
        "avg" : 1357
      }
    },
    "jvm" : {
      "max_uptime" : "26.9d",
      "max_uptime_in_millis" : 2326618800,
      "versions" : [
        {
          "version" : "1.8.0_171",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "25.171-b10",
          "vm_vendor" : "Oracle Corporation",
          "count" : 5
        },
        {
          "version" : "1.8.0_161",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "25.161-b14",
          "vm_vendor" : "Oracle Corporation",
          "count" : 15
        }
      ],
      "mem" : {
        "heap_used" : "145.9gb",
        "heap_used_in_bytes" : 156733736656,
        "heap_max" : "317.7gb",
        "heap_max_in_bytes" : 341155446784
      },
      "threads" : 3883
    },
    "fs" : {
      "total" : "23.7tb",
      "total_in_bytes" : 26118701961216,
      "free" : "14.8tb",
      "free_in_bytes" : 16319050977280,
      "available" : "14.8tb",
      "available_in_bytes" : 16310393409536,
      "spins" : "true"
    },
    "plugins" : [ ],
    "network_types" : {
      "transport_types" : {
        "netty4" : 20
      },
      "http_types" : {
        "netty4" : 20
      }
    }
  }
}

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.