Nodes leave and then rejoin the cluster randomly

#1

Hi,

We have a 5-node Elasticsearch cluster (7.0.0). In the Elasticsearch logs I see nodes randomly leaving and then rejoining the cluster, even though we did not restart them.

Here's a sample log:

org.elasticsearch.action.FailedNodeException: Failed node [aB_fCt7SRUeGdGJF_j_DDg]
        at org.elasticsearch.action.support.nodes.TransportNodesAction$AsyncAction.onFailure(TransportNodesAction.java:223) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.action.support.nodes.TransportNodesAction$AsyncAction$1.handleException(TransportNodesAction.java:198) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.transport.TransportService.sendRequest(TransportService.java:533) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.action.support.nodes.TransportNodesAction.doExecute(TransportNodesAction.java:82) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.action.support.nodes.TransportNodesAction.doExecute(TransportNodesAction.java:51) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.action.support.TransportAction$RequestFilterChain.proceed(TransportAction.java:145) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.action.support.TransportAction.execute(TransportAction.java:121) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.action.support.TransportAction.execute(TransportAction.java:64) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.indices.store.TransportNodesListShardStoreMetaData.list(TransportNodesListShardStoreMetaData.java:89) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.gateway.AsyncShardFetch.asyncFetch(AsyncShardFetch.java:283) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.gateway.AsyncShardFetch.fetchData(AsyncShardFetch.java:126) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.gateway.GatewayAllocator$InternalReplicaShardAllocator.fetchData(GatewayAllocator.java:183) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.gateway.ReplicaShardAllocator.makeAllocationDecision(ReplicaShardAllocator.java:167) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.gateway.BaseGatewayShardAllocator.allocateUnassigned(BaseGatewayShardAllocator.java:59) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.gateway.GatewayAllocator.innerAllocatedUnassigned(GatewayAllocator.java:116) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.gateway.GatewayAllocator.allocateUnassigned(GatewayAllocator.java:104) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.cluster.routing.allocation.AllocationService.reroute(AllocationService.java:410) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.cluster.routing.allocation.AllocationService.reroute(AllocationService.java:378) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.cluster.routing.allocation.AllocationService.reroute(AllocationService.java:361) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.cluster.coordination.JoinTaskExecutor.execute(JoinTaskExecutor.java:155) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.cluster.coordination.JoinHelper$1.execute(JoinHelper.java:118) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.cluster.service.MasterService.executeTasks(MasterService.java:687) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.cluster.service.MasterService.calculateTaskOutputs(MasterService.java:310) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.cluster.service.MasterService.runTasks(MasterService.java:210) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.cluster.service.MasterService$Batcher.run(MasterService.java:142) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.cluster.service.TaskBatcher.runIfNotProcessed(TaskBatcher.java:150) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.cluster.service.TaskBatcher$BatchedTask.run(TaskBatcher.java:188) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingRunnable.run(ThreadContext.java:681) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.common.util.concurrent.PrioritizedEsThreadPoolExecutor$TieBreakingPrioritizedRunnable.runAndClean(PrioritizedEsThreadPoolExecutor.java:252) [elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.common.util.concurrent.PrioritizedEsThreadPoolExecutor$TieBreakingPrioritizedRunnable.run(PrioritizedEsThreadPoolExecutor.java:215) [elasticsearch-7.0.0.jar:7.0.0]
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) [?:?]
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) [?:?]
        at java.lang.Thread.run(Thread.java:835) [?:?]
Caused by: org.elasticsearch.transport.NodeNotConnectedException: [node02-new][x.x.x.x:9300] Node not connected
        at org.elasticsearch.transport.ConnectionManager.getConnection(ConnectionManager.java:151) ~[elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.transport.TransportService.getConnection(TransportService.java:557) ~[elasticsearch-7.0.0.jar:7.0.0]
        at org.elasticsearch.transport.TransportService.sendRequest(TransportService.java:529) ~[elasticsearch-7.0.0.jar:7.0.0]
        ... 31 more
node-join[{node04-new}{N3JatoQ7Ri2z-r3l5v1h0Q}{NYSmoaRFQQeo1Emn2SdsgQ}{x.x.x.x}{x.x.x.x:9300} join existing leader, {node02-new}{aB_fCt7SRUeGdGJF_j_DDg}{OQODFjYNSaifpXZMh7lBJQ}{x.x.x.x}{x.x.x.x:9300} join existing leader], term: 5, version: 30965, reason: added {{node02-new}{aB_fCt7SRUeGdGJF_j_DDg}{OQODFjYNSaifpXZMh7lBJQ}{x.x.x.x}{x.x.x.x:9300},}
added {{node02-new}{aB_fCt7SRUeGdGJF_j_DDg}{OQODFjYNSaifpXZMh7lBJQ}{x.x.x.x}{x.x.x.x:9300},}, term: 5, version: 30965, reason: Publication{term=5, version=30965}
node-left[{node02-new}{aB_fCt7SRUeGdGJF_j_DDg}{OQODFjYNSaifpXZMh7lBJQ}{x.x.x.x}{x.x.x.x:9300} disconnected], term: 5, version: 30966, reason: removed {{node02-new}{aB_fCt7SRUeGdGJF_j_DDg}{OQODFjYNSaifpXZMh7lBJQ}{x.x.x.x}{x.x.x.x:9300},}
removed {{node02-new}{aB_fCt7SRUeGdGJF_j_DDg}{OQODFjYNSaifpXZMh7lBJQ}{x.x.x.x}{x.x.x.x:9300},}, term: 5, version: 30966, reason: Publication{term=5, version=30966}
node-join[{node02-new}{aB_fCt7SRUeGdGJF_j_DDg}{OQODFjYNSaifpXZMh7lBJQ}{x.x.x.x}{x.x.x.x:9300} join existing leader], term: 5, version: 30967, reason: added {{node02-new}{aB_fCt7SRUeGdGJF_j_DDg}{OQODFjYNSaifpXZMh7lBJQ}{x.x.x.x}{x.x.x.x:9300},}
added {{node02-new}{aB_fCt7SRUeGdGJF_j_DDg}{OQODFjYNSaifpXZMh7lBJQ}{x.x.x.x}{x.x.x.x:9300},}, term: 5, version: 30967, reason: Publication{term=5, version=30967}
Cluster health status changed from [YELLOW] to [GREEN] (reason: [shards started [[.kibana_1][0]] ...]).