All was well until the recent reboot of the VM. Nothing changes except the OS patching. Somwhow the network connection between the nodes is dropping but they are not stayinh up for more than 15 minutes.
Here is my setup
Elasticsearch Version :- 5.3.0
4 Data nodes - 5 Gig Heap ( No Issues there)
2 Master Nodes - 1 Gig heap ( Crashing with Fatal network error and Heap OOM)
2 Coordinating Nodes - 2 Gig Heap ( Crashing with Fatal network error and Heap OOM)
Here is the Error Stack
[2018-11-06T14:54:38,421][ERROR][o.e.t.n.Netty4Utils ] fatal error on the network layer
at org.elasticsearch.transport.netty4.Netty4Utils.maybeDie(Netty4Utils.java:140)
at org.elasticsearch.transport.netty4.Netty4MessageChannelHandler.exceptionCaught(Netty4MessageChannelHandler.java:83)
at io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:286)
at io.netty.channel.AbstractChannelHandlerContext.notifyHandlerException(AbstractChannelHandlerContext.java:851)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:349)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:341)
at io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:293)
at io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:280)
at io.netty.handler.codec.ByteToMessageDecoder.callDecode(ByteToMessageDecoder.java:396)
at io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:248)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:363)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:349)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:341)
at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1334)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:363)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:349)
at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:926)
at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:129)
at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:642)
at io.netty.channel.nio.NioEventLoop.processSelectedKeysPlain(NioEventLoop.java:527)
at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:481)
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:441)
at io.netty.util.concurrent.SingleThreadEventExecutor$5.run(SingleThreadEventExecutor.java:858)
at java.lang.Thread.run(Thread.java:745)
[2018-11-06T14:54:38,409][ERROR][o.e.b.ElasticsearchUncaughtExceptionHandler] [NLOG3_Master] fatal error in thread [elasticsearch[NLOG3_Master][clusterService#updateTask][T#1]], exiting
java.lang.OutOfMemoryError: Java heap space
at java.util.Arrays.copyOf(Arrays.java:3181) ~[?:1.8.0_73]
at java.util.ArrayList.grow(ArrayList.java:261) ~[?:1.8.0_73]
at java.util.ArrayList.ensureExplicitCapacity(ArrayList.java:235) ~[?:1.8.0_73]
at java.util.ArrayList.ensureCapacityInternal(ArrayList.java:227) ~[?:1.8.0_73]
at java.util.ArrayList.add(ArrayList.java:458) ~[?:1.8.0_73]
at org.elasticsearch.cluster.routing.IndexShardRoutingTable$Builder.addShard(IndexShardRoutingTable.java:585) ~[elasticsearch-5.3.0.jar:5.3.0]
at org.elasticsearch.cluster.routing.IndexRoutingTable$Builder.addShard(IndexRoutingTable.java:532) ~[elasticsearch-5.3.0.jar:5.3.0]
at org.elasticsearch.cluster.routing.RoutingTable$Builder.updateNodes(RoutingTable.java:449) ~[elasticsearch-5.3.0.jar:5.3.0]
at org.elasticsearch.cluster.routing.allocation.AllocationService.buildResultAndLogHealthChange(AllocationService.java:101) ~[elasticsearch-5.3.0.jar:5.3.0]
at org.elasticsearch.cluster.routing.allocation.AllocationService.applyStartedShards(AllocationService.java:95) ~[elasticsearch-5.3.0.jar:5.3.0]
at org.elasticsearch.cluster.action.shard.ShardStateAction$ShardStartedClusterStateTaskExecutor.execute(ShardStateAction.java:438) ~[elasticsearch-5.3.0.jar:5.3.0]
at org.elasticsearch.cluster.service.ClusterService.executeTasks(ClusterService.java:679) ~[elasticsearch-5.3.0.jar:5.3.0]
at org.elasticsearch.cluster.service.ClusterService.calculateTaskOutputs(ClusterService.java:658) ~[elasticsearch-5.3.0.jar:5.3.0]
at org.elasticsearch.cluster.service.ClusterService.runTasks(ClusterService.java:617) ~[elasticsearch-5.3.0.jar:5.3.0]
at org.elasticsearch.cluster.service.ClusterService$UpdateTask.run(ClusterService.java:1117) ~[elasticsearch-5.3.0.jar:5.3.0]
at org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingRunnable.run(ThreadContext.java:544) ~[elasticsearch-5.3.0.jar:5.3.0]
at org.elasticsearch.common.util.concurrent.PrioritizedEsThreadPoolExecutor$TieBreakingPrioritizedRunnable.runAndClean(PrioritizedEsThreadPoolExecutor.java:238) ~[elasticsearch-5.3.0.jar:5.3.0]
at org.elasticsearch.common.util.concurrent.PrioritizedEsThreadPoolExecutor$TieBreakingPrioritizedRunnable.run(PrioritizedEsThreadPoolExecutor.java:201) ~[elasticsearch-5.3.0.jar:5.3.0]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) ~[?:1.8.0_73]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) ~[?:1.8.0_73]
at java.lang.Thread.run(Thread.java:745) [?:1.8.0_73]