[2018-12-19T05:10:26,684][DEBUG][o.e.a.a.i.c.TransportCreateIndexAction] [epcvLK5] timed out while retrying [indices:admin/create] after failure (timeout [30s])
org.elasticsearch.cluster.block.ClusterBlockException: blocked by: [SERVICE_UNAVAILABLE/1/state not recovered / initialized];
at org.elasticsearch.cluster.block.ClusterBlocks.indexBlockedException(ClusterBlocks.java:189) ~[elasticsearch-6.2.3.jar:6.2.3]
at org.elasticsearch.action.admin.indices.create.TransportCreateIndexAction.checkBlock(TransportCreateIndexAction.java:64) ~[elasticsearch-6.2.3.jar:6.2.3]
at org.elasticsearch.action.admin.indices.create.TransportCreateIndexAction.checkBlock(TransportCreateIndexAction.java:39) ~[elasticsearch-6.2.3.jar:6.2.3]
at org.elasticsearch.action.support.master.TransportMasterNodeAction$AsyncSingleAction.doStart(TransportMasterNodeAction.java:135) ~[elasticsearch-6.2.3.jar:6.2.3]
at org.elasticsearch.action.support.master.TransportMasterNodeAction$AsyncSingleAction.start(TransportMasterNodeAction.java:127) ~[elasticsearch-6.2.3.jar:6.2.3]
at org.elasticsearch.action.support.master.TransportMasterNodeAction.doExecute(TransportMasterNodeAction.java:105) ~[elasticsearch-6.2.3.jar:6.2.3]
at org.elasticsearch.action.support.master.TransportMasterNodeAction.doExecute(TransportMasterNodeAction.java:55) ~[elasticsearch-6.2.3.jar:6.2.3]
at org.elasticsearch.action.support.TransportAction$RequestFilterChain.proceed(TransportAction.java:167) ~[elasticsearch-6.2.3.jar:6.2.3]
at org.elasticsearch.action.support.TransportAction.execute(TransportAction.java:139) ~[elasticsearch-6.2.3.jar:6.2.3]
at org.elasticsearch.action.support.HandledTransportAction$TransportHandler.messageReceived(HandledTransportAction.java:79) ~[elasticsearch-6.2.3.jar:6.2.3]
at org.elasticsearch.action.support.HandledTransportAction$TransportHandler.messageReceived(HandledTransportAction.java:69) ~[elasticsearch-6.2.3.jar:6.2.3]
at org.elasticsearch.transport.RequestHandlerRegistry.processMessageReceived(RequestHandlerRegistry.java:66) ~[elasticsearch-6.2.3.jar:6.2.3]
at org.elasticsearch.transport.TcpTransport$RequestHandler.doRun(TcpTransport.java:1555) ~[elasticsearch-6.2.3.jar:6.2.3]
at org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) ~[elasticsearch-6.2.3.jar:6.2.3]
at org.elasticsearch.common.util.concurrent.EsExecutors$1.execute(EsExecutors.java:135) ~[elasticsearch-6.2.3.jar:6.2.3]
at org.elasticsearch.transport.TcpTransport.handleRequest(TcpTransport.java:1512) ~[elasticsearch-6.2.3.jar:6.2.3]
at org.elasticsearch.transport.TcpTransport.messageReceived(TcpTransport.java:1382) ~[elasticsearch-6.2.3.jar:6.2.3]
at org.elasticsearch.transport.netty4.Netty4MessageChannelHandler.channelRead(Netty4MessageChannelHandler.java:64) ~[?:?]
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:362) ~[?:?]
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:348) ~[?:?]
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:340) ~[?:?]
at io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:310) ~[?:?]
at io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:297) ~[?:?]
at io.netty.handler.codec.ByteToMessageDecoder.callDecode(ByteToMessageDecoder.java:413) ~[?:?]
at io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:265) ~[?:?]
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:362) ~[?:?]
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:348) ~[?:?]
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:340) ~[?:?]
at io.netty.handler.logging.LoggingHandler.channelRead(LoggingHandler.java:241) ~[?:?]
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:362) ~[?:?]
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:348) ~[?:?]
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:340) ~[?:?]
at io.netty.handler.ssl.SslHandler.unwrap(SslHandler.java:1336) ~[?:?]
at io.netty.handler.ssl.SslHandler.decodeJdkCompatible(SslHandler.java:1127) ~[?:?]
at io.netty.handler.ssl.SslHandler.decode(SslHandler.java:1162) ~[?:?]
at io.netty.handler.codec.ByteToMessageDecoder.decodeRemovalReentryProtection(ByteToMessageDecoder.java:489) ~[?:?]
at io.netty.handler.codec.ByteToMessageDecoder.callDecode(ByteToMessageDecoder.java:428) ~[?:?]
at io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:265) ~[?:?]
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:362) ~[?:?]
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:348) ~[?:?]
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:340) ~[?:?]
at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1359) ~[?:?]
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:362) ~[?:?]
The cluster has 400 nodes and 20k shards, with a dedicated master instance (16 cores), and it looks to be stuck forever waiting for the cluster state to be recovered. There are no additional gateway.recover* settings configured, and the cluster is unresponsive the whole time.
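For reference, a minimal elasticsearch.yml sketch of the 6.x gateway settings that gate when state recovery starts (none of these are set on this cluster; the node counts below are illustrative assumptions, not recommendations):

    # Illustrative thresholds only -- not set on the cluster described above.
    gateway.expected_nodes: 400       # recovery starts as soon as 400 nodes have joined
    gateway.recover_after_nodes: 350  # otherwise, require at least 350 nodes...
    gateway.recover_after_time: 5m    # ...and start recovery after this delay

With nothing configured, state recovery should normally begin as soon as a master is elected, and progress can usually be followed with curl -s 'localhost:9200/_cluster/health?pretty'. Meanwhile, the logs also show: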
[GJueMRG] discovered [{epcvLK5}{epcvLK51S-2841WxJWTQcg}{N_OZPZ_cSA-Dy5RL7zb1dQ}{10.xxx.xx.xxx}{10.xxx.xx.xxx:9300}{zone=us-east-1b}] which is also master but with an older cluster_state, telling [{epcvLK5}{epcvLK51S-2841WxJWTQcg}{N_OZPZ_cSA-Dy5RL7zb1dQ}{10.xxx.xx.xxx}{10.xxx.xx.xxx:9300}{zone=us-east-1b}] to rejoin the cluster ([node fd ping])
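That last line shows [GJueMRG] and [epcvLK5] each believing it is master, with [epcvLK5] holding an older cluster_state. In 6.x zen discovery, the guard against two concurrent masters is the quorum setting below; a minimal sketch, assuming three dedicated master-eligible nodes (that count is an assumption, not something stated above):

    # Quorum of master-eligible nodes: (master_eligible_nodes / 2) + 1.
    # Assumes 3 dedicated master-eligible nodes; adjust to the real deployment.
    discovery.zen.minimum_master_nodes: 2

If this is set lower than a strict majority of the master-eligible nodes, repeated master elections and rejoin cycles like the one logged above can keep the cluster state from ever being recovered.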