Elastic going down - failed to execute watch _logstash_version_mismatch

I have 3 node cluster, deployed in AWS. The elastic service going down now and then with following exception.

I am using 3 node(each of 3 node acting as data, master, ingest).
Using 6.1.2 version elastic, kibana and x-pack.

[2018-02-02T11:52:41,999][INFO ][o.e.c.r.a.AllocationService] [onepay-es-master3] Cluster health status changed from [YELLOW] to [GREEN] (reason: [shards started [[.monitoring-es-6-2018.02.02][0]] ...]).
[2018-02-02T11:52:47,852][INFO ][o.e.c.m.MetaDataCreateIndexService] [onepay-es-master3] [.monitoring-alerts-6] creating index, cause [auto(bulk api)], templates [.monitoring-alerts], shards [1]/[1], mappings [doc]
[2018-02-02T11:52:47,949][INFO ][o.e.c.m.MetaDataMappingService] [onepay-es-master3] [.watcher-history-7-2018.02.02/iXFiL4hST6GSFq_wUwwxiA] update_mapping [doc]
[2018-02-02T11:52:48,360][INFO ][o.e.c.r.a.AllocationService] [onepay-es-master3] Cluster health status changed from [YELLOW] to [GREEN] (reason: [shards started [[.monitoring-alerts-6][0]] ...]).
[2018-02-02T11:52:48,399][INFO ][o.e.c.m.MetaDataMappingService] [onepay-es-master3] [.watcher-history-7-2018.02.02/iXFiL4hST6GSFq_wUwwxiA] update_mapping [doc]
[2018-02-02T12:00:48,531][WARN ][o.e.x.w.e.ExecutionService] [onepay-es-master3] failed to execute watch [RycnPEM1QUePp-5XkubQJg_logstash_version_mismatch]
[2018-02-02T12:19:42,092][ERROR][o.e.x.m.c.c.ClusterStatsCollector] [onepay-es-master3] collector [cluster_stats] timed out when collecting data
[2018-02-02T12:20:05,609][ERROR][o.e.x.m.c.n.NodeStatsCollector] [onepay-es-master3] collector [node_stats] timed out when collecting data
[2018-02-02T12:21:27,669][WARN ][o.e.x.w.e.ExecutionService] [onepay-es-master3] failed to execute watch [RycnPEM1QUePp-5XkubQJg_elasticsearch_version_mismatch]
[2018-02-02T12:21:27,669][WARN ][o.e.x.w.e.ExecutionService] [onepay-es-master3] failed to execute watch [RycnPEM1QUePp-5XkubQJg_xpack_license_expiration]
[2018-02-02T12:21:53,647][WARN ][o.e.x.w.e.ExecutionService] [onepay-es-master3] failed to execute watch [RycnPEM1QUePp-5XkubQJg_elasticsearch_cluster_status]
[2018-02-02T12:21:53,647][WARN ][o.e.x.w.e.ExecutionService] [onepay-es-master3] failed to execute watch [RycnPEM1QUePp-5XkubQJg_logstash_version_mismatch]
[2018-02-02T12:22:47,931][INFO ][o.e.x.m.j.p.NativeController] Native controller process has stopped - no new native processes can be started

What is the full output of the cluster stats API?

I restarted all nodes and following is the output for cluster stats:

{
"_nodes": {
"total": 3,
"successful": 3,
"failed": 0
},
"cluster_name": "onepay-es-cluster",
"timestamp": 1517578926158,
"status": "green",
"indices": {
"count": 13,
"shards": {
"total": 42,
"primaries": 19,
"replication": 1.2105263157894737,
"index": {
"shards": {
"min": 2,
"max": 10,
"avg": 3.230769230769231
},
"primaries": {
"min": 1,
"max": 5,
"avg": 1.4615384615384615
},
"replication": {
"min": 1,
"max": 2,
"avg": 1.1538461538461537
}
}
},
"docs": {
"count": 23835664,
"deleted": 852
},
"store": {
"size": "13.1gb",
"size_in_bytes": 14121432723
},
"fielddata": {
"memory_size": "11.8kb",
"memory_size_in_bytes": 12168,
"evictions": 0
},
"query_cache": {
"memory_size": "94.9kb",
"memory_size_in_bytes": 97273,
"total_count": 633,
"hit_count": 176,
"miss_count": 457,
"cache_size": 24,
"cache_count": 26,
"evictions": 2
},
"completion": {
"size": "0b",
"size_in_bytes": 0
},
"segments": {
"count": 326,
"memory": "28.4mb",
"memory_in_bytes": 29878902,
"terms_memory": "18.7mb",
"terms_memory_in_bytes": 19695622,
"stored_fields_memory": "6.4mb",
"stored_fields_memory_in_bytes": 6713456,
"term_vectors_memory": "0b",
"term_vectors_memory_in_bytes": 0,
"norms_memory": "198kb",
"norms_memory_in_bytes": 202816,
"points_memory": "2.7mb",
"points_memory_in_bytes": 2932096,
"doc_values_memory": "327kb",
"doc_values_memory_in_bytes": 334912,
"index_writer_memory": "354.1kb",
"index_writer_memory_in_bytes": 362636,
"version_map_memory": "39.8kb",
"version_map_memory_in_bytes": 40814,
"fixed_bit_set": "824b",
"fixed_bit_set_memory_in_bytes": 824,
"max_unsafe_auto_id_timestamp": 1517578238075,
"file_sizes": {}
}
},
"nodes": {
"count": {
"total": 3,
"data": 3,
"coordinating_only": 0,
"master": 3,
"ingest": 3
},
"versions": [
"6.1.2"
],
"os": {
"available_processors": 12,
"allocated_processors": 12,
"names": [
{
"name": "Linux",
"count": 3
}
],
"mem": {
"total": "46.5gb",
"total_in_bytes": 49970884608,
"free": "476.1mb",
"free_in_bytes": 499290112,
"used": "46gb",
"used_in_bytes": 49471594496,
"free_percent": 1,
"used_percent": 99
}
},
"process": {
"cpu": {
"percent": 1
},
"open_file_descriptors": {
"min": 308,
"max": 313,
"avg": 311
}
},
"jvm": {
"max_uptime": "1.8h",
"max_uptime_in_millis": 6665113,
"versions": [
{
"version": "1.8.0_45",
"vm_name": "Java HotSpot(TM) 64-Bit Server VM",
"vm_version": "25.45-b02",
"vm_vendor": "Oracle Corporation",
"count": 3
}
],
"mem": {
"heap_used": "994.2mb",
"heap_used_in_bytes": 1042595560,
"heap_max": "41.9gb",
"heap_max_in_bytes": 44992561152
},
"threads": 241
},
"fs": {
"total": "149.9gb",
"total_in_bytes": 161023463424,
"free": "126.7gb",
"free_in_bytes": 136076742656,
"available": "126.7gb",
"available_in_bytes": 136076742656
},
"plugins": [
{
"name": "repository-s3",
"version": "6.1.2",
"description": "The S3 repository plugin adds S3 repositories",
"classname": "org.elasticsearch.repositories.s3.S3RepositoryPlugin",
"has_native_controller": false,
"requires_keystore": false
},
{
"name": "x-pack",
"version": "6.1.2",
"description": "Elasticsearch Expanded Pack Plugin",
"classname": "org.elasticsearch.xpack.XPackPlugin",
"has_native_controller": true,
"requires_keystore": true
}
],
"network_types": {
"transport_types": {
"security4": 3
},
"http_types": {
"security4": 3
}
}
}
}

What kind of instances are they deployed on? What type of storage?

Is monitoring enabled?

Its T2.xlarge, using 50GB using 50GB root EBS volume. It has 4 cpu and 16GB RAM.
Monitoring enabled. Brought the service up around 8AM. Its down again at 9:15 am.

node2, node3 was down and node1 was up looking for other master to join the cluster. So whole cluster is down. I took backup the logs from node2 and node3.

Node3: error

[2018-02-02T13:51:14,345][INFO ][o.e.d.z.ZenDiscovery ] [onepay-es-master3] master_left [{onepay-es-master2}{_N0_iSumRlaGfXGAU3Mjfg}{HqUxv29ISryNalY9trHDDw}{onepay-es-m2.aws-cb-dqa.cb4good.com}{10.203.152.214:9300}{ml.machine_memory=16656961536, ml.max_open_jobs=20, ml.enabled=true}], reason [transport disconnected]
[2018-02-02T13:51:14,346][WARN ][o.e.d.z.ZenDiscovery ] [onepay-es-master3] master left (reason = transport disconnected), current nodes: nodes:
{onepay-es-master3}{AtigueY0QGWp1ktUbYnmLA}{Xvt4SerWSgqVYzgSzkB_pA}{onepay-es-m3.aws-cb-dqa.cb4good.com}{10.203.156.247:9300}{ml.machine_memory=16656961536, ml.max_open_jobs=20, ml.enabled=true}, local
{onepay-es-master2}{_N0_iSumRlaGfXGAU3Mjfg}{HqUxv29ISryNalY9trHDDw}{onepay-es-m2.aws-cb-dqa.cb4good.com}{10.203.152.214:9300}{ml.machine_memory=16656961536, ml.max_open_jobs=20, ml.enabled=true}, master
{onepay-es-master1}{dxuKRkzuRlqDuGv_NQKaCw}{Crkxak1HRV2fj6TOx8q58w}{onepay-es-m1.aws-cb-dqa.cb4good.com}{10.203.155.219:9300}{ml.machine_memory=16656961536, ml.max_open_jobs=20, ml.enabled=true}

[2018-02-02T13:51:14,355][WARN ][o.e.c.NodeConnectionsService] [onepay-es-master3] failed to connect to node {onepay-es-master2}{_N0_iSumRlaGfXGAU3Mjfg}{HqUxv29ISryNalY9trHDDw}{onepay-es-m2.aws-cb-dqa.cb4good.com}{10.203.152.214:9300}{ml.machine_memory=16656961536, ml.max_open_jobs=20, ml.enabled=true} (tried [1] times)
org.elasticsearch.transport.ConnectTransportException: [onepay-es-master2][10.203.152.214:9300] connect_exception
at org.elasticsearch.transport.TcpChannel.awaitConnected(TcpChannel.java:165) ~[elasticsearch-6.1.2.jar:6.1.2]
at org.elasticsearch.transport.TcpTransport.openConnection(TcpTransport.java:615) ~[elasticsearch-6.1.2.jar:6.1.2]
at org.elasticsearch.transport.TcpTransport.connectToNode(TcpTransport.java:513) ~[elasticsearch-6.1.2.jar:6.1.2]
at org.elasticsearch.transport.TransportService.connectToNode(TransportService.java:327) ~[elasticsearch-6.1.2.jar:6.1.2]
at org.elasticsearch.transport.TransportService.connectToNode(TransportService.java:314) ~[elasticsearch-6.1.2.jar:6.1.2]
at org.elasticsearch.cluster.NodeConnectionsService.validateAndConnectIfNeeded(NodeConnectionsService.java:154) [elasticsearch-6.1.2.jar:6.1.2]
at org.elasticsearch.cluster.NodeConnectionsService$1.doRun(NodeConnectionsService.java:107) [elasticsearch-6.1.2.jar:6.1.2]
at org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:637) [elasticsearch-6.1.2.jar:6.1.2]
at org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) [elasticsearch-6.1.2.jar:6.1.2]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) [?:1.8.0_45]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) [?:1.8.0_45]
at java.lang.Thread.run(Thread.java:745) [?:1.8.0_45]
Caused by: io.netty.channel.AbstractChannel$AnnotatedConnectException: Connection refused: onepay-es-m2.aws-cb-dqa.cb4good.com/10.203.152.214:9300
at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method) ~[?:?]
at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:717) ~[?:?]
at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:352) ~[?:?]
at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:340) ~[?:?]
at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:632) ~[?:?]
at io.netty.channel.nio.NioEventLoop.processSelectedKeysPlain(NioEventLoop.java:544) ~[?:?]
at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:498) ~[?:?]
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:458) ~[?:?]
at io.netty.util.concurrent.SingleThreadEventExecutor$5.run(SingleThreadEventExecutor.java:858) ~[?:?]
... 1 more
Caused by: java.net.ConnectException: Connection refused
at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method) ~[?:?]
at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:717) ~[?:?]
at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:352) ~[?:?]
at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:340) ~[?:?]
at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:632) ~[?:?]
at io.netty.channel.nio.NioEventLoop.processSelectedKeysPlain(NioEventLoop.java:544) ~[?:?]
at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:498) ~[?:?]
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:458) ~[?:?]
at io.netty.util.concurrent.SingleThreadEventExecutor$5.run(SingleThreadEventExecutor.java:858) ~[?:?]
... 1 more
[2018-02-02T13:51:14,362][WARN ][o.e.c.s.ClusterApplierService] [onepay-es-master3] failed to notify ClusterStateListener
java.lang.NullPointerException: null
at org.elasticsearch.xpack.watcher.support.WatcherIndexTemplateRegistry.clusterChanged(WatcherIndexTemplateRegistry.java:100) ~[?:?]
at org.elasticsearch.cluster.service.ClusterApplierService.lambda$callClusterStateListeners$7(ClusterApplierService.java:506) [elasticsearch-6.1.2.jar:6.1.2]
at org.elasticsearch.cluster.service.ClusterApplierService$$Lambda$2287/1316524301.accept(Unknown Source) [elasticsearch-6.1.2.jar:6.1.2]
at java.util.Spliterators$ArraySpliterator.forEachRemaining(Spliterators.java:948) [?:1.8.0_45]
at java.util.stream.Streams$ConcatSpliterator.forEachRemaining(Streams.java:742) [?:1.8.0_45]
at java.util.stream.ReferencePipeline$Head.forEach(ReferencePipeline.java:580) [?:1.8.0_45]
at org.elasticsearch.cluster.service.ClusterApplierService.callClusterStateListeners(ClusterApplierService.java:503) [elasticsearch-6.1.2.jar:6.1.2]
at org.elasticsearch.cluster.service.ClusterApplierService.applyChanges(ClusterApplierService.java:486) [elasticsearch-6.1.2.jar:6.1.2]
at org.elasticsearch.cluster.service.ClusterApplierService.runTask(ClusterApplierService.java:429) [elasticsearch-6.1.2.jar:6.1.2]
at org.elasticsearch.cluster.service.ClusterApplierService$UpdateTask.run(ClusterApplierService.java:158) [elasticsearch-6.1.2.jar:6.1.2]
at org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingRunnable.run(ThreadContext.java:568) [elasticsearch-6.1.2.jar:6.1.2]
at org.elasticsearch.common.util.concurrent.PrioritizedEsThreadPoolExecutor$TieBreakingPrioritizedRunnable.runAndClean(PrioritizedEsThreadPoolExecutor.java:247) [elasticsearch-6.1.2.jar:6.1.2]

Node2 Error

[2018-02-02T13:32:14,687][INFO ][o.e.c.m.MetaDataMappingService] [onepay-es-master2] [.watcher-history-7-2018.02.02/iXFiL4hST6GSFq_wUwwxiA] update_mapping [doc]

Based on the stats you provided, it seems like you have set the heap > 50% of available RAM. I would recommend changing this to give Elasticsearch enough file system cache to work with.

I have also never deployed Elasticsearch on t2 instance types, so am not sure how suitable they are due to the burstable performance. I think it is much more common to use m4/m5 instances, so it might be worthwhile switching to see if that helps.

Thanks, I will change it to M4.xlarge. It has 16GB memory.

How much you suggest to give elasticsearch? I gave 14 GB with previous configuration.

8GB (50%), assuming there is nothing else running on the host.

Christian, I have upgraded to M4.xlarge and assigned 8 GB memory. Now cluster looks stable(Thanks for that :slight_smile: ). I am getting following warning. From console I can see incense expires March 4, 2018. Our Prod subscription will start from 14th Feb.

[2018-02-03T08:04:39,862][WARN ][o.e.x.w.e.ExecutionService] [onepay-es-master3] failed to execute watch [b9fmMTipRZiP48F150M0ew_elasticsearch_version_mismatch]
[2018-02-03T09:04:39,808][WARN ][o.e.x.w.e.ExecutionService] [onepay-es-master3] failed to execute watch [b9fmMTipRZiP48F150M0ew_xpack_license_expiration]
[2018-02-03T10:21:40,014][WARN ][o.e.x.w.e.ExecutionService] [onepay-es-master3] failed to execute watch [b9fmMTipRZiP48F150M0ew_xpack_license_expiration]
[2018-02-03T10:36:39,932][WARN ][o.e.x.w.e.ExecutionService] [onepay-es-master3] failed to execute watch [b9fmMTipRZiP48F150M0ew_xpack_license_expiration]
[2018-02-03T11:53:40,000][WARN ][o.e.x.w.e.ExecutionService] [onepay-es-master3] failed to execute watch [b9fmMTipRZiP48F150M0ew_logstash_version_mismatch]

Cluster stat as follows:

{
"_nodes": {
"total": 3,
"successful": 3,
"failed": 0
},
"cluster_name": "onepay-es-cluster",
"timestamp": 1517665949336,
"status": "yellow",
"indices": {
"count": 14,
"shards": {
"total": 48,
"primaries": 22,
"replication": 1.1818181818181819,
"index": {
"shards": {
"min": 2,
"max": 10,
"avg": 3.4285714285714284
},
"primaries": {
"min": 1,
"max": 5,
"avg": 1.5714285714285714
},
"replication": {
"min": 1,
"max": 2,
"avg": 1.1428571428571428
}
}
},
"docs": {
"count": 23875045,
"deleted": 447
},
"store": {
"size": "8.8gb",
"size_in_bytes": 9513897678
},
"fielddata": {
"memory_size": "12.2kb",
"memory_size_in_bytes": 12544,
"evictions": 0
},
"query_cache": {
"memory_size": "340.2kb",
"memory_size_in_bytes": 348432,
"total_count": 18147,
"hit_count": 7713,
"miss_count": 10434,
"cache_size": 271,
"cache_count": 1745,
"evictions": 1474
},
"completion": {
"size": "0b",
"size_in_bytes": 0
},
"segments": {
"count": 273,
"memory": "19.7mb",
"memory_in_bytes": 20711657,
"terms_memory": "12.9mb",
"terms_memory_in_bytes": 13590057,
"stored_fields_memory": "4.3mb",
"stored_fields_memory_in_bytes": 4523896,
"term_vectors_memory": "0b",
"term_vectors_memory_in_bytes": 0,
"norms_memory": "168.2kb",
"norms_memory_in_bytes": 172288,
"points_memory": "1.8mb",
"points_memory_in_bytes": 1980772,
"doc_values_memory": "434.2kb",
"doc_values_memory_in_bytes": 444644,
"index_writer_memory": "379.1kb",
"index_writer_memory_in_bytes": 388292,
"version_map_memory": "48.5kb",
"version_map_memory_in_bytes": 49680,
"fixed_bit_set": "1.8kb",
"fixed_bit_set_memory_in_bytes": 1904,
"max_unsafe_auto_id_timestamp": 1517664777669,
"file_sizes": {}
}
},
"nodes": {
"count": {
"total": 3,
"data": 3,
"coordinating_only": 0,
"master": 3,
"ingest": 3
},
"versions": [
"6.1.2"
],
"os": {
"available_processors": 12,
"allocated_processors": 12,
"names": [
{
"name": "Linux",
"count": 3
}
],
"mem": {
"total": "46.5gb",
"total_in_bytes": 49970884608,
"free": "3.6gb",
"free_in_bytes": 3969036288,
"used": "42.8gb",
"used_in_bytes": 46001848320,
"free_percent": 8,
"used_percent": 92
}
},
"process": {
"cpu": {
"percent": 6
},
"open_file_descriptors": {
"min": 303,
"max": 345,
"avg": 330
}
},
"jvm": {
"max_uptime": "16.6h",
"max_uptime_in_millis": 59931207,
"versions": [
{
"version": "1.8.0_45",
"vm_name": "Java HotSpot(TM) 64-Bit Server VM",
"vm_version": "25.45-b02",
"vm_vendor": "Oracle Corporation",
"count": 3
}
],
"mem": {
"heap_used": "1.3gb",
"heap_used_in_bytes": 1415763072,
"heap_max": "23.9gb",
"heap_max_in_bytes": 25665208320
},
"threads": 258
},
"fs": {
"total": "149.9gb",
"total_in_bytes": 161023463424,
"free": "129gb",
"free_in_bytes": 138518507520,
"available": "129gb",
"available_in_bytes": 138518507520
},
"plugins": [
{
"name": "repository-s3",
"version": "6.1.2",
"description": "The S3 repository plugin adds S3 repositories",
"classname": "org.elasticsearch.repositories.s3.S3RepositoryPlugin",
"has_native_controller": false,
"requires_keystore": false
},
{
"name": "x-pack",
"version": "6.1.2",
"description": "Elasticsearch Expanded Pack Plugin",
"classname": "org.elasticsearch.xpack.XPackPlugin",
"has_native_controller": true,
"requires_keystore": true
}
],
"network_types": {
"transport_types": {
"security4": 3
},
"http_types": {
"security4": 3
}
}
}
}

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.