I have the following configuration for a multi-node Elasticsearch cluster. Both nodes run as services in a Docker Swarm. Here is the Docker YAML config:
# Master-eligible Elasticsearch node (node.name=ehr-master) of the swarm stack.
elasticsearch:
  # Quoted: a plain scalar that starts with "{" is parsed as a YAML flow mapping.
  image: "{imagerepo}/elasticsearch:0.0.7-6892bb5"
  healthcheck:
    # "$$" escapes "$" for Compose so the command substitution runs inside the
    # container, reading the elastic user's password from the mounted secret.
    test: >-
      curl --fail -s --cacert /run/secrets/elasticsearch-ca.crt
      -u elastic:$$(cat /run/secrets/elasticsearch.password)
      https://localhost:9200/_cat/health?v || exit 1
    interval: 30s
    retries: 12
    start_period: 15s
    timeout: 30s
  environment:
    - node.master=true
    - node.name=ehr-master
    - cluster.initial_master_nodes=ehr-master
    # Pin the address this node advertises on the transport layer. Without it
    # the node published 10.0.0.6:9300, which is outside the 10.0.1.x range the
    # other node uses, so the connection made after the discovery handshake
    # timed out (connect_timeout[30s]).
    # NOTE(review): assumes the service name "elasticsearch" resolves to the
    # esnetwork address discovery already reaches (10.0.1.7) - confirm from
    # inside the container. Presumably 10.0.0.6 belongs to the swarm ingress
    # network attached because of the published port below - verify.
    - transport.publish_host=elasticsearch
    - xpack.security.enabled=true
    - xpack.security.http.ssl.supported_protocols=TLSv1.2
    - xpack.security.transport.ssl.supported_protocols=TLSv1.2
    - xpack.security.http.ssl.enabled=true
    - xpack.security.transport.ssl.enabled=true
    - xpack.security.transport.ssl.verification_mode=certificate
    - xpack.security.transport.ssl.client_authentication=required
    # One PKCS#12 file is used as keystore and truststore for both layers.
    - xpack.security.transport.ssl.keystore.path=/usr/share/elasticsearch/config/certificates/elasticsearch.p12
    - xpack.security.transport.ssl.truststore.path=/usr/share/elasticsearch/config/certificates/elasticsearch.p12
    - xpack.security.http.ssl.keystore.path=/usr/share/elasticsearch/config/certificates/elasticsearch.p12
    - xpack.security.http.ssl.truststore.path=/usr/share/elasticsearch/config/certificates/elasticsearch.p12
  deploy:
    placement:
      constraints:
        - node.labels.type != web
    restart_policy:
      condition: any
    replicas: 1
  networks:
    - esnetwork
  ports:
    # Only the HTTP port is published; transport (9300) stays on esnetwork.
    - "9200:9200"
  volumes:
    - /var/elasticsearch:/usr/share/elasticsearch/data
  secrets:
    - elasticsearch.environment.java-opts
    - elasticsearch.password
    - source: elasticsearch.ssl.keystore.file
      target: /usr/share/elasticsearch/config/certificates/elasticsearch.p12
    - source: elasticsearch.ssl.keystore.password
      target: /usr/share/elasticsearch/config/certificates/elasticsearch.ssl.keystore.password
    # Relative target: the healthcheck reads it at /run/secrets/elasticsearch-ca.crt.
    - source: elasticsearch-ca.crt.file
      target: elasticsearch-ca.crt
# Second Elasticsearch node (node.name=ehr-slave1), not master-eligible.
elasticsearch1:
  # Fixed: the original line had no space after "image:" and repeated the
  # repository/tag segment. Now the same image as the "elasticsearch" service.
  # Quoted: a plain scalar that starts with "{" is parsed as a YAML flow mapping.
  image: "{imagerepo}/elasticsearch:0.0.7-6892bb5"
  healthcheck:
    # "$$" escapes "$" for Compose so the command substitution runs inside the
    # container, reading the elastic user's password from the mounted secret.
    test: >-
      curl --fail -s --cacert /run/secrets/elasticsearch-ca.crt
      -u elastic:$$(cat /run/secrets/elasticsearch.password)
      https://localhost:9200/_cat/health?v || exit 1
    interval: 30s
    retries: 12
    start_period: 15s
    timeout: 30s
  environment:
    - node.master=false
    - node.name=ehr-slave1
    - cluster.initial_master_nodes=ehr-master
    # Discovery seeds from the sibling service by its swarm service name.
    - discovery.seed_hosts=elasticsearch
    - xpack.security.enabled=true
    - xpack.security.http.ssl.enabled=true
    - xpack.security.transport.ssl.enabled=true
    - xpack.security.http.ssl.supported_protocols=TLSv1.2
    - xpack.security.transport.ssl.supported_protocols=TLSv1.2
    - xpack.security.transport.ssl.verification_mode=certificate
    - xpack.security.transport.ssl.client_authentication=required
    # One PKCS#12 file is used as keystore and truststore for both layers.
    - xpack.security.transport.ssl.keystore.path=/usr/share/elasticsearch/config/certificates/elasticsearch1.p12
    - xpack.security.transport.ssl.truststore.path=/usr/share/elasticsearch/config/certificates/elasticsearch1.p12
    - xpack.security.http.ssl.keystore.path=/usr/share/elasticsearch/config/certificates/elasticsearch1.p12
    - xpack.security.http.ssl.truststore.path=/usr/share/elasticsearch/config/certificates/elasticsearch1.p12
  deploy:
    placement:
      constraints:
        - node.labels.type != web
    restart_policy:
      condition: any
    replicas: 1
  networks:
    - esnetwork
  volumes:
    - /var/elasticsearch1:/usr/share/elasticsearch/data
  secrets:
    - elasticsearch.environment.java-opts
    - elasticsearch.password
    - source: elasticsearch1.ssl.keystore.file
      target: /usr/share/elasticsearch/config/certificates/elasticsearch1.p12
    # NOTE(review): the target keeps the un-numbered file name used by the
    # other service; presumably the image reads a fixed path - confirm.
    - source: elasticsearch1.ssl.keystore.password
      target: /usr/share/elasticsearch/config/certificates/elasticsearch.ssl.keystore.password
    # Relative target: the healthcheck reads it at /run/secrets/elasticsearch-ca.crt.
    - source: elasticsearch-ca.crt.file
      target: elasticsearch-ca.crt
elasticsearch1 is failing to connect to the master node on port 9300, and I am not sure why. The certificates are correct; the connection just keeps timing out. Is it a container issue? Here are the logs from elasticsearch1:
elk_elasticsearch1.1.fgvyvgm4wyuh@ehrelk-rh7s01 | {"type": "server", "timestamp": "2021-02-22T03:58:50,545Z", "level": "DEBUG", "component": "o.e.a.s.m.TransportMasterNodeAction", "cluster.name": "docker-cluster", "node.name": "ehr-slave1", "message": "timed out while retrying [cluster:monitor/health] after failure (timeout [30s])" }
elk_elasticsearch1.1.fgvyvgm4wyuh@ehrelk-rh7s01 | {"type": "server", "timestamp": "2021-02-22T03:58:50,545Z", "level": "WARN", "component": "r.suppressed", "cluster.name": "docker-cluster", "node.name": "ehr-slave1", "message": "path: /_cat/health, params: {v=}",
elk_elasticsearch1.1.fgvyvgm4wyuh@ehrelk-rh7s01 | "stacktrace": ["org.elasticsearch.discovery.MasterNotDiscoveredException: null",
elk_elasticsearch1.1.fgvyvgm4wyuh@ehrelk-rh7s01 | "at org.elasticsearch.action.support.master.TransportMasterNodeAction$AsyncSingleAction$2.onTimeout(TransportMasterNodeAction.java:220) [elasticsearch-7.8.1.jar:7.8.1]",
elk_elasticsearch1.1.fgvyvgm4wyuh@ehrelk-rh7s01 | "at org.elasticsearch.cluster.ClusterStateObserver$ContextPreservingListener.onTimeout(ClusterStateObserver.java:325) [elasticsearch-7.8.1.jar:7.8.1]",
elk_elasticsearch1.1.fgvyvgm4wyuh@ehrelk-rh7s01 | "at org.elasticsearch.cluster.ClusterStateObserver$ObserverClusterStateListener.onTimeout(ClusterStateObserver.java:252) [elasticsearch-7.8.1.jar:7.8.1]",
elk_elasticsearch1.1.fgvyvgm4wyuh@ehrelk-rh7s01 | "at org.elasticsearch.cluster.service.ClusterApplierService$NotifyTimeout.run(ClusterApplierService.java:598) [elasticsearch-7.8.1.jar:7.8.1]",
elk_elasticsearch1.1.fgvyvgm4wyuh@ehrelk-rh7s01 | "at org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingRunnable.run(ThreadContext.java:636) [elasticsearch-7.8.1.jar:7.8.1]",
elk_elasticsearch1.1.fgvyvgm4wyuh@ehrelk-rh7s01 | "at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1130) [?:?]",
elk_elasticsearch1.1.fgvyvgm4wyuh@ehrelk-rh7s01 | "at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:630) [?:?]",
elk_elasticsearch1.1.fgvyvgm4wyuh@ehrelk-rh7s01 | "at java.lang.Thread.run(Thread.java:832) [?:?]"] }
elk_elasticsearch1.1.fgvyvgm4wyuh@ehrelk-rh7s01 | {"type": "server", "timestamp": "2021-02-22T03:58:53,435Z", "level": "WARN", "component": "o.e.d.HandshakingTransportAddressConnector", "cluster.name": "docker-cluster", "node.name": "ehr-slave1", "message": "[connectToRemoteMasterNode[10.0.1.7:9300]] completed handshake with [{ehr-master}{UwnmFOjiQeq8qxloXbii4A}{Bmfd38NTTlK8nbYISVuuTA}{10.0.0.6}{10.0.0.6:9300}{dilmrt}{ml.machine_memory=16656900096, ml.max_open_jobs=20, xpack.installed=true, transform.node=true}] but followup connection failed",
elk_elasticsearch1.1.fgvyvgm4wyuh@ehrelk-rh7s01 | "stacktrace": ["org.elasticsearch.transport.ConnectTransportException: [ehr-master][10.0.0.6:9300] connect_timeout[30s]",
elk_elasticsearch1.1.fgvyvgm4wyuh@ehrelk-rh7s01 | "at org.elasticsearch.transport.TcpTransport$ChannelsConnectedListener.onTimeout(TcpTransport.java:972) ~[elasticsearch-7.8.1.jar:7.8.1]",
elk_elasticsearch1.1.fgvyvgm4wyuh@ehrelk-rh7s01 | "at org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingRunnable.run(ThreadContext.java:636) ~[elasticsearch-7.8.1.jar:7.8.1]",
elk_elasticsearch1.1.fgvyvgm4wyuh@ehrelk-rh7s01 | "at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1130) [?:?]",
elk_elasticsearch1.1.fgvyvgm4wyuh@ehrelk-rh7s01 | "at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:630) [?:?]",
elk_elasticsearch1.1.fgvyvgm4wyuh@ehrelk-rh7s01 | "at java.lang.Thread.run(Thread.java:832) [?:?]"] }
elk_elasticsearch1.1.fgvyvgm4wyuh@ehrelk-rh7s01 | {"type": "server", "timestamp": "2021-02-22T03:58:54,354Z", "level": "WARN", "component": "o.e.c.c.ClusterFormationFailureHelper", "cluster.name": "docker-cluster", "node.name": "ehr-slave1", "message": "master not discovered yet: have discovered [{ehr-slave1}{7gnA3Q4EQT6gxPFilXcXXQ}{w4ShHyK-Rbi8anUONHuJLQ}{10.0.1.9}{10.0.1.9:9300}{dilrt}{ml.machine_memory=16656900096, xpack.installed=true, transform.node=true, ml.max_open_jobs=20}]; discovery will continue using [10.0.1.7:9300] from hosts providers and [] from last-known cluster state; node term 57, last-accepted version 66297 in term 57" }