After deleting the Elasticsearch pod, it is not able to come back up (Ready state stays 0/1).
Cluster health is yellow.
fluentd is not able to connect to ES.
elasticsearch@elasticsearch-master-0:~$ curl -XGET 'http://localhost:9200/_cluster/health?pretty=true'
{
"cluster_name" : "elasticsearch",
"status" : "yellow",
"timed_out" : false,
"number_of_nodes" : 1,
"number_of_data_nodes" : 1,
"active_primary_shards" : 15,
"active_shards" : 15,
"relocating_shards" : 0,
"initializing_shards" : 0,
"unassigned_shards" : 7,
"delayed_unassigned_shards" : 0,
"number_of_pending_tasks" : 0,
"number_of_in_flight_fetch" : 0,
"task_max_waiting_in_queue_millis" : 0,
"active_shards_percent_as_number" : 68.18181818181817
}
elasticsearch@elasticsearch-master-0:~$
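Which shards are unassigned, and why, can be listed with the standard _cat/shards API from inside the pod, for example:

elasticsearch@elasticsearch-master-0:~$ curl -XGET 'http://localhost:9200/_cat/shards?v&h=index,shard,prirep,state,unassigned.reason'

On a single data node the unassigned entries are the replica shards (prirep = r); a replica can never be allocated to the same node as its primary.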
elasticsearch-master-0             0/1   Running   0   40m
fluentd-lbzgp                      1/1   Running   0   56m
kibana-kibana-5988749c45-cwmwc     0/1   Running   0   70m
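The readiness failures themselves show up in the pod's events and in the Elasticsearch log (plain kubectl, nothing chart-specific):

kubectl describe pod elasticsearch-master-0    # look for "Readiness probe failed" events
kubectl logs elasticsearch-master-0            # Elasticsearch startup log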
<elastic.yaml>
apiVersion: apps/v1
kind: StatefulSet
metadata:
  annotations:
    meta.helm.sh/release-name: efk
    meta.helm.sh/release-namespace: default
  creationTimestamp: "2022-07-27T05:53:38Z"
  generation: 1
  labels:
    app: elasticsearch-master
    app.kubernetes.io/managed-by: Helm
  name: elasticsearch-master
  namespace: default
  resourceVersion: "17974675"
  uid: ef543fee-faf2-4516-8695-4e330438268a
spec:
  podManagementPolicy: OrderedReady
  replicas: 1
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app: elasticsearch-master
  serviceName: elasticsearch-master-headless
  template:
    metadata:
      creationTimestamp: null
      labels:
        app: elasticsearch-master
    spec:
      containers:
      - env:
        - name: node.name
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: metadata.name
        - name: discovery.seed_hosts
          value: elasticsearch-master-headless
        - name: cluster.name
          value: elasticsearch
        - name: network.host
          value: 0.0.0.0
        - name: cluster.deprecation_indexing.enabled
          value: "false"
        - name: ES_JAVA_OPTS
          value: -Xmx512m -Xms512m
        - name: node.roles
          value: master,data,data_content,data_hot,data_warm,data_cold,ingest,ml,remote_cluster_client,transform,
        - name: discovery.type
          value: single-node
        - name: ELASTIC_PASSWORD
          valueFrom:
            secretKeyRef:
              key: password
              name: elasticsearch-credentials
        - name: xpack.security.enabled
          value: "false"
        image: artifactory.radisys.com:8093/vdu.rsys/efk/elasticsearch:7.17.3
        imagePullPolicy: IfNotPresent
        name: elasticsearch
        ports:
        - containerPort: 9200
          name: http
          protocol: TCP
        - containerPort: 9300
          name: transport
          protocol: TCP
        readinessProbe:
          exec:
            command:
            - bash
            - -c
            - |
              set -e
              # If the node is starting up wait for the cluster to be ready (request params: "wait_for_status=green&timeout=1s" )
              # Once it has started only check that the node itself is responding
              START_FILE=/tmp/.es_start_file
              # Disable nss cache to avoid filling dentry cache when calling curl
              # This is required with Elasticsearch Docker using nss < 3.52
              export NSS_SDB_USE_CACHE=no
              http () {
                local path="${1}"
                local args="${2}"
                set -- -XGET -s
                if [ "$args" != "" ]; then
                  set -- "$@" $args
                fi
                if [ -n "${ELASTIC_PASSWORD}" ]; then
                  set -- "$@" -u "elastic:${ELASTIC_PASSWORD}"
                fi
                curl --output /dev/null -k "$@" "http://127.0.0.1:9200${path}"
              }
              if [ -f "${START_FILE}" ]; then
                echo 'Elasticsearch is already running, lets check the node is healthy'
                HTTP_CODE=$(http "/" "-w %{http_code}")
                RC=$?
                if [[ ${RC} -ne 0 ]]; then
                  echo "curl --output /dev/null -k -XGET -s -w '%{http_code}' \${BASIC_AUTH} http://127.0.0.1:9200/ failed with RC ${RC}"
                  exit ${RC}
                fi
                # ready if HTTP code 200, 503 is tolerable if ES version is 6.x
                if [[ ${HTTP_CODE} == "200" ]]; then
                  exit 0
                elif [[ ${HTTP_CODE} == "503" && "7" == "6" ]]; then
                  exit 0
                else
                  echo "curl --output /dev/null -k -XGET -s -w '%{http_code}' \${BASIC_AUTH} http://127.0.0.1:9200/ failed with HTTP code ${HTTP_CODE}"
                  exit 1
                fi
              else
                echo 'Waiting for elasticsearch cluster to become ready (request params: "wait_for_status=green&timeout=1s" )'
                if http "/_cluster/health?wait_for_status=green&timeout=1s" "--fail" ; then
                  touch ${START_FILE}
                  exit 0
                else
                  echo 'Cluster is not yet ready (request params: "wait_for_status=green&timeout=1s" )'
                  exit 1
                fi
              fi
          failureThreshold: 3
          initialDelaySeconds: 10
          periodSeconds: 10
          successThreshold: 3
          timeoutSeconds: 5
        resources:
          limits:
            cpu: "8"
            memory: 8Gi
          requests:
            cpu: "1"
            memory: 5Gi
        securityContext:
          runAsNonRoot: true
          runAsUser: 1000
        terminationMessagePath: /dev/termination-log
        terminationMessagePolicy: File
        volumeMounts:
        - mountPath: /usr/share/elasticsearch/data
          name: elasticsearch-storage
      dnsPolicy: ClusterFirst
      initContainers:
      - command:
        - chown
        - -R
        - 1000:1000
        - /usr/share/elasticsearch/
        image: artifactory.radisys.com:8093/vdu.rsys/efk/busybox:1.34.1
        imagePullPolicy: IfNotPresent
        name: init-volume-permission
        resources: {}
        securityContext:
          privileged: true
          runAsUser: 0
        terminationMessagePath: /dev/termination-log
        terminationMessagePolicy: File
        volumeMounts:
        - mountPath: /usr/share/elasticsearch/data
          name: elasticsearch-storage
      restartPolicy: Always
      schedulerName: default-scheduler
      securityContext: {}
      terminationGracePeriodSeconds: 30
      volumes:
      - name: elasticsearch-storage
        persistentVolumeClaim:
          claimName: elasticsearch-pvc
  updateStrategy:
    rollingUpdate:
      partition: 0
    type: RollingUpdate
status:
  collisionCount: 0
  currentReplicas: 1
  currentRevision: elasticsearch-master-5cf649ffb6
  observedGeneration: 1
  replicas: 1
  updateRevision: elasticsearch-master-5cf649ffb6
  updatedReplicas: 1
</elastic.yaml>
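One thing to note from the manifest: the readiness probe only creates /tmp/.es_start_file after /_cluster/health?wait_for_status=green&timeout=1s succeeds, and /tmp does not survive a pod restart, so a restarted pod is stuck in the "wait for green" branch for as long as the cluster stays yellow. If this StatefulSet was rendered from the elastic/elasticsearch Helm chart (an assumption based on the labels above), that request string comes from the clusterHealthCheckParams value; a minimal override sketch would look roughly like this:

<values-override.yaml>
# Sketch of a Helm values override, assuming the elastic/elasticsearch chart.
# On a single-node cluster the probe can never see "green", so wait for "yellow".
clusterHealthCheckParams: "wait_for_status=yellow&timeout=1s"
</values-override.yaml>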
[root@node1 containers]# kubectl get sc
NAME              PROVISIONER                    RECLAIMPOLICY   VOLUMEBINDINGMODE      ALLOWVOLUMEEXPANSION   AGE
elastic-storage   kubernetes.io/no-provisioner   Retain          WaitForFirstConsumer   false                  59m
[root@node1 containers]# kubectl get pv
NAME               CAPACITY   ACCESS MODES   RECLAIM POLICY   STATUS   CLAIM                        STORAGECLASS      REASON   AGE
du-log-volume-pv   10Gi       RWO            Retain           Bound    default/du-log-volume-pvc                              41m
elasticsearch-pv   30Gi       RWO            Retain           Bound    default/elasticsearch-pvc   elastic-storage            59m
[root@node1 containers]# kubectl get pvc
NAME                STATUS   VOLUME             CAPACITY   ACCESS MODES   STORAGECLASS      AGE
du-log-volume-pvc   Bound    du-log-volume-pv   10Gi       RWO                              41m
elasticsearch-pvc   Bound    elasticsearch-pv   30Gi       RWO            elastic-storage   59m
[root@node1 containers]#
This is the PV hostPath:
[root@node1 data]# cd /mnt/elastic/data
[root@node1 data]# ls -lrt
total 0
drwxrwxr-x 3 labadmin labadmin 15 Jul 27 05:53 nodes
[root@node1 data]# cd nodes/0/
[root@node1 0]# ls -lrt
total 8
-rw-rw-r-- 1 labadmin labadmin 0 Jul 27 05:53 node.lock
drwxrwxr-x 17 labadmin labadmin 4096 Jul 27 06:06 indices
drwxrwxr-x 2 labadmin labadmin 42 Jul 27 06:24 snapshot_cache
drwxrwxr-x 2 labadmin labadmin 4096 Jul 27 06:24 _state
[root@node1 0]#
I can see some data here.
Final observations:
- If ES cluster health is green and I restart ES (by deleting the pod), it comes back up fine.
- If ES cluster health is yellow (single node, so health turns yellow as soon as fluentd starts pushing data and the new indices' replica shards cannot be assigned) and I restart ES (by deleting the pod), it does NOT come back up. See the sketch below.
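To illustrate why the cluster goes yellow: on a single data node, the replica shards that new indices get by default can never be allocated, so health cannot return to green. A minimal sketch of dropping replicas with the standard index settings and index template APIs (the template name and the catch-all "*" pattern are just examples; in practice the pattern should be scoped to the fluentd indices):

curl -XPUT 'http://localhost:9200/_all/_settings' -H 'Content-Type: application/json' \
  -d '{"index":{"number_of_replicas":0}}'
curl -XPUT 'http://localhost:9200/_index_template/single-node-no-replicas' -H 'Content-Type: application/json' \
  -d '{"index_patterns":["*"],"template":{"settings":{"number_of_replicas":0}}}'

With zero replicas the cluster can report green again, which matters here because the readiness probe waits for green.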
What can I do to ensure ES comes back up after a restart?