I have Metricbeat installed using ECK on my EKS cluster. It seems to be crashing quite frequently; the error I have spotted is the following:
I0608 07:45:51.571964 8 trace.go:205] Trace[136023862]: "DeltaFIFO Pop Process" ID:elastic-system,Depth:11,Reason:slow event handlers blocking the queue (08-Jun-2022 07:45:51.170) (total time: 401ms):
Trace[136023862]: [401.696877ms] [401.696877ms] END
-
What does the above mean, and how can I resolve this issue?
-
As an aside, I also see the following warning logs. Is there some way to disable this warning? It appears very often:
{"log.level":"warn","@timestamp":"2022-06-08T07:45:48.867Z","log.logger":"tls","log.origin":{"file.name":"tlscommon/tls_config.go","file.line":105},"message":"SSL/TLS verifications disabled.","service.name":"metricbeat","ecs.version":"1.6.0"}
metricbeat.yaml
# Metricbeat managed by ECK, deployed as a DaemonSet (one pod per node).
# It collects host, Docker and Kubernetes metrics and, through autodiscover
# templates, Elastic Stack monitoring data from pods labelled for scraping.
apiVersion: beat.k8s.elastic.co/v1beta1
kind: Beat
metadata:
  name: metricbeat
  namespace: elastic-system
spec:
  type: metricbeat
  version: 8.2.0
  elasticsearchRef:
    name: elasticsearch-prod
  kibanaRef:
    name: kibana-prod
  config:
    # Enables the Beat's own HTTP stats endpoint (queried on port 5066 by the
    # "beat" autodiscover template below).
    http.enabled: true
    setup.ilm:
      enabled: true
      policy_name: metricbeat-custom
      # Provided by the "indice-lifecycle" volume mounted into the container.
      policy_file: /etc/indice-lifecycle.json
    setup.template.settings:
      index:
        routing.allocation.require.type: "monitoring"
    metricbeat:
      autodiscover:
        providers:
          - type: kubernetes
            # This Beat runs as a DaemonSet. With "scope: cluster" every pod
            # watches the whole cluster and starts a module for every matching
            # target, which multiplies the collected data by the node count
            # and keeps the Kubernetes event handlers busy. Node scope makes
            # each pod handle only the workloads scheduled on its own node.
            scope: node
            node: ${NODE_NAME}
            hints.enabled: true
            templates:
              - condition:
                  contains:
                    kubernetes.labels.scrape: es
                config:
                  - module: elasticsearch
                    metricsets:
                      - ccr
                      - cluster_stats
                      - enrich
                      - index
                      - index_recovery
                      - index_summary
                      - ml_job
                      - node_stats
                      - shard
                    period: 10s
                    hosts: "https://${data.host}:${data.ports.https}"
                    username: ${MONITORED_ES_USERNAME}
                    password: ${MONITORED_ES_PASSWORD}
                    # WARNING: disables TLS as the default certificate is not valid for the pod FQDN
                    # TODO: switch this to "certificate" when available: https://github.com/elastic/beats/issues/8164
                    ssl.verification_mode: "none"
                    xpack.enabled: true
              - condition:
                  contains:
                    kubernetes.labels.scrape: kb
                config:
                  - module: kibana
                    metricsets:
                      - stats
                    period: 10s
                    hosts: "https://${data.host}:${data.ports.https}"
                    username: ${MONITORED_ES_USERNAME}
                    password: ${MONITORED_ES_PASSWORD}
                    # WARNING: disables TLS as the default certificate is not valid for the pod FQDN
                    # TODO: switch this to "certificate" when available: https://github.com/elastic/beats/issues/8164
                    ssl.verification_mode: "none"
                    xpack.enabled: true
              - condition:
                  contains:
                    # NOTE(review): the provider replaces dots in label names
                    # with underscores by default (labels.dedot), so this key
                    # may need to be "common_k8s_elastic_co/type" - confirm
                    # against an actual autodiscover event.
                    kubernetes.labels.common.k8s.elastic.co/type: beat
                config:
                  - module: beat
                    metricsets:
                      - stats
                      - state
                    period: 10s
                    # Plain-HTTP endpoint, so no ssl.* settings are needed
                    # here; setting them would have no effect on an http://
                    # URL.
                    hosts: "http://${NODE_NAME}:5066"
                    xpack.enabled: true
      modules:
        - module: system
          period: 10s
          metricsets:
            - cpu
            - load
            - memory
            - network
            - process
            - process_summary
          process:
            include_top_n:
              by_cpu: 5
              by_memory: 5
          processes:
            - '.*'
        - module: system
          period: 1m
          metricsets:
            - filesystem
            - fsstat
          processors:
            # Drop filesystem events for pseudo/system mount points.
            - drop_event:
                when:
                  regexp:
                    system:
                      filesystem:
                        mount_point: '^/(sys|cgroup|proc|dev|etc|host|lib)($|/)'
        - module: docker
          period: 10s
          hosts: ["unix:///var/run/docker.sock"]
          metricsets: ["container", "cpu", "diskio", "healthcheck", "info", "memory", "network"]
        - module: kubernetes  # kubelet
          period: 10s
          node: ${NODE_NAME}
          hosts: ["https://${NODE_NAME}:10250"]
          add_metadata: true
          metricsets: ["node", "system", "pod", "container", "volume"]
          bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
          ssl.verification_mode: "none"
        # NOTE(review): the state_* metricsets are cluster-wide; running them
        # in every DaemonSet pod collects the same data once per node.
        # Consider moving this module to a single-replica Beat.
        - module: kubernetes  # kube-state-metrics
          period: 10s
          node: ${NODE_NAME}
          # No scheme is given, so this endpoint is reached over plain HTTP
          # and no ssl.* settings are needed.
          hosts:
            - "kube-prometheus-stack-kube-state-metrics.monitoring.svc.cluster.local:8080"
          add_metadata: true
          # NOTE(review): over plain HTTP the service account token presumably
          # travels in cleartext; drop this if kube-state-metrics does not
          # require authentication.
          bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
          metricsets:
            - state_node
            - state_daemonset
            - state_deployment
            - state_replicaset
            - state_statefulset
            - state_pod
            - state_container
            - state_job
            - state_cronjob
            - state_resourcequota
            - state_service
            - state_persistentvolume
            - state_persistentvolumeclaim
            - state_storageclass
        # Disabled: causes pods to restart.
        # - module: aws
        #   period: 60s
        #   metricsets:
        #     - ec2
        #     - lambda
        #     - rds
        #     - ebs
        #     - elb
        #   access_key_id: ${AWS_ACCESS_KEY_ID}
        #   secret_access_key: ${AWS_SECRET_ACCESS_KEY}
    processors:
      - add_cloud_metadata: {}
      - add_host_metadata: {}
  daemonSet:
    podTemplate:
      spec:
        serviceAccountName: metricbeat
        automountServiceAccountToken: true  # some older Beat versions are depending on this settings presence in k8s context
        containers:
          - args:
              - -e
              - -c
              - /etc/beat.yml
              - -system.hostfs=/hostfs
            name: metricbeat
            volumeMounts:
              - mountPath: /hostfs/sys/fs/cgroup
                name: cgroup
              - mountPath: /var/run/docker.sock
                name: dockersock
              - mountPath: /hostfs/proc
                name: proc
              - name: indice-lifecycle
                mountPath: /etc/indice-lifecycle.json
                readOnly: true
                subPath: indice-lifecycle.json
            env:
              # Node the pod is scheduled on; used in module hosts and for
              # node-scoped autodiscover.
              - name: NODE_NAME
                valueFrom:
                  fieldRef:
                    fieldPath: spec.nodeName
              - name: MONITORED_ES_USERNAME
                value: elastic
              - name: MONITORED_ES_PASSWORD
                valueFrom:
                  secretKeyRef:
                    key: elastic
                    name: elasticsearch-prod-es-elastic-user
              # NOTE(review): only referenced by the commented-out aws module;
              # the "aws-secret" Secret must still exist for the pod to start.
              - name: AWS_ACCESS_KEY_ID
                valueFrom:
                  secretKeyRef:
                    name: aws-secret
                    key: access-key-id
              - name: AWS_SECRET_ACCESS_KEY
                valueFrom:
                  secretKeyRef:
                    name: aws-secret
                    key: access-secret-key
        dnsPolicy: ClusterFirstWithHostNet
        hostNetwork: true  # Allows to provide richer host metadata
        securityContext:
          runAsUser: 0
        terminationGracePeriodSeconds: 30
        volumes:
          - hostPath:
              path: /sys/fs/cgroup
            name: cgroup
          - hostPath:
              path: /var/run/docker.sock
            name: dockersock
          - hostPath:
              path: /proc
            name: proc
          - name: indice-lifecycle
            configMap:
              # Octal file mode (owner read/write only); must stay an
              # unquoted integer for the Kubernetes API.
              defaultMode: 0600
              name: metricbeat-indice-lifecycle
---
# permissions needed for metricbeat
# source: https://www.elastic.co/guide/en/beats/metricbeat/current/metricbeat-module-kubernetes.html
# Read-only, cluster-wide permissions for Metricbeat.
# ClusterRole is a cluster-scoped resource, so it carries no namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: metricbeat
rules:
  - apiGroups:
      - ""
    resources:
      - nodes
      - namespaces
      - events
      - pods
      - services
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - "extensions"
    resources:
      - replicasets
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - apps
    resources:
      - statefulsets
      - deployments
      - replicasets
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - nodes/stats
    verbs:
      - get
  - nonResourceURLs:
      - /metrics
    verbs:
      - get
  - apiGroups:
      - batch
    resources:
      - "*"
    verbs:
      - get
      - list
      - watch
---
# Service account the Metricbeat pods run as.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: metricbeat
  namespace: elastic-system
---
# Grants the "metricbeat" ClusterRole to the "metricbeat" service account in
# the elastic-system namespace. ClusterRoleBinding is cluster-scoped, so it
# carries no namespace of its own; only the subject is namespaced.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: metricbeat
subjects:
  - kind: ServiceAccount
    name: metricbeat
    namespace: elastic-system
roleRef:
  kind: ClusterRole
  name: metricbeat
  apiGroup: rbac.authorization.k8s.io
---
# metricbeat.indice-lifecycle.configmap.yml
# Holds the ILM policy as a JSON file: roll over in the hot phase at 50GB or
# after one day, delete indices 30 days after rollover.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: elastic-system
  name: metricbeat-indice-lifecycle
  labels:
    app: metricbeat
data:
  # Literal block scalar: the JSON must stay indented deeper than this key.
  indice-lifecycle.json: |-
    {
      "policy": {
        "phases": {
          "hot": {
            "actions": {
              "rollover": {
                "max_primary_shard_size": "50GB",
                "max_size": "50GB",
                "max_age": "1d"
              }
            }
          },
          "delete": {
            "min_age": "30d",
            "actions": {
              "delete": {}
            }
          }
        }
      }
    }
---