Hello, I have read various issues regarding the data duplication error messages in the Filebeat logs, but I still haven't understood what the root cause is. Could someone please explain what the root cause actually is? Below is the error message:
{"log.level":"error","@timestamp":"2023-08-21T10:05:27.167Z","log.logger":"input","log.origin":{"file.name":"input-logfile/manager.go","file.line":183},"message":"filestream input with ID 'filestream-kubernetes-pod-1170a1d331b2efe193b3816759e44d789b1841d8bed791f9859d442098341d9d' already exists, this will lead to data duplication, please use a different ID. Metrics collection has been disabled on this input.","service.name":"filebeat","ecs.version":"1.6.0"}
Below is my daemonset.yaml:
filebeat.autodiscover:
  providers:
    - type: kubernetes
      node: ${NODE_NAME}
      hints.enabled: true
      hints.default_config:
        # Using explicit filestream input until container input is switched to filestream, see https://github.com/elastic/beats/issues/34393
        type: filestream
        prospector.scanner.symlinks: true
        id: filestream-kubernetes-pod-${data.kubernetes.container.id}
        take_over: true
        paths:
          - /var/log/containers/*${data.kubernetes.container.id}.log
        parsers:
          # telling Filebeat to use its default container log parser configuration for processing logs from Kubernetes containers
          - container: ~
      add_resource_metadata:
        cronjob: false # disabled on purpose to work around a memory leak issue. See: https://discuss.elastic.co/t/filebeat-memory-leak-via-filebeat-autodiscover-and-200-000-goroutines/322082
        deployment: false # disabled on purpose to work around a memory leak issue. See: https://discuss.elastic.co/t/filebeat-memory-leak-via-filebeat-autodiscover-and-200-000-goroutines/322082
        namespace:
          enabled: true
fields_under_root: true
fields:
  kubernetes.cluster: {{ .Values.name }}
  kubernetes.stage: {{ (split "-" .Values.name)._1 }}
processors:
  - add_host_metadata:
      netinfo.enabled: false
      when.not.equals.kubernetes.namespace_labels.namespace-type: application
  - drop_fields:
      fields: ['ecs.version', 'kubernetes.namespace_uid']
      ignore_missing: true
      when.not.equals.kubernetes.namespace_labels.namespace-type: application
  - drop_fields:
      fields: ['kubernetes.node.uid', 'kubernetes.pod.ip', '/^kubernetes.node.labels.*/']
      ignore_missing: true
  # the "index-name" field is used by ELK to determine the effective index
  # the effective index is "index-name" suffixed by the current day
  - copy_fields:
      fields:
        - from: kubernetes.labels.xxx/index-name
          to: index-name
      fail_on_error: false
      ignore_missing: true
      when.not.has_fields: ['index-name']
  # all applications in our namespaces will use the xxx index, if not overwritten by a label
  - add_fields:
      target: ''
      fields:
        index-name: xxx
      when:
        and:
          - not.has_fields: ['index-name']
          - or:
              - equals.kubernetes.namespace_labels.namespace-type: shared
              - equals.kubernetes.namespace_labels.namespace-type: helper
  - add_fields:
      fields:
        agent.hostname: ${HOSTNAME}
      target: ""
  - copy_fields:
      fields:
        - from: container.image.name
          to: kubernetes.container.image
      fail_on_error: false
      ignore_missing: true
      target: "kubernetes"
  - decode_json_fields:
      fields: ['message']
      overwrite_keys: true
      target: ""
  # Keep only the mentioned fields and drop the rest for index-name yyyy.
  - include_fields:
      when:
        contains:
          index-name: yyyy
      fields: ["reason", "message", "source", "firstTimestamp", "lastTimestamp", "count", "type", "involvedObject.kind", "involvedObject.namespace", "involvedObject.name", "involvedObject.labels", "index-name", "kubernetes.cluster", "kubernetes.stage"]
# the "tenant" field is just for convinience
- copy_fields:
fields:
- from: kubernetes.namespace_labels.tenant
to: tenant
fail_on_error: false
ignore_missing: true
when.not.has_fields: ['tenant']
# drop events without index-name, because ELK can't handle them anyway
- drop_event:
when.not.has_fields: ['index-name']
output.logstash:
  hosts:
    - {{ printf "%s:%d" .Values.log_sink.address (.Values.log_sink.port | int) }}
  ssl:
    certificate_authorities:
      - "/etc/certs/ca.pem"