Filestream data duplication in filebeat 8.9.1

Hello, I have read various issues regarding the data-duplication error messages in Filebeat logs; however, I haven't really understood what the root cause is. Could someone please explain what the root cause actually is? Below is the error message:

{"log.level":"error","@timestamp":"2023-08-21T10:05:27.167Z","log.logger":"input","log.origin":{"file.name":"input-logfile/manager.go","file.line":183},"message":"filestream input with ID 'filestream-kubernetes-pod-1170a1d331b2efe193b3816759e44d789b1841d8bed791f9859d442098341d9d' already exists, this will lead to data duplication, please use a different ID. Metrics collection has been disabled on this input.","service.name":"filebeat","ecs.version":"1.6.0"}

Below is my daemonset.yaml:

filebeat.autodiscover:
  providers:
    - type: kubernetes
      node: ${NODE_NAME}
      hints.enabled: true
      hints.default_config:
        # Using explicit filestream input until the container input is switched to
        # filestream, see https://github.com/elastic/beats/issues/34393
        type: filestream
        prospector.scanner.symlinks: true
        # NOTE(review): the "already exists, this will lead to data duplication"
        # error is logged when a second input starts with an ID that is already
        # running — presumably autodiscover re-emits a start event for the same
        # container (pod update/restart) rather than two containers sharing an
        # ID; verify against the autodiscover debug events before changing the
        # ID scheme.
        id: filestream-kubernetes-pod-${data.kubernetes.container.id}
        take_over: true
        paths:
          - /var/log/containers/*${data.kubernetes.container.id}.log
        parsers:
          # Use Filebeat's default container log parser for Kubernetes
          # container logs.
          - container: ~
      add_resource_metadata:
        # cronjob/deployment metadata disabled on purpose to work around a
        # memory leak, see:
        # https://discuss.elastic.co/t/filebeat-memory-leak-via-filebeat-autodiscover-and-200-000-goroutines/322082
        cronjob: false
        deployment: false
        namespace:
          enabled: true

fields_under_root: true
fields:
  # Templated values are quoted so boolean/number-looking rendered output
  # cannot be retyped by the YAML parser.
  kubernetes.cluster: "{{ .Values.name }}"
  kubernetes.stage: "{{ (split "-" .Values.name)._1 }}"
processors:
  - add_host_metadata:
      netinfo.enabled: false
      when.not.equals.kubernetes.namespace_labels.namespace-type: application
  - drop_fields:
      fields: ['ecs.version', 'kubernetes.namespace_uid']
      ignore_missing: true
      when.not.equals.kubernetes.namespace_labels.namespace-type: application
  - drop_fields:
      fields: ['kubernetes.node.uid', 'kubernetes.pod.ip', '/^kubernetes.node.labels.*/']
      ignore_missing: true
  # The "index-name" field is used by ELK to determine the effective index;
  # the effective index is "index-name" suffixed by the current day.
  - copy_fields:
      fields:
        - from: kubernetes.labels.xxx/index-name
          to: index-name
      fail_on_error: false
      ignore_missing: true
      when.not.has_fields: ['index-name']
  # All applications in our namespaces use the xxx index unless overridden
  # by a label.
  - add_fields:
      target: ""
      fields:
        index-name: xxx
      when:
        and:
          - not.has_fields: ['index-name']
          - or:
              - equals.kubernetes.namespace_labels.namespace-type: shared
              - equals.kubernetes.namespace_labels.namespace-type: helper
  - add_fields:
      fields:
        agent.hostname: ${HOSTNAME}
      target: ""
  - copy_fields:
      fields:
        - from: container.image.name
          to: kubernetes.container.image
      fail_on_error: false
      ignore_missing: true
      # NOTE(review): "target" is not a documented copy_fields option — the
      # "to" path above already places the copy; confirm this key has any
      # effect and remove it if not.
      target: "kubernetes"
  - decode_json_fields:
      fields: ['message']
      overwrite_keys: true
      target: ""
  # Keep only the listed fields and drop the rest for index-name yyyy.
  - include_fields:
      when:
        contains:
          index-name: yyyy
      fields: ["reason", "message", "source", "firstTimestamp", "lastTimestamp", "count", "type", "involvedObject.kind", "involvedObject.namespace", "involvedObject.name", "involvedObject.labels", "index-name", "kubernetes.cluster", "kubernetes.stage"]
  # The "tenant" field is just for convenience.
  - copy_fields:
      fields:
        - from: kubernetes.namespace_labels.tenant
          to: tenant
      fail_on_error: false
      ignore_missing: true
      when.not.has_fields: ['tenant']
  # Drop events without index-name, because ELK can't handle them anyway.
  - drop_event:
      when.not.has_fields: ['index-name']
output.logstash:
  hosts:
    # Quoted so the rendered "host:port" scalar is always read as a string.
    - "{{ printf "%s:%d" .Values.log_sink.address (.Values.log_sink.port | int) }}"
  ssl:
    certificate_authorities:
      - "/etc/certs/ca.pem"

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.