Fluentd periodically fails to send logs to Elasticsearch

Issue: Fluentd periodically stops sending logs to Elasticsearch. When I delete the Fluentd pod, the restarted pod successfully flushes the previously stuck logs to Elasticsearch and I can see them in Kibana. After several hours, Fluentd stops sending logs again.
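
For reference, this is the workaround I use for now; the DaemonSet recreates the pod and the backlog gets flushed (the namespace and label below are from my setup and may differ in yours):

# delete the stuck collector pod; the DaemonSet spawns a replacement
kubectl -n logging delete pod -l app.kubernetes.io/name=fluentd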

Fluentd pod logs:

2022-01-03 10:23:17 +0000 [warn]: #0 suppressed same stacktrace
2022-01-03 10:23:17 +0000 [warn]: #0 failed to flush the buffer. retry_time=121 next_retry_seconds=2022-01-03 10:23:44 +0000 chunk="5d4a5ecfd6b14cb1b5f1a967d7fb0cbb" error_class=Fluent::Plugin::ElasticsearchOutput::RecoverableRequestFailure error="could not push logs to Elasticsearch cluster ({:host=>\"elasticsearch-master\", :port=>9200, :scheme=>\"https\", :user=>\"elastic\", :password=>\"obfuscated\", :path=>\"\"}): no implicit conversion of nil into String (TypeError)"
  2022-01-03 10:23:17 +0000 [warn]: #0 suppressed same stacktrace
2022-01-03 10:23:41 +0000 [info]: #0 [filter_kube_metadata] stats - namespace_cache_size: 0, pod_cache_size: 10, namespace_cache_miss: 96063, pod_cache_api_updates: 404, id_cache_miss: 404, pod_cache_watch_delete_ignored: 195, pod_cache_watch_updates: 1168, pod_cache_watch_ignored: 98
2022-01-03 10:23:44 +0000 [warn]: #0 failed to flush the buffer. retry_time=122 next_retry_seconds=2022-01-03 10:24:11 +0000 chunk="5d4a947e283136cb03b506c1532e41df" error_class=Fluent::Plugin::ElasticsearchOutput::RecoverableRequestFailure error="could not push logs to Elasticsearch cluster ({:host=>\"elasticsearch-master\", :port=>9200, :scheme=>\"https\", :user=>\"elastic\", :password=>\"obfuscated\", :path=>\"\"}): Broken pipe (Errno::EPIPE)"

Elasticsearch logs show many entries like the ones below; I assume these might be related to Metricbeat sending logs as well (see the lookup after these log lines):

{"type": "server", "timestamp": "2022-01-03T10:29:12,645Z", "level": "WARN", "component": "o.e.x.s.t.n.SecurityNetty4HttpServerTransport", "cluster.name": "elasticsearch", "node.name": "elasticsearch-master-0", "message": "received plaintext http traffic on an https channel, closing connection Netty4HttpChannel{localAddress=/172.16.0.31:9200, remoteAddress=/172.16.0.18:52110}", "cluster.uuid": "3VsHguEpQzqCEyUZW4N02Q", "node.id": "vubSfqY3Q8u9EMaCtFk7Vg"  }
{"type": "server", "timestamp": "2022-01-03T10:29:12,653Z", "level": "WARN", "component": "o.e.x.s.t.n.SecurityNetty4HttpServerTransport", "cluster.name": "elasticsearch", "node.name": "elasticsearch-master-0", "message": "received plaintext http traffic on an https channel, closing connection Netty4HttpChannel{localAddress=/172.16.0.31:9200, remoteAddress=/172.16.0.18:52112}", "cluster.uuid": "3VsHguEpQzqCEyUZW4N02Q", "node.id": "vubSfqY3Q8u9EMaCtFk7Vg"  }
{"type": "server", "timestamp": "2022-01-03T10:29:12,658Z", "level": "WARN", "component": "o.e.x.s.t.n.SecurityNetty4HttpServerTransport", "cluster.name": "elasticsearch", "node.name": "elasticsearch-master-0", "message": "received plaintext http traffic on an https channel, closing connection Netty4HttpChannel{localAddress=/172.16.0.31:9200, remoteAddress=/172.16.0.18:52114}", "cluster.uuid": "3VsHguEpQzqCEyUZW4N02Q", "node.id": "vubSfqY3Q8u9EMaCtFk7Vg"  }

Fluentd Helm chart configuration:

fileConfigs:
    01_sources.conf: |-
      ## logs from podman
      <source>
        @type tail
        @id in_tail_container_logs
        @label @KUBERNETES
        @log_level debug
        path /var/log/containers/*.log
        pos_file /var/log/fluentd-containers.log.pos
        tag kubernetes.*
        read_from_head true
        <parse>
          @type multi_format
          <pattern>
            format json
            time_key time
            time_type string
            time_format "%Y-%m-%dT%H:%M:%S.%NZ"
            keep_time_key true
          </pattern>
          <pattern>
            format regexp
            expression /^(?<time>.+) (?<stream>stdout|stderr)( (.))? (?<log>.*)$/
            time_format '%Y-%m-%dT%H:%M:%S.%NZ'
            keep_time_key true
          </pattern>
        </parse>
        emit_unmatched_lines true
      </source>
    04_outputs.conf: |-
      <label @OUTPUT>
        <match **>
          @type elasticsearch
          hosts https://elasticsearch-master:9200
          path ""
          user elastic
          ssl_verify false
          password *******
          <buffer>
            @type file
            path /var/log/fluentd-buffers/kubernetes.system.buffer
            flush_mode interval
            timekey 1h
            retry_type exponential_backoff
            flush_thread_count 4
            flush_interval 10s
            total_limit_size 1G
            retry_forever true
            retry_timeout 1h
            retry_max_interval 30
            chunk_limit_size 2M
            queue_limit_length 10
            overflow_action block
          </buffer>
        </match>
      </label>
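
In case it helps with diagnosis: the fluent-plugin-elasticsearch README describes reconnect/reload options aimed at exactly this kind of recoverable request failure over long-lived keep-alive connections. A minimal sketch of what I plan to add to the <match> block above (the timeout value is my guess, not something I have validated yet):

      <match **>
        @type elasticsearch
        # ... hosts/user/password/ssl_verify as above ...
        reconnect_on_error true   # open a fresh connection after an error instead of reusing a dead one
        reload_on_failure true    # re-resolve hosts when a request fails
        reload_connections false  # skip periodic node rediscovery (single node behind a k8s service)
        request_timeout 15s       # fail a hung request sooner so the retry logic can take over
      </match>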

Elasticsearch Helm chart configuration:

elasticsearch:
  volumeClaimTemplate:
    storageClassName: csi-disk
    accessModes: ["ReadWriteOnce"]
    resources:
      requests:
        storage: 30Gi
  replicas: 1
  clusterHealthCheckParams: "wait_for_status=yellow&timeout=4s"
  readinessProbe:
    failureThreshold: 3
    initialDelaySeconds: 30
    periodSeconds: 10
    successThreshold: 3
    timeoutSeconds: 5
  resources:
    requests:
      cpu: "300m"
      memory: "1000Mi"
    limits:
      cpu: "1000m"
      memory: "2500Mi"
  esConfig:
    elasticsearch.yml: |
      xpack.security.enabled: true
      xpack.security.transport.ssl.enabled: true
      xpack.security.transport.ssl.verification_mode: certificate
      xpack.security.transport.ssl.keystore.path: /usr/share/elasticsearch/config/certs-gen/keystore.p12
      xpack.security.transport.ssl.truststore.path: /usr/share/elasticsearch/config/certs-gen/keystore.p12
      xpack.security.http.ssl.enabled: true
      xpack.security.http.ssl.truststore.path: /usr/share/elasticsearch/config/certs-gen/keystore.p12
      xpack.security.http.ssl.keystore.path: /usr/share/elasticsearch/config/certs-gen/keystore.p12
      http.max_content_length: 1GB
      http.max_header_size: 64kb
  protocol: https
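
To rule out basic TLS/auth problems between the two charts, I check the endpoint from inside the cluster (replace <any-pod-with-curl> with any pod that has curl, and <password> with the elastic password; -k because the chart ships a self-signed keystore):

# query cluster health over HTTPS, skipping certificate verification
kubectl exec -it <any-pod-with-curl> -- curl -sk -u "elastic:<password>" "https://elasticsearch-master:9200/_cluster/health?pretty"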

Node resource usage:

NAME            CPU(cores)   CPU%   MEMORY(bytes)   MEMORY%
192.168.0.193   241m         12%    4393Mi          69%
