Collect container logs with elastic-agent

Hey Everyone,

I'm having some trouble getting the Kubernetes integration to fully work on my ECK cluster.

What I'm trying to do is get elastic-agent to read container logs and forward them to ES.

Here's my elastic-agent.yaml:

---
# Fleet-mode Elastic Agent managed through the ECK Agent resource and rolled
# out as a DaemonSet. This variant declares no volumes or volumeMounts.
apiVersion: agent.k8s.elastic.co/v1alpha1
kind: Agent
metadata:
  namespace: default
  name: elastic-agent
spec:
  version: "8.12.0"
  mode: fleet
  policyID: eck-agent
  # References to the Kibana and Fleet Server resources this agent uses.
  kibanaRef:
    name: kibana
  fleetServerRef:
    name: fleet-server
  daemonSet:
    podTemplate:
      spec:
        serviceAccountName: elastic-agent
        automountServiceAccountToken: true
        # Share the node's network and PID namespaces with the pod.
        hostNetwork: true
        hostPID: true
        dnsPolicy: ClusterFirstWithHostNet
        # Run the pod's containers as root (uid 0).
        securityContext:
          runAsUser: 0
        # Allow scheduling onto nodes carrying the control-plane taint.
        tolerations:
          - key: node-role.kubernetes.io/control-plane
            operator: Exists
            effect: NoSchedule
---
# Grants the `elastic-agent` ClusterRole to the `elastic-agent`
# ServiceAccount in the `default` namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: elastic-agent
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: elastic-agent
subjects:
  - kind: ServiceAccount
    namespace: default
    name: elastic-agent
---
# Grants the namespaced `elastic-agent` Role to the `elastic-agent`
# ServiceAccount; both live in the `default` namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: elastic-agent
  namespace: default
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: elastic-agent
subjects:
  - kind: ServiceAccount
    namespace: default
    name: elastic-agent
---
# Grants the `elastic-agent-kubeadm-config` Role to the `elastic-agent`
# ServiceAccount; both live in the `default` namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  namespace: default
  name: elastic-agent-kubeadm-config
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: elastic-agent-kubeadm-config
subjects:
  - kind: ServiceAccount
    namespace: default
    name: elastic-agent
---
# Cluster-wide permissions for the agent. Every rule below is read-only
# (get/list/watch, or get alone).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: elastic-agent
  labels:
    k8s-app: elastic-agent
rules:
  # Core API group objects.
  - apiGroups:
      - ""
    resources:
      - nodes
      - namespaces
      - events
      - pods
      - services
      - configmaps
      - serviceaccounts
      - persistentvolumes
      - persistentvolumeclaims
    verbs:
      - get
      - list
      - watch
  # ReplicaSets under the `extensions` API group.
  - apiGroups:
      - extensions
    resources:
      - replicasets
    verbs:
      - get
      - list
      - watch
  # Workload controllers under the `apps` API group.
  - apiGroups:
      - apps
    resources:
      - statefulsets
      - deployments
      - replicasets
      - daemonsets
    verbs:
      - get
      - list
      - watch
  # The nodes/stats subresource (get only).
  - apiGroups:
      - ""
    resources:
      - nodes/stats
    verbs:
      - get
  # Jobs and CronJobs.
  - apiGroups:
      - batch
    resources:
      - jobs
      - cronjobs
    verbs:
      - get
      - list
      - watch
  # The non-resource /metrics URL (get only).
  - nonResourceURLs:
      - /metrics
    verbs:
      - get
  # RBAC objects themselves.
  - apiGroups:
      - rbac.authorization.k8s.io
    resources:
      - clusterrolebindings
      - clusterroles
      - rolebindings
      - roles
    verbs:
      - get
      - list
      - watch
  # PodSecurityPolicies.
  - apiGroups:
      - policy
    resources:
      - podsecuritypolicies
    verbs:
      - get
      - list
      - watch
  # StorageClasses.
  - apiGroups:
      - storage.k8s.io
    resources:
      - storageclasses
    verbs:
      - get
      - list
      - watch
---
# Namespaced Role in `default`: get/create/update on coordination.k8s.io
# Lease objects.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  namespace: default
  name: elastic-agent
  labels:
    k8s-app: elastic-agent
rules:
  - apiGroups:
      - coordination.k8s.io
    resources:
      - leases
    verbs:
      - get
      - create
      - update
---
# Namespaced Role in `default`: get access to the single ConfigMap named
# `kubeadm-config`.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  namespace: default
  name: elastic-agent-kubeadm-config
  labels:
    k8s-app: elastic-agent
rules:
  - apiGroups:
      - ""
    resources:
      - configmaps
    # Restricts the rule to this one object name.
    resourceNames:
      - kubeadm-config
    verbs:
      - get
---
# ServiceAccount referenced by the Agent pod template and by the RBAC
# bindings in this manifest.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: default
  name: elastic-agent
  labels:
    k8s-app: elastic-agent

I've tried messing around with volumeMounts but when trying to specify them alongside volumes I get the following error:
Warning ReconciliationError 42m (x23 over 108m) agent-controller Reconciliation error: DaemonSet.apps "elastic-agent-agent" is invalid: spec.template.spec.containers[0].image: Required value

The volumes and volumeMounts can be found here. The issue is that it's referenced from these docs, and in these docs elastic-agent is run as a DaemonSet directly, not through the Agent CRD that comes with ECK operator.

Any help is greatly appreciated!

Cheers,
Luka

Here's the yaml I was working with that threw the error above

---
# Variant of the Agent manifest that adds host-path volumes and volumeMounts.
# Per the thread, this is the manifest that was rejected with
# `spec.template.spec.containers[0].image: Required value`.
apiVersion: agent.k8s.elastic.co/v1alpha1
kind: Agent
metadata:
  name: elastic-agent
  namespace: default
spec:
  version: 8.12.0
  kibanaRef:
    name: kibana
  fleetServerRef:
    name: fleet-server
  mode: fleet
  policyID: eck-agent
  daemonSet:
    podTemplate:
      spec:
        # Allow scheduling onto nodes carrying the control-plane taint.
        tolerations:
        - key: "node-role.kubernetes.io/control-plane"
          operator: "Exists"
          effect: "NoSchedule"
        serviceAccountName: elastic-agent
        hostNetwork: true
        hostPID: true
        dnsPolicy: ClusterFirstWithHostNet
        automountServiceAccountToken: true
        # Run the pod's containers as root (uid 0).
        securityContext:
          runAsUser: 0
        containers:
          # NOTE(review): the working manifest later in this thread names this
          # container `agent`, not `elastic-agent`. Presumably the name has to
          # match the container the operator generates; a non-matching name
          # would define an additional container that has no image, which is
          # consistent with the "image: Required value" error — confirm against
          # the ECK documentation.
          - name: elastic-agent
            volumeMounts:
              # Read-only views of host paths, mostly under /hostfs.
              - name: proc
                mountPath: /hostfs/proc
                readOnly: true
              - name: cgroup
                mountPath: /hostfs/sys/fs/cgroup
                readOnly: true
              - name: varlibdockercontainers
                mountPath: /var/lib/docker/containers
                readOnly: true
              - name: varlog
                mountPath: /var/log
                readOnly: true
              - name: etc-full
                mountPath: /hostfs/etc
                readOnly: true
              - name: var-lib
                mountPath: /hostfs/var/lib
                readOnly: true
              - name: etc-mid
                mountPath: /etc/machine-id
                readOnly: true
              # The two mounts below carry no readOnly flag.
              - name: sys-kernel-debug
                mountPath: /sys/kernel/debug
              # NOTE(review): this mount is absent from the working manifest
              # later in the thread — verify whether this path is already
              # mounted by default before adding it.
              - name: elastic-agent-state
                mountPath: /usr/share/elastic-agent/state
        volumes:
          # NOTE(review): declared but not mounted by the container above. The
          # pod description quoted later in the thread shows an `agent-data`
          # volume of type HostPath; an emptyDir under the same name may
          # replace it — confirm.
          - name: agent-data
            emptyDir: {}
          - name: proc
            hostPath:
              path: /proc
          - name: cgroup
            hostPath:
              path: /sys/fs/cgroup
          - name: varlibdockercontainers
            hostPath:
              path: /var/lib/docker/containers
          - name: varlog
            hostPath:
              path: /var/log
          - name: etc-full
            hostPath:
              path: /etc
          - name: var-lib
            hostPath:
              path: /var/lib
          # `type: File` requires /etc/machine-id to already exist as a file
          # on the host.
          - name: etc-mid
            hostPath:
              path: /etc/machine-id
              type: File
          - name: sys-kernel-debug
            hostPath:
              path: /sys/kernel/debug
          # `type: DirectoryOrCreate` creates the host directory if missing.
          - name: elastic-agent-state
            hostPath:
              path: /var/lib/elastic-agent-managed/default/state
              type: DirectoryOrCreate
---
# Grants the `elastic-agent` ClusterRole to the `elastic-agent`
# ServiceAccount in the `default` namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: elastic-agent
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: elastic-agent
subjects:
  - kind: ServiceAccount
    namespace: default
    name: elastic-agent
---
# Grants the namespaced `elastic-agent` Role to the `elastic-agent`
# ServiceAccount; both live in the `default` namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: elastic-agent
  namespace: default
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: elastic-agent
subjects:
  - kind: ServiceAccount
    namespace: default
    name: elastic-agent
---
# Grants the `elastic-agent-kubeadm-config` Role to the `elastic-agent`
# ServiceAccount; both live in the `default` namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  namespace: default
  name: elastic-agent-kubeadm-config
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: elastic-agent-kubeadm-config
subjects:
  - kind: ServiceAccount
    namespace: default
    name: elastic-agent
---
# Cluster-wide permissions for the agent. Every rule below is read-only
# (get/list/watch, or get alone).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: elastic-agent
  labels:
    k8s-app: elastic-agent
rules:
  # Core API group objects.
  - apiGroups:
      - ""
    resources:
      - nodes
      - namespaces
      - events
      - pods
      - services
      - configmaps
      - serviceaccounts
      - persistentvolumes
      - persistentvolumeclaims
    verbs:
      - get
      - list
      - watch
  # ReplicaSets under the `extensions` API group.
  - apiGroups:
      - extensions
    resources:
      - replicasets
    verbs:
      - get
      - list
      - watch
  # Workload controllers under the `apps` API group.
  - apiGroups:
      - apps
    resources:
      - statefulsets
      - deployments
      - replicasets
      - daemonsets
    verbs:
      - get
      - list
      - watch
  # The nodes/stats subresource (get only).
  - apiGroups:
      - ""
    resources:
      - nodes/stats
    verbs:
      - get
  # Jobs and CronJobs.
  - apiGroups:
      - batch
    resources:
      - jobs
      - cronjobs
    verbs:
      - get
      - list
      - watch
  # The non-resource /metrics URL (get only).
  - nonResourceURLs:
      - /metrics
    verbs:
      - get
  # RBAC objects themselves.
  - apiGroups:
      - rbac.authorization.k8s.io
    resources:
      - clusterrolebindings
      - clusterroles
      - rolebindings
      - roles
    verbs:
      - get
      - list
      - watch
  # PodSecurityPolicies.
  - apiGroups:
      - policy
    resources:
      - podsecuritypolicies
    verbs:
      - get
      - list
      - watch
  # StorageClasses.
  - apiGroups:
      - storage.k8s.io
    resources:
      - storageclasses
    verbs:
      - get
      - list
      - watch
---
# Namespaced Role in `default`: get/create/update on coordination.k8s.io
# Lease objects.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  namespace: default
  name: elastic-agent
  labels:
    k8s-app: elastic-agent
rules:
  - apiGroups:
      - coordination.k8s.io
    resources:
      - leases
    verbs:
      - get
      - create
      - update
---
# Namespaced Role in `default`: get access to the single ConfigMap named
# `kubeadm-config`.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  namespace: default
  name: elastic-agent-kubeadm-config
  labels:
    k8s-app: elastic-agent
rules:
  - apiGroups:
      - ""
    resources:
      - configmaps
    # Restricts the rule to this one object name.
    resourceNames:
      - kubeadm-config
    verbs:
      - get
---
# ServiceAccount referenced by the Agent pod template and by the RBAC
# bindings in this manifest.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: default
  name: elastic-agent
  labels:
    k8s-app: elastic-agent

The reference I used was from this post where he hasn't specified the image but apparently it works.

EDIT - even when copying this example I get the same error

One more thing I've noticed: applying a new configuration recreates the pods and all agent state is lost (i.e., a large number of Offline agents pile up in the Fleet UI), even though by default data should persist across restarts:

kubectl describe po elastic-agent-agent-97mjg

Volumes:
  agent-data:
    Type:          HostPath (bare host directory volume)
    Path:          /var/lib/elastic-agent/default/elastic-agent/state
    HostPathType:  DirectoryOrCreate

There's also a massive amount of these errors (for every module):
{"log.level":"error","@timestamp":"2024-02-09T13:57:49.055Z","message":"Failed to list light metricsets for module postgresql: getting metricsets for module 'postgresql': loading light module 'postgresql' definition: loading module configuration from '/usr/share/elastic-agent/data/elastic-agent-3c8be7/components/module/postgresql/module.yml': config file (\"/usr/share/elastic-agent/data/elastic-agent-3c8be7/components/module/postgresql/module.yml\") must be owned by the user identifier (uid=0) or root","component":{"binary":"metricbeat","dataset":"elastic_agent.metricbeat","id":"system/metrics-default","type":"system/metrics"},"log":{"source":"system/metrics-default"},"log.origin":{"file.line":145,"file.name":"mb/lightmodules.go"},"service.name":"metricbeat","ecs.version":"1.6.0","log.logger":"registry.lightmodules","ecs.version":"1.6.0"}

This seems to be related to this issue. The module files are indeed owned by elastic-agent while the container itself is running as root. I can't find any errors explaining why it doesn't remember its state, though. The HostPath seems to be mounted correctly and is present on the nodes:

[root@k8s-master01 ~]# ll /var/lib/elastic-agent/default/elastic-agent/state
total 0
drwxr-x---. 5 root root 75 Feb  9 15:43 data

Managed to get it working. Posting here in case anyone needs it. I have no clue what I've changed to get it to work tbh, it was trial and error

---
# Fleet-mode Elastic Agent (ECK Agent resource, DaemonSet) with host paths
# mounted into the agent container. Reported in the thread as the variant
# that works.
apiVersion: agent.k8s.elastic.co/v1alpha1
kind: Agent
metadata:
  namespace: default
  name: elastic-agent
spec:
  version: "8.12.0"
  mode: fleet
  policyID: eck-agent
  # References to the Kibana and Fleet Server resources this agent uses.
  kibanaRef:
    name: kibana
  fleetServerRef:
    name: fleet-server
  daemonSet:
    podTemplate:
      spec:
        serviceAccountName: elastic-agent
        automountServiceAccountToken: true
        # Share the node's network and PID namespaces with the pod.
        hostNetwork: true
        hostPID: true
        dnsPolicy: ClusterFirstWithHostNet
        # Run the pod's containers as root (uid 0).
        securityContext:
          runAsUser: 0
        # Allow scheduling onto nodes carrying the control-plane taint.
        tolerations:
          - key: node-role.kubernetes.io/control-plane
            operator: Exists
            effect: NoSchedule
        containers:
          # Unlike the failing variant earlier in the thread, the container is
          # named `agent` and no `image` is given here.
          - name: agent
            volumeMounts:
              # Read-only views of host paths.
              - name: proc
                mountPath: /hostfs/proc
                readOnly: true
              - name: cgroup
                mountPath: /hostfs/sys/fs/cgroup
                readOnly: true
              - name: varlibdockercontainers
                mountPath: /var/lib/docker/containers
                readOnly: true
              - name: varlog
                mountPath: /var/log
                readOnly: true
              - name: etc-full
                mountPath: /hostfs/etc
                readOnly: true
              - name: var-lib
                mountPath: /hostfs/var/lib
                readOnly: true
              - name: etc-mid
                mountPath: /etc/machine-id
                readOnly: true
              # No readOnly flag on this mount.
              - name: sys-kernel-debug
                mountPath: /sys/kernel/debug
        # Host paths backing the mounts above, in the same order.
        volumes:
          - name: proc
            hostPath:
              path: /proc
          - name: cgroup
            hostPath:
              path: /sys/fs/cgroup
          - name: varlibdockercontainers
            hostPath:
              path: /var/lib/docker/containers
          - name: varlog
            hostPath:
              path: /var/log
          - name: etc-full
            hostPath:
              path: /etc
          - name: var-lib
            hostPath:
              path: /var/lib
          # `type: File` requires the path to already exist as a file on the
          # host.
          - name: etc-mid
            hostPath:
              type: File
              path: /etc/machine-id
          - name: sys-kernel-debug
            hostPath:
              path: /sys/kernel/debug