Mutate a Nested Field

Hi there,

I have an index that looks like so:

{
  "_index": "ec2_logs-index-2020.05.11",
  "_type": "_doc",
  "_id": "V3LAAnIBk2krYRHTuxCC",
  "_version": 1,
  "_score": null,
  "_source": {
    "owner": "104115521938",
    "@version": "1",
    "logStream": "mystream",
    "subscriptionFilters": [
      "To-Firehose"
    ],
    "messageType": "DATA_MESSAGE",
    "logGroup": "ec2-fluentd",
    "@timestamp": "2020-05-11T08:01:24.400Z",
    "logEvents": [
      {
        "timestamp": 1589167148000,
        "id": "35439611645423604170418303021688828486859711444132757504",
        "message": "{\"host\":\"ip-22-22-2-83\",\"ident\":\"dhclient\",\"pid\":\"2987\",\"message\":\"XMT: Solicit on eth0, interval 109630ms.\"}"
      },
      {
        "timestamp": 1589167161000,
        "id": "35439611935333291751316403861653166031288411021878165505",
        "message": "{\"host\":\"ip-22-22-2-83\",\"ident\":\"sshd\",\"pid\":\"3864\",\"message\":\"Accepted publickey for ec2-user from 138.75.33.59 port 51868 ssh2: RSA SHA256:QY/2uJJiV5cYpErAb/KLg/505B6WQ4ZlvcBazh1Qfyo\"}"
      },
      {
        "timestamp": 1589167161000,
        "id": "35439611935333291751316403861653166031288411021878165506",
        "message": "{\"host\":\"ip-22-22-2-83\",\"ident\":\"systemd\",\"message\":\"Created slice User Slice of ec2-user.\"}"
      },
      {
        "timestamp": 1589167161000,
        "id": "35439611935333291751316403861653166031288411021878165507",
        "message": "{\"host\":\"ip-22-22-2-83\",\"ident\":\"systemd\",\"message\":\"Starting User Slice of ec2-user.\"}"
      },
      {
        "timestamp": 1589167161000,
        "id": "35439611935333291751316403861653166031288411021878165508",
        "message": "{\"host\":\"ip-22-22-2-83\",\"ident\":\"systemd\",\"message\":\"Started Session 2 of user ec2-user.\"}"
      },
      {
        "timestamp": 1589167161000,
        "id": "35439611935333291751316403861653166031288411021878165509",
        "message": "{\"host\":\"ip-22-22-2-83\",\"ident\":\"systemd-logind\",\"message\":\"New session 2 of user ec2-user.\"}"
      }
    ],
    "type": "ec2_logs"
  },
  "fields": {
    "@timestamp": [
      "2020-05-11T08:01:24.400Z"
    ]
  },
  "sort": [
    1589184084400
  ]
}

The index mapping is:

{
  "ec2_logs-index-2020.05.11" : {
    "mappings" : {
      "dynamic" : "true",
      "_meta" : { },
      "_source" : {
        "includes" : [ ],
        "excludes" : [ ]
      },
      "dynamic_date_formats" : [
        "strict_date_optional_time",
        "yyyy/MM/dd HH:mm:ss Z||yyyy/MM/dd Z"
      ],
      "dynamic_templates" : [ ],
      "date_detection" : true,
      "numeric_detection" : false,
      "properties" : {
        "@timestamp" : {
          "type" : "date",
          "format" : "strict_date_optional_time"
        },
        "@version" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "logEvents" : {
          "type" : "nested",
          "properties" : {
            "id" : {
              "type" : "text"
            },
            "message" : {
              "type" : "text"
            },
            "timestamp" : {
              "type" : "date_nanos"
            }
          }
        },
        "logGroup" : {
          "type" : "text"
        },
        "logStream" : {
          "type" : "text"
        },
        "messageType" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "owner" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "subscriptionFilters" : {
          "type" : "text"
        },
        "type" : {
          "type" : "text"
        }
      }
    }
  }
}

What I'm trying to do here is remove all forward slashes and square brackets from the logEvents.message field and change logEvents.id to just a -

To that end, I have the following filter in logstash:

filter
{
  mutate{
    gsub => [
      "[logEvents][message]", "[\[\]\\]", "",
      "[logEvents][id]", ".*", "-"
    ]
  }
}

But the resulting output is exactly the same. What did I do wrong?

Thanks in advance...
ck

Hi. Your problem is that you actually want to iterate over an array of objects with fields instead of a single, nested field.

Unfortunately, gsub only works with strings or arrays of strings.

Instead of

    "[logEvents][message]", "[\[\]\\]", "",

it should be something like

    gsub     => [
          "[logEvents][0][message]", "[\[\]\\]", "",
          "[logEvents][1][message]", "[\[\]\\]", "",
          etc.

So, for a variable number of logEvents elements, I guess you will need to use a custom ruby filter to read the logEvents array and apply a substitution to all the message values.

Hi Andres,

Yes, you are right. There is an indeterminate number of logEvents elements. Sorry, I have never done any Ruby scripting before. I have googled a piece of code that will probably return the number of elements.

ruby {
  code => "
    event.set('number_of_elements', event.get('logEvents').length)
  "
}

After some googling, probably (?) the ruby code is:

ruby {
  code => "
    event.set('number_of_elements', event.get('logEvents').length)
    count = 0
    while count <= number_of_elements do
      mutate {   
        gsub     => [
          "[logEvents][count][message]", "[\[\]\\]", "",
        ]
      }
    count +=1
    end
  "
}

Does it work like that?

thanks,
ck

I presume you have an easier test environment to check this against your own data :face_with_raised_eyebrow:, and unfortunately I have just the bare minimum Ruby knowledge to fulfill my needs regarding Elastic Stack / Logstash usage.

So I can only give you general advice here, and you must take it as a non-rigorous description at best :smiley:

  • The mutate block inside the while loop is Logstash configuration, not Ruby. You cannot mix the two inside a ruby block.

  • There is a ruby gsub method that you can apply to strings.

  • The while loop may be improved, but I'll show another quick&dirty solution that may serve as inspiration:

    # Strip '[', ']' and '\' from the message of every logEvents element.
    # Single quotes are used inside the Ruby code because the whole snippet sits
    # inside a double-quoted Logstash string; an unescaped double quote would end
    # that string early. The element hashes are indexed with the string key
    # 'message' (not the symbol :message), which would otherwise yield nil.
    code => "
        log_events = event.get('logEvents')
        # Leave events without a logEvents array untouched instead of raising.
        if log_events.is_a?(Array)
          log_events.each do |item|
            next unless item.is_a?(Hash) && item['message'].is_a?(String)
            item['message'] = item['message'].gsub(/[\[\]\\]/, '')
          end
          event.set('logEvents', log_events)
        end
    "

Notes:

  • I haven't seen brackets or backslashes ([, ], \) in your message fields, other than the backslashes used to escape JSON quotes etc., so you will need to pick other examples to test it.

  • Maybe you will need a more robust solution: check that logEvents is not empty by assigning it to an intermediate variable and adding conditions, etc.

  • More experienced ruby devs may give better insights or a more efficient, elegant, etc. solution.

1 Like

Hi there,

something like the following should work in your case:

filter {
  ruby {
    # For each element of the nested logEvents array, replace 'id' with the
    # string '-' and strip '[', ']' and '/' from 'message'. Events that carry no
    # logEvents array are passed through unchanged instead of raising.
    # The block parameter is named log_event so it does not shadow the Logstash
    # `event` object used by event.get / event.set.
    code => "
      log_events = event.get('logEvents')
      if log_events.is_a?(Array)
        log_events.each do |log_event|
          next unless log_event.is_a?(Hash)
          log_event['id'] = '-'
          message = log_event['message']
          log_event['message'] = message.gsub(/[\[\]\/]/, '') if message.is_a?(String)
        end
        event.set('logEvents', log_events)
      end
    "
  }
}

I tested it with the following pipeline (giving in input the sample you posted) and it seems to work properly:

input {
  generator { 
    count => 1
    lines => [ '{"owner": "104115521938","@version": "1","logStream": "mystream","subscriptionFilters": ["To-Firehose"],"messageType": "DATA_MESSAGE","logGroup": "ec2-fluentd","@timestamp": "2020-05-11T08:01:24.400Z","logEvents": [{"timestamp": 1589167148000,"id": "35439611645423604170418303021688828486859711444132757504","message": "{\"host\":\"ip-22-22-2-83\",\"ident\":\"dhclient\",\"pid\":\"2987\",\"message\":\"XMT: Solicit on eth0, interval 109630ms.\"}"},{"timestamp": 1589167161000,"id": "35439611935333291751316403861653166031288411021878165505","message": "{\"host\":\"ip-22-22-2-83\",\"ident\":\"sshd\",\"pid\":\"3864\",\"message\":\"Accepted publickey for ec2-user from 138.75.33.59 port 51868 ssh2: RSA SHA256:QY/2uJJiV5cYpErAb/KLg/505B6WQ4ZlvcBazh1Qfyo\"}"},{"timestamp": 1589167161000,"id": "35439611935333291751316403861653166031288411021878165506","message": "{\"host\":\"ip-22-22-2-83\",\"ident\":\"systemd\",\"message\":\"Created slice User Slice of ec2-user.\"}"},{"timestamp": 1589167161000,"id": "35439611935333291751316403861653166031288411021878165507","message": "{\"host\":\"ip-22-22-2-83\",\"ident\":\"systemd\",\"message\":\"Starting User Slice of ec2-user.\"}"},{"timestamp": 1589167161000,"id": "35439611935333291751316403861653166031288411021878165508","message": "{\"host\":\"ip-22-22-2-83\",\"ident\":\"systemd\",\"message\":\"Started Session 2 of user ec2-user.\"}"},{"timestamp": 1589167161000,"id": "35439611935333291751316403861653166031288411021878165509","message": "{\"host\":\"ip-22-22-2-83\",\"ident\":\"systemd-logind\",\"message\":\"New session 2 of user ec2-user.\"}"}],"type": "ec2_logs"}' ]
    codec => "json"
  }
}

# Rewrites every element of the nested logEvents array on each event.
filter {
  ruby {
    # Reads the logEvents array, then for each element overwrites 'id' with the
    # string '-' and deletes every '[', ']' and '/' character from 'message'
    # (gsub! edits the string in place), and finally stores the array back on
    # the event.
    # NOTE: the block parameter is also called `event`, so inside the do...end
    # block it refers to one logEvents element, not the Logstash event.
    code => "
      log_events = event.get('logEvents')
      log_events.map do |event|
        event['id'] = '-'
        event['message'].gsub!(/[\[\]\/]/, '')
      end
      event.set('logEvents', log_events)
    "
  }
}

output {
  stdout{}
}

However, I set the nested id field to the string -. If you want it to be nil, just replace the hardcoded - string with nil.

Thanks Fabio and Andres! :+1: This is exactly what I'm looking for.