Xml-file too big for logstash?

If you run a split filter against a very large field then split is very memory intensive (it creates a copy of the very large field for each entry in that field). That could be optimized out, but split does not do that. Try this.

So I tried your approach but I'm not sure about how to adjust it for my example. I kept the splitData.rb exactly like in your example (with field and target), so that's the first thing I wasn't sure about.
And in my config file I wasn't sure whether to keep the target field, store_xml, etc., or how to adjust the ruby part.
With this config file:

input {
  file {
        path => "/home/christiane/Dokumente/evtx_shortx.xml"
        start_position => beginning
        mode => read
        sincedb_path => "/dev/null"
        codec => multiline
        {
                pattern => "^<\?xmldata.*>"
                negate => "true"
                what => "previous"
                auto_flush_interval => 1
                max_lines => 200000000
                max_bytes => "100 MiB"
        }
        }
       }

filter {
  xml {
        source => "message"
        target => "[@metadata][theXML]"
        store_xml => true
        remove_namespaces => true
        force_array => false
        remove_field => ["message"]
      }

  ruby {
        path => '/home/christiane/splitData.rb'
        script_params => { field => "Event" target => "theXML"}
        }

  ruby {
        code => '
                e = event.get("[@metadata][theXML][Event][EventData][Data]")

                if e.kind_of?(Array)
                        e.each { |x|
                                event.set(x["Name"], x["content"])
                               }
                else
                        event.set(e["Name"], e["content"])
                end
                '
        }

 mutate {
        copy => {
                "[@metadata][theXML][Event][System][Provider]" => "Provider"
                "[@metadata][theXML][Event][System][EventID]" => "EventID"
                "[@metadata][theXML][Event][System][Version]" => "Version"
                "[@metadata][theXML][Event][System][Level]" => "Level"
                "[@metadata][theXML][Event][System][Task]" => "Task"
                "[@metadata][theXML][Event][System][Opcode]" => "Opcode"
                "[@metadata][theXML][Event][System][Keywords]" => "Keywords"
                "[@metadata][theXML][Event][System][TimeCreated]" => "TimeCreated"
                "[@metadata][theXML][Event][System][TimeCreated][SystemTime]" => "SystemTime"
                "[@metadata][theXML][Event][System][EventRecordID]" => "EventRecordID"
                "[@metadata][theXML][Event][System][Correlation]" => "Correlation"
                "[@metadata][theXML][Event][System][Execution]" => "Execution"
                "[@metadata][theXML][Event][System][Channel]" => "Channel"
                "[@metadata][theXML][Event][System][Computer]" => "Computer"
                "[@metadata][theXML][Event][System][Security]" => "Security"
                }
          }

  date {
        match => ["[@metadata][theXML][Event][System][TimeCreated][SystemTime]", "YYYY-MM-dd HH:mm:$
        timezone => "Europe/Berlin"
        }

output {
        stdout { codec => rubydebug {metadata => true}}
}

I don't get several Events, just one Document where the Events are part of an Array if I understand it correctly.

That's the output I get:

[ERROR] 2019-05-06 09:02:32.994 [[main]>worker2] ruby - Could not process event: undefined method `each' for nil:NilClass {:script_path=>"/home/christiane/splitData.rb", :class=>"NoMethodError", :backtrace=>["/home/christiane/splitData.rb:10:in `filter'", "/usr/share/logstash/vendor/bundle/jruby/2.5.0/gems/logstash-filter-ruby-3.1.5/lib/logstash/filters/ruby/script/context.rb:55:in `execute_filter'", "/usr/share/logstash/vendor/bundle/jruby/2.5.0/gems/logstash-filter-ruby-3.1.5/lib/logstash/filters/ruby/script.rb:30:in `execute'", "/usr/share/logstash/vendor/bundle/jruby/2.5.0/gems/logstash-filter-ruby-3.1.5/lib/logstash/filters/ruby.rb:98:in `file_script'", "/usr/share/logstash/vendor/bundle/jruby/2.5.0/gems/logstash-filter-ruby-3.1.5/lib/logstash/filters/ruby.rb:84:in `filter'", "/usr/share/logstash/logstash-core/lib/logstash/filters/base.rb:143:in `do_filter'", "/usr/share/logstash/logstash-core/lib/logstash/filters/base.rb:162:in `block in multi_filter'", "org/jruby/RubyArray.java:1792:in `each'", "/usr/share/logstash/logstash-core/lib/logstash/filters/base.rb:159:in `multi_filter'", "org/logstash/config/ir/compiler/AbstractFilterDelegatorExt.java:115:in `multi_filter'", "(eval):155:in `block in filter_func'", "/usr/share/logstash/logstash-core/lib/logstash/pipeline.rb:358:in `filter_batch'", "/usr/share/logstash/logstash-core/lib/logstash/pipeline.rb:337:in `worker_loop'", "/usr/share/logstash/logstash-core/lib/logstash/pipeline.rb:304:in `block in start_workers'"]}
[ERROR] 2019-05-06 09:02:33.003 [[main]>worker2] ruby - Ruby exception occurred: undefined method `[]' for nil:NilClass
/usr/share/logstash/vendor/bundle/jruby/2.5.0/gems/awesome_print-1.7.0/lib/awesome_print/formatters/base_formatter.rb:31: warning: constant ::Fixnum is deprecated
{
          "path" => "/home/christiane/Dokumente/evtx_shortx.xml",
    "@timestamp" => 2019-05-06T07:02:31.733Z,
     "@metadata" => {
          "host" => "christiane-ThinkPad-X240",
          "path" => "/home/christiane/Dokumente/evtx_shortx.xml",
        "theXML" => {
            "Event" => [
                [0] {
                        "xmlns" => "http://schemas.microsoft.com/win/2004/08/events/event",
                       "System" => {
                            "Execution" => {
                             "ThreadID" => "696",
                            "ProcessID" => "692"
                        },
                              "EventID" => {
                            "Qualifiers" => "",
                               "content" => "1089"
                        },
                                 "Task" => "101",
                             "Security" => {
                            "UserID" => "S-1-5-18"
                        },
                             "Provider" => {
                            "Name" => "Microsoft-Windows-AAD",
                            "Guid" => "{4de9bc9c-b27a-43c9-8994-0915f1a5e24f}"
                        },
                                "Level" => "2",
                              "Version" => "0",
                              "Channel" => "Microsoft-Windows-AAD/Operational",
                             "Keywords" => "0x4000000000000012",
                               "Opcode" => "0",
                             "Computer" => "xxx",
                          "TimeCreated" => {
                            "SystemTime" => "2019-02-01 15:08:46.508312"
                        },
                        "EventRecordID" => "19",
                          "Correlation" => {
                            "RelatedActivityID" => "",
                                   "ActivityID" => "{0114a2a5-ba40-0001-b6a2-140140bad401}"
                        }
                    },
                    "EventData" => {
                        "Data" => {
                               "Name" => "Status",
                            "content" => "0xc00484b2"
                        }
                    }
                },
                [1] {
                        "xmlns" => "http://schemas.microsoft.com/win/2004/08/events/event",
                       "System" => {
                            "Execution" => {
                             "ThreadID" => "696",
                            "ProcessID" => "692"
                        },
                              "EventID" => {
                            "Qualifiers" => "",
                               "content" => "1104"
                        },
                                 "Task" => "101",
                             "Security" => {
                            "UserID" => "S-1-5-18"
                        },
                             "Provider" => {
                            "Name" => "Microsoft-Windows-AAD",
                            "Guid" => "{4de9bc9c-b27a-43c9-8994-0915f1a5e24f}"
                        },
                                "Level" => "2",
                              "Version" => "0",
                              "Channel" => "Microsoft-Windows-AAD/Operational",
                             "Keywords" => "0x4000000000000012",
                               "Opcode" => "0",
                             "Computer" => "xxx",
                          "TimeCreated" => {
                            "SystemTime" => "2019-02-01 15:08:46.508322"
                        },
                        "EventRecordID" => "20",
                          "Correlation" => {
                            "RelatedActivityID" => "",
                                   "ActivityID" => "{0114a2a5-ba40-0001-b6a2-140140bad401}"
                        }
                    },
                    "EventData" => {
                        "Data" => [
                            [0] {
                                   "Name" => "API",
                                "content" => "Plugin initialize"
                            },
                            [1] {
                                   "Name" => "Result",
                                "content" => "3221521586"
                            }
                        ]
                    }
                },

(sorry, had to split the output in two in order to fit in the body)

  [2] {
                            "xmlns" => "http://schemas.microsoft.com/win/2004/08/events/event",
                           "System" => {
                                "Execution" => {
                                 "ThreadID" => "692",
                                "ProcessID" => "688"
                            },
                                  "EventID" => {
                                "Qualifiers" => "",
                                   "content" => "1089"
                            },
                                     "Task" => "101",
                                 "Security" => {
                                "UserID" => "S-1-5-18"
                            },
                                 "Provider" => {
                                "Name" => "Microsoft-Windows-AAD",
                                "Guid" => "{4de9bc9c-b27a-43c9-8994-0915f1a5e24f}"
                            },
                                    "Level" => "2",
                                  "Version" => "0",
                                  "Channel" => "Microsoft-Windows-AAD/Operational",
                                 "Keywords" => "0x4000000000000012",
                                   "Opcode" => "0",
                                 "Computer" => "xxx",
                              "TimeCreated" => {
                                "SystemTime" => "2019-02-05 13:29:08.712667"
                            },
                            "EventRecordID" => "21",
                              "Correlation" => {
                                "RelatedActivityID" => "",
                                       "ActivityID" => "{bf1d25ff-bd56-0005-0026-1dbf56bdd401}"
                            }
                        },
                        "EventData" => {
                            "Data" => {
                                   "Name" => "Status",
                                "content" => "0xc00484b2"
                            }
                        }
                    },
                    [3] {
                            "xmlns" => "http://schemas.microsoft.com/win/2004/08/events/event",
                           "System" => {
                                "Execution" => {
                                 "ThreadID" => "692",
                                "ProcessID" => "688"
                            },
                                  "EventID" => {
                                "Qualifiers" => "",
                                   "content" => "1104"
                            },
                                     "Task" => "101",
                                 "Security" => {
                                "UserID" => "S-1-5-18"
                            },
                                 "Provider" => {
                                "Name" => "Microsoft-Windows-AAD",
                                "Guid" => "{4de9bc9c-b27a-43c9-8994-0915f1a5e24f}"
                            },
                                    "Level" => "2",
                                  "Version" => "0",
                                  "Channel" => "Microsoft-Windows-AAD/Operational",
                                 "Keywords" => "0x4000000000000012",
                                   "Opcode" => "0",
                                 "Computer" => "xxx",
                              "TimeCreated" => {
                                "SystemTime" => "2019-02-05 13:29:08.712677"
                            },
                            "EventRecordID" => "22",
                              "Correlation" => {
                                "RelatedActivityID" => "",
                                       "ActivityID" => "{bf1d25ff-bd56-0005-0026-1dbf56bdd401}"
                            }
                        },
                        "EventData" => {
                            "Data" => [
                                [0] {
                                       "Name" => "API",
                                    "content" => "Plugin initialize"
                                },
                                [1] {
                                       "Name" => "Result",
                                    "content" => "3221521586"
                                }
                            ]
                        }
                    }
                ]
            }
        },
              "host" => "christiane-ThinkPad-X240",
          "@version" => "1",
              "tags" => [
            [0] "multiline",
            [1] "_rubyexception"
        ]
    }

I'm sure I didn't make the correct adjustments to your approach. I tried a few things (like removing the target in the xml filter and setting store_xml to false, and also different parameters in the ruby part) but have had no success so far.

Also, could the large number of _rubyexception tags have something to do with the java.lang.OutOfMemory error? Should I try fixing that first, and could that help?

There is a bug related to removing @metadata fields which means the optimization does not work for such fields. Try

  xml {
    source => "message"
    target => "[theXML]"
    store_xml => true
    remove_namespaces => true
    force_array => false
    remove_field => ["message"]
  }

ruby {
    path => '/home/christiane/splitData.rb'
    script_params => { field => "[theXML][Event]" target => "someOtherField"}
}

Do I have to remove the @metadata everywhere in the file then? Because I still get ruby exceptions for each event, and it's all basically still just one big Document, in this case called "otherField".

[ERROR] 2019-05-07 07:37:51.400 [[main]>worker0] ruby - Ruby exception occurred: undefined method `[]' for nil:NilClass
[ERROR] 2019-05-07 07:37:51.406 [[main]>worker0] ruby - Ruby exception occurred: undefined method `[]' for nil:NilClass
[ERROR] 2019-05-07 07:37:51.407 [[main]>worker0] ruby - Ruby exception occurred: undefined method `[]' for nil:NilClass
[ERROR] 2019-05-07 07:37:51.408 [[main]>worker0] ruby - Ruby exception occurred: undefined method `[]' for nil:NilClass
/usr/share/logstash/vendor/bundle/jruby/2.5.0/gems/awesome_print-1.7.0/lib/awesome_print/formatters/base_formatter.rb:31: warning: constant ::Fixnum is deprecated
{
          "path" => "/home/christiane/Dokumente/evtx_shortx.xml",
    "@timestamp" => 2019-05-07T05:37:49.798Z,
    "otherField" => {
            "xmlns" => "http://schemas.microsoft.com/win/2004/08/events/event",
        "EventData" => {
            "Data" => {
                "content" => "0xc00484b2",
                   "Name" => "Status"
            }
        },
           "System" => {
                     "Task" => "101",
              "Correlation" => {
                       "ActivityID" => "{0114a2a5-ba40-0001-b6a2-140140bad401}",
                "RelatedActivityID" => ""
            },
                 "Keywords" => "0x4000000000000012",
                  "Channel" => "Microsoft-Windows-AAD/Operational",
                   "Opcode" => "0",
                 "Security" => {
                "UserID" => "S-1-5-18"
            },
                 "Provider" => {
                "Guid" => "{4de9bc9c-b27a-43c9-8994-0915f1a5e24f}",
                "Name" => "Microsoft-Windows-AAD"
            },
              "TimeCreated" => {
                "SystemTime" => "2019-02-01 15:08:46.508312"
            },
                "Execution" => {
                 "ThreadID" => "696",
                "ProcessID" => "692"
            },
            "EventRecordID" => "19",
                  "Version" => "0",
                 "Computer" => "xxx",
                  "EventID" => {
                "Qualifiers" => "",
                   "content" => "1089"
            },
                    "Level" => "2"
        }
    },
     "@metadata" => {
        "host" => "christiane-ThinkPad-X240",
        "path" => "/home/christiane/Dokumente/evtx_shortx.xml"
    },
      "@version" => "1",
          "host" => "christiane-ThinkPad-X240",
        "theXML" => {},
          "tags" => [
        [0] "multiline",
        [1] "_rubyexception"
    ]
}
   .... (two more events)
{
          "path" => "/home/christiane/Dokumente/evtx_shortx.xml",
    "@timestamp" => 2019-05-07T05:37:49.798Z,
    "otherField" => {
            "xmlns" => "http://schemas.microsoft.com/win/2004/08/events/event",
        "EventData" => {
            "Data" => [
                [0] {
                    "content" => "Plugin initialize",
                       "Name" => "API"
                },
                [1] {
                    "content" => "3221521586",
                       "Name" => "Result"
                }
            ]
        },
           "System" => {
                     "Task" => "101",
              "Correlation" => {
                       "ActivityID" => "{bf1d25ff-bd56-0005-0026-1dbf56bdd401}",
                "RelatedActivityID" => ""
            },
                 "Keywords" => "0x4000000000000012",
                  "Channel" => "Microsoft-Windows-AAD/Operational",
                   "Opcode" => "0",
                 "Security" => {
                "UserID" => "S-1-5-18"
            },
                 "Provider" => {
                "Guid" => "{4de9bc9c-b27a-43c9-8994-0915f1a5e24f}",
                "Name" => "Microsoft-Windows-AAD"
            },
              "TimeCreated" => {
                "SystemTime" => "2019-02-05 13:29:08.712677"
            },
                "Execution" => {
                 "ThreadID" => "692",
                "ProcessID" => "688"
            },
            "EventRecordID" => "22",
                  "Version" => "0",
                 "Computer" => "xxx",
                  "EventID" => {
                "Qualifiers" => "",
                   "content" => "1104"
            },
                    "Level" => "2"
        }
    },
     "@metadata" => {
        "host" => "christiane-ThinkPad-X240",
        "path" => "/home/christiane/Dokumente/evtx_shortx.xml"
    },
      "@version" => "1",
          "host" => "christiane-ThinkPad-X240",
        "theXML" => {},
          "tags" => [
        [0] "multiline",
        [1] "_rubyexception"
    ]
}

(just removed two events from the output so that it fits)

The value used as the target of the xml filter has to be the prefix of the field name that is passed as field in the script_params option of the ruby filter. So if you use otherField as the target in the xml filter, then the field in script_params has to start with [otherField].

so if I have
target => "[theXML]"
in the xml filter, then it should be:
script_params => {field => "[theXML][Event]" target => "[theXML]"}
in the ruby part?
I tried that as well, the output still looks the same.

There is no [theXML][Event] in the sample messages you posted most recently. There is a field called [theXML][EventData][Event], and in the second sample message that is an array, so it would get split.

If that is the only data in [theXML] that you want (i.e. you are OK discarding System etc.) then you could make the target [theXML]. I would start by setting target to be some other field and seeing if you like what you get.

Yeah, but I want it to split on the field [theXML][Event]; I need the other data as well.
So do I need to put that as the target then?

That field does not exist in your sample messages.

but it did in the one I posted yesterday, before the change without the @metadata, where all the Events were just part of an Array.

OK, if all that has changed from that is removing the [@metadata] prefix then try something like

script_params => {field => "[theXML][Event]" target => "[anotherField]"}

still confused about the target though, it has to be the same in the xml filter then?

No, in fact you most likely want them to be different. Consider this configuration

input { generator { count => 1 message => '<a><b><c>c</c></b><b><c>d</c></b></a>' } }

filter {
    xml {
        source => "message"
        target => "[theXML]"
        store_xml => true
        remove_namespaces => true
        force_array => false
        remove_field => ["message"]
        }
    ruby {
        path => '/home/user/ruby/splitData.rb'
        script_params => { field => "[theXML][b]" target => "someOtherField"}
    }
}
output { stdout { codec => rubydebug { metadata => false } } }

Without the ruby filter the XML would be parsed into a field called theXML which contains an array called b

    "theXML" => {
    "b" => [
        [0] {
            "c" => "c"
        },
        [1] {
            "c" => "d"
        }
    ]

Now we want to split that array and create an event for each entry with the array entry as a field on the event. target tells the code what to call that field. With the ruby filter the events will look like

{
    "someOtherField" => {
        "c" => "d"
    },
    [...]
            "theXML" => {}
}

Does that help?

Just to make sure, I still leave everything else in the input-part, just add the generator, right? Also the other ruby code?
I left everything else the way it was, and it doesn't work.
Though I'm still not sure if I have to remove the @metadata everywhere else as well, like in the mutate- or date-filter?

I am not saying to add the generator to your code. I am just using that to generate a very small piece of XML that contains an array so that I could show a complete example.

You would need to remove [@metadata] everywhere. The field option passed to splitData.rb has to match your XML, so if the array is [theXML][Event] then use that.

So I tried a few things, and I guess the problem is not the newly added ruby part but the one that was there before, the one handling the EventData block. I tried commenting out the other part, and then the split works and I get no ruby exceptions:

"theXML" => {
           "System" => {
                 "Keywords" => "0x4000000000000012",
                    "Level" => "2",
                  "Channel" => "Microsoft-Windows-AAD/Operational",
                  "EventID" => {
                   "content" => "1089",
                "Qualifiers" => ""
            },
              "Correlation" => {
                       "ActivityID" => "{0114a2a5-ba40-0001-b6a2-140140bad401}",
                "RelatedActivityID" => ""
            },
                  "Version" => "0",
                   "Opcode" => "0",
                "Execution" => {
                "ProcessID" => "692",
                 "ThreadID" => "696"
            },
                     "Task" => "101",
                 "Computer" => "xxx",
            "EventRecordID" => "19",
                 "Provider" => {
                "Name" => "Microsoft-Windows-AAD",
                "Guid" => "{4de9bc9c-b27a-43c9-8994-0915f1a5e24f}"
            },
                 "Security" => {
                "UserID" => "S-1-5-18"
            },
              "TimeCreated" => {
                "SystemTime" => "2019-02-01 15:08:46.508312"
            }
        },
            "xmlns" => "http://schemas.microsoft.com/win/2004/08/events/event",
        "EventData" => {
            "Data" => {
                "content" => "0xc00484b2",
                   "Name" => "Status"
            }
        }
    },
          "host" => "christiane-ThinkPad-X240",
      "@version" => "1",
    "@timestamp" => 2019-05-09T05:36:07.229Z,
          "tags" => [
        [0] "multiline"
    ],
          "path" => "/home/christiane/Dokumente/evtx_shortx.xml"
}
{
        "theXML" => {
           "System" => {
                 "Keywords" => "0x4000000000000012",
                    "Level" => "2",
                  "Channel" => "Microsoft-Windows-AAD/Operational",
                  "EventID" => {
                   "content" => "1104",
                "Qualifiers" => ""
            },
              "Correlation" => {
                       "ActivityID" => "{0114a2a5-ba40-0001-b6a2-140140bad401}",
                "RelatedActivityID" => ""
            },
                  "Version" => "0",
                   "Opcode" => "0",
                "Execution" => {
                "ProcessID" => "692",
                 "ThreadID" => "696"
            },
                     "Task" => "101",
                 "Computer" => "xxx",
            "EventRecordID" => "20",
                 "Provider" => {
                "Name" => "Microsoft-Windows-AAD",
                "Guid" => "{4de9bc9c-b27a-43c9-8994-0915f1a5e24f}"
            },
                 "Security" => {
                "UserID" => "S-1-5-18"
            },
              "TimeCreated" => {
                "SystemTime" => "2019-02-01 15:08:46.508322"
            }
        },
            "xmlns" => "http://schemas.microsoft.com/win/2004/08/events/event",
        "EventData" => {
            "Data" => [
                [0] {
                    "content" => "Plugin initialize",
                       "Name" => "API"
                },
                [1] {
                    "content" => "3221521586",
                       "Name" => "Result"
                }
            ]
        }
    },
          "host" => "christiane-ThinkPad-X240",
      "@version" => "1",
    "@timestamp" => 2019-05-09T05:36:07.229Z,
          "tags" => [
        [0] "multiline"
    ],
          "path" => "/home/christiane/Dokumente/evtx_shortx.xml"
}

So now at least we know that the split is working, but how can I get it to show the content from the EventData like before now?

That's what my filter currently looks like, by the way:

filter {
  xml {
        source => "message"
        target => "[theXML]"
        store_xml => true
        remove_namespaces => true
        force_array => false
        remove_field => ["message"]
      }

  ruby {
        path => '/home/christiane/splitData.rb'
        script_params => {field => "[theXML][Event]" target => "[theXML][Event]"}
       }
# ruby {
#       code => '
#               e = event.get("[theXML][Event][EventData][Data]")

#               if e.kind_of?(Array)
#                       e.each { |x|
#                               event.set(x["Name"], x["content"])
#                              }
#               else
#                       event.set(e["Name"], e["content"])
#               end
#               '
#       }
#  mutate {
#       copy => {
#               "[theXML][Event][System][Provider]" => "Provider"
#               "[theXML][Event][System][EventID]" => "EventID"
#               "[theXML][Event][System][Version]" => "Version"
#               "[theXML][Event][System][Level]" => "Level"
#               "[theXML][Event][System][Task]" => "Task"
#               "[theXML][Event][System][Opcode]" => "Opcode"
#               "[theXML][Event][System][Keywords]" => "Keywords"
#               "[theXML][Event][System][TimeCreated]" => "TimeCreated"
#               "[theXML][Event][System][TimeCreated][SystemTime]" => "SystemTime"
#               "[theXML][Event][System][EventRecordID]" => "EventRecordID"
#               "[theXML][Event][System][Correlation]" => "Correlation"
#               "[theXML][Event][System][Execution]" => "Execution"
#               "[theXML][Event][System][Channel]" => "Channel"
#               "[theXML][Event][System][Computer]" => "Computer"
#               "[theXML][Event][System][Security]" => "Security"
#               }
 #         }

  date {
        match => ["[theXML][Event][System][TimeCreated][SystemTime]", "YYYY-MM-dd HH:mm:s$
        timezone => "Europe/Berlin"
        }

  }

Okay, never mind: it's not the other ruby part, it's the mutate filter. I guess it doesn't work like that (with copy) without the [@metadata]?