Exception in pipeline worker and Java Out of Memory Errors

Hello,

I'm running ELK 6.6 and trying to index about 2,000 JSON files, and I'm getting the following error:

java.lang.OutOfMemoryError: Java heap space
Dumping heap to java_pid6685.hprof ...
Heap dump file created [24803815009 bytes in 292.571 secs]
[ERROR] 2019-02-25 20:21:26.757 [[main]>worker5] pipeline - Exception in pipelineworker, the pipeline stopped processing new events, please check your filter configuration and restart Logstash. {:pipeline_id=>"main", "exception"=>"Java heap space", "backtrace"=>["java.util.IdentityHashMap.init(IdentityHashMap.java:255)", "java.util.IdentityHashMap.<init>(IdentityHashMap.java:227)", "org.logstash.ConvertedMap.<init>(ConvertedMap.java:47)", "org.logstash.ConvertedMap.newFromMap(ConvertedMap.java:51)", "org.logstash.Valuefier.lambda$initConverters$12(Valuefier.java:143)", "org.logstash.Valuefier$$Lambda$152/1973233285.convert(Unknown Source)", "org.logstash.Valuefier.convert(Valuefier.java:73)", "org.logstash.ConvertedList.newFromList(ConvertedList.java:27)", "org.logstash.Valuefier.lambda$initConverters$13(Valuefier.java:144)", "org.logstash.Valuefier$$Lambda$153/981189584.convert(Unknown Source)", "org.logstash.Valuefier.convert(Valuefier.java:73)", "org.logstash.ConvertedMap.newFromMap(ConvertedMap.java:55)", "org.logstash.Valuefier.lambda$initConverters$12(Valuefier.java:143)", "org.logstash.Valuefier$$Lambda$152/1973233285.convert(Unknown Source)", "org.logstash.Valuefier.convert(Valuefier.java:73)", "org.logstash.ConvertedMap.newFromMap(ConvertedMap.java:55)", "org.logstash.Event.clone(Event.java:268)", "org.logstash.ext.JrubyEventExtLibrary$RubyEvent.rubyClone(JrubyEventExtLibrary.java:147)", "org.logstash.ext.JrubyEventExtLibrary$RubyEvent.rubyClone(JrubyEventExtLibrary.java:142)", "java.lang.invoke.LambdaForm$DMH/247944893.invokeSpecial_LL_L(LambdaForm$DMH)", "java.lang.invoke.LambdaForm$BMH/679143692.reinvoke(LambdaForm$BMH)", "java.lang.invoke.LambdaForm$MH/2045766957.delegate(LambdaForm$MH)", "java.lang.invoke.LambdaForm$MH/187150864.guard(LambdaForm$MH)", "java.lang.invoke.LambdaForm$MH/2045766957.delegate(LambdaForm$MH)", "java.lang.invoke.LambdaForm$MH/187150864.guard(LambdaForm$MH)", "java.lang.invoke.LambdaForm$MH/1985178707.linkToCallSite(LambdaForm$MH)", "usr.share.logstash.vendor.bundle.jruby.$2_dot_3_dot_0.gems.logstash_minus_filter_minus_split_minus_3_dot_1_dot_7.lib.logstash.filters.split.RUBY$block$filter$1(/usr/share/logstash/vendor/bundle/jruby/2.3.0/gems/logstash-filter-split-3.1.7/lib/logstash/filters/split.rb:88)", "java.lang.invoke.LambdaForm$DMH/1953515933.invokeStatic_L8_L(LambdaForm$DMH)", "java.lang.invoke.LambdaForm$MH/479442206.invokeExact_MT(LambdaForm$MH)", "org.jruby.runtime.CompiledIRBlockBody.yieldDirect(CompiledIRBlockBody.java:156)", "org.jruby.runtime.BlockBody.yield(BlockBody.java:114)", "org.jruby.runtime.Block.yield(Block.java:165)"], :thread=>"#<Thread:0xcc26f9a sleep>"}
[ERROR] 2019-02-25 20:21:27.265 [LogStash::Runner] Logstash - java.lang.OutOfMemoryError: Java heap space

My config file looks like this:

input {
  file {
    type => "json"
    codec => "json"
    path => "/home/ubuntu/json/*"
    sincedb_path => "/var/tmp/.sincedb_threats"
    start_position => "beginning"
  }
}

filter {

  # Emit one event per element of the [Event][Attribute] array
  split {
    field => "[Event][Attribute]"
  }

  # Placeholder field; populated by the type normalisation below
  mutate {
    add_field => { "intel-type-metadata" => "" }
  }

  # IP Fields
  if [Event][Attribute][type] == "ip-src" {
    mutate {
      replace => [ "[Event][Attribute][type]", "IP" ]
      update => {"intel-type-metadata" => "ip-src"}
    }
  } else if [Event][Attribute][type] == "ip-dst" {
    mutate {
      replace => [ "[Event][Attribute][type]", "IP" ]
      update => {"intel-type-metadata" => "ip-dst"}
    }
  } else if [Event][Attribute][type] == "dst-ip" {
    mutate {
      replace => [ "[Event][Attribute][type]", "IP" ]
      update => {"intel-type-metadata" => "dst-ip"}
    }
  }

  if [Event][Object] {
    mutate {
      remove_field => [ "[Event][Object]" ]
    }
  }

  # Flatten the remaining nested fields to top-level names, then drop the Event tree
  mutate {
    rename => {
      "[Event][info]" => "intel-source"
      "[Event][analysis]" => "intel-analysis"
      "[Event][Attribute][comment]" => "comments"
      "[Event][Attribute][category]" => "intel-category"
      "[Event][Attribute][uuid]" => "intel-uuid"
      "[Event][Attribute][timestamp]" => "intel-timestamp"
      "[Event][Attribute][to_ids]" => "exportable-to-ids"
      "[Event][Attribute][value]" => "intel-value"
      "[Event][Attribute][type]" => "intel-type"
      "[Event][threat_level_id]" => "threat-level-id"
    }

    convert => {
      "intel-analysis" => "integer"
      "threat-level-id" => "integer"
    }

    add_tag => [ "misp_daily_updates" ]

    remove_field => [ "[Event]" ]
  }

  date {
    match => [ "intel-timestamp", "UNIX" ]
    target => "intel-timestamp"
  }

  # Map the numeric analysis state to a human-readable label
  translate {
    field => "intel-analysis"
    destination => "intel-analysis-code"
    dictionary => {
        "0" => "Initial"
        "1" => "Ongoing"
        "2" => "Complete"
    }
  }

  # Map the numeric threat level to a human-readable label
  translate {
    field => "threat-level-id"
    destination => "threat-level-code"
    dictionary => {
        "1" => "High"
        "2" => "Medium"
        "3" => "Low"
        "4" => "Undefined"
    }
  }

  # Extract an IP from intel-value when there is one; the empty
  # tag_on_failure avoids tagging non-IP values as failures
  grok {
    match => { "intel-value" => "%{IP:intel-ip}" }
    tag_on_failure => [ ]
  }

  # add geoip attributes
  geoip {
    source => "intel-ip"
    tag_on_failure => [ ]
  }

  # Fingerprinting to remove duplicates
  fingerprint {
    concatenate_sources => true
    source => ["intel-type", "intel-value"]
    target => "[@metadata][fingerprint]"
    method => "MURMUR3"
  }

  if "_jsonparsefailure" in [tags] {
    drop { }
  }
}

output {
  stdout { codec => rubydebug }
  elasticsearch {
      hosts => ["localhost"]
      document_id => "%{[@metadata][fingerprint]}"
      index => "threats-%{+YYYY.MM.dd}"
  }
}

Any help is appreciated! Thanks in advance.

I would move that _jsonparsefailure drop up to the top of the filter {}. No point in doing work on an event if you are going to drop it.
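Something like this, just the relevant bits (a sketch, not your full config):

filter {
  if "_jsonparsefailure" in [tags] {
    drop { }
  }

  split {
    field => "[Event][Attribute]"
  }

  # ... the rest of the filters, unchanged
}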

There is nothing I can see in the filter that would obviously cause a memory leak. You are going to have to load the whole ~24 GB heap dump into Eclipse Memory Analyzer (MAT) or a similar tool and see what the large objects are. I suspect the majority of the memory will have accumulated in one collection.
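If the MAT GUI cannot open a dump that size, MAT also ships a headless parser that writes out a Leak Suspects report without the UI. Roughly like this (the path is whatever your dump is actually called, and give MAT itself a big -Xmx in MemoryAnalyzer.ini first):

./ParseHeapDump.sh /path/to/java_pid6685.hprof org.eclipse.mat.api:suspects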

Once MAT has loaded and parsed the dump (which may take hours), if you have to spend more than 60 seconds figuring out where the memory is being used, give up. If it is not super obvious, you are unlikely to find it.
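One more thought: the dump was ~24 GB, so the heap is already large and simply raising -Xmx in config/jvm.options probably won't save you. Since split fans every incoming event out into one event per [Event][Attribute] element (that is what the Event.clone in your backtrace is doing), you could try lowering the number of in-flight events while you investigate. These are standard settings in config/logstash.yml; the values below are just a starting point, not tuned for your data:

pipeline.workers: 2
pipeline.batch.size: 25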
