Hi, I have a problem with Logstash: it starts up and works normally, but later I see huge heap usage. After analyzing the garbage collector logs, I see that the Old Generation keeps growing and is not cleaned up well. This leads to degradation of the service and to the process being OOM-killed.
I took a heap dump of the Logstash process, and the analysis (using HeapHero) shows 90% "Inefficient Primitive Array".
Software version:
- Logstash 7.6.2
- Java 8
Logstash configuration:
# GELF input listening on port 12210; every event is tagged with type "gelf-1".
input {
  gelf {
    port  => 12210
    type  => "gelf-1"
    remap => true
  }
}
# For GELF events (type contains "gelf" but not "events"): decode the JSON in
# [message] into [doc], copy selected values into flat, dot-named top-level
# fields, then prune every top-level field that is not on the whitelist.
filter {
  if "gelf" in [type] and "events" not in [type] {
    # Decode [message] as JSON into [doc]; the original [message] is kept.
    json {
      source => "message"
      target => "doc"
      #remove_field => ["message"]
    }

    # collect errors
    # NOTE(review): this tests [doc][log-level], while the "collect level" step
    # further down reads [doc][level] - confirm which key the producers emit.
    if [doc][log-level] == "warn" or [doc][log-level] == "error" {
      # NOTE(review): [err][message] is read from the event root, not from
      # [doc] - confirm this is intended.
      if "" in [err][message] {
        mutate { add_field => { "error.message" => "%{[err][message]}" } }
      }
      if "" in [doc][message] {
        mutate { add_field => { "error.message" => "%{[doc][message]}" } }
      }
    }

    if [doc][err] {
      if [doc][err][name] {
        mutate { add_field => { "error.name" => "%{[doc][err][name]}" } }
      }
      if [doc][err][message] {
        mutate { add_field => { "error.message" => "%{[doc][err][message]}" } }
      }
      if [doc][err][code] {
        mutate { add_field => { "error.code" => "%{[doc][err][code]}" } }
      }
    }

    if [facility] == "management" {
      # Everything below, including the response fields, is only examined
      # when [doc][request] is present.
      if [doc][request] {
        if [doc][request][url] {
          mutate { add_field => { "request.url" => "%{[doc][request][url]}" } }
        }

        if [doc][request][data] {
          if [doc][request][data][merch_id] {
            mutate { add_field => { "request.data.merch_id" => "%{[doc][request][data][merch_id]}" } }
          }
          if [doc][request][data][cust_id] {
            mutate { add_field => { "request.data.cust_id" => "%{[doc][request][data][cust_id]}" } }
          }
          if [doc][request][data][cust_session_id] {
            mutate { add_field => { "request.data.cust_session_id" => "%{[doc][request][data][cust_session_id]}" } }
          }
          if [doc][request][data][game_code] {
            mutate { add_field => { "request.data.game_code" => "%{[doc][request][data][game_code]}" } }
          }
          if [doc][request][data][ticket] {
            mutate { add_field => { "request.data.ticket" => "%{[doc][request][data][ticket]}" } }
          }
          if [doc][request][data][amount] {
            mutate { add_field => { "request.data.amount" => "%{[doc][request][data][amount]}" } }
          }
          if [doc][request][data][currency_code] {
            mutate { add_field => { "request.data.currency_code" => "%{[doc][request][data][currency_code]}" } }
          }
          if [doc][request][data][trx_id] {
            mutate { add_field => { "request.data.trx_id" => "%{[doc][request][data][trx_id]}" } }
          }
          if [doc][request][data][game_id] {
            mutate { add_field => { "request.data.game_id" => "%{[doc][request][data][game_id]}" } }
          }
          if [doc][request][data][round_id] {
            mutate { add_field => { "request.data.round_id" => "%{[doc][request][data][round_id]}" } }
          }
          if [doc][request][data][event_type] {
            mutate { add_field => { "request.data.event_type" => "%{[doc][request][data][event_type]}" } }
          }
          if [doc][request][data][event_id] {
            mutate { add_field => { "request.data.event_id" => "%{[doc][request][data][event_id]}" } }
          }
          if [doc][request][data][platform] {
            mutate { add_field => { "request.data.platform" => "%{[doc][request][data][platform]}" } }
          }
          if [doc][request][data][game_type] {
            mutate { add_field => { "request.data.game_type" => "%{[doc][request][data][game_type]}" } }
          }
          if [doc][request][data][game_status] {
            mutate { add_field => { "request.data.game_status" => "%{[doc][request][data][game_status]}" } }
          }
        }

        if [doc][responseCode] {
          mutate { add_field => { "responseCode" => "%{[doc][responseCode]}" } }
        }
        if [doc][responseBody] {
          mutate { add_field => { "responseBody" => "%{[doc][responseBody]}" } }
        }

        if [doc][response] {
          if [doc][response][error_code] {
            mutate { add_field => { "response.error_code" => "%{[doc][response][error_code]}" } }
          }
          if [doc][response][error_msg] {
            mutate { add_field => { "response.error_msg" => "%{[doc][response][error_msg]}" } }
          }
          if [doc][response][balance] {
            mutate { add_field => { "response.balance" => "%{[doc][response][balance]}" } }
          }
          if [doc][response][currency_code] {
            mutate { add_field => { "response.currency_code" => "%{[doc][response][currency_code]}" } }
          }
          if [doc][response][free_bet_count] {
            mutate { add_field => { "response.free_bet_count" => "%{[doc][response][free_bet_count]}" } }
          }
          if [doc][response][trx_id] {
            mutate { add_field => { "response.trx_id" => "%{[doc][response][trx_id]}" } }
          }
        }
      }
    }

    # Runs for every event in this branch, whether or not the field was added.
    mutate { convert => { "request.data.amount" => "integer" } }

    # collect trace-id
    # The three checks below match on the raw [message] text, not on the
    # presence of the corresponding [doc] key.
    if [message] =~ /trace-id/ {
      mutate { add_field => { "trace-id" => "%{[doc][trace-id]}" } }
    }

    # collect name
    if [message] =~ /name/ {
      mutate { add_field => { "name" => "%{[doc][name]}" } }
    }

    # collect level
    if [message] =~ /level/ {
      mutate { add_field => { "log-level" => "%{[doc][level]}" } }
    }

    # Keep only the whitelisted fields; [doc] is not listed.
    prune {
      whitelist_names => [
        "error.message", "error.name", "error.code",
        "request.url",
        "request.data.merch_id", "request.data.cust_id", "request.data.cust_session_id",
        "request.data.game_code", "request.data.ticket", "request.data.amount",
        "request.data.currency_code", "request.data.trx_id", "request.data.game_id",
        "request.data.round_id", "request.data.event_type", "request.data.event_id",
        "request.data.platform", "request.data.game_type", "request.data.game_status",
        "responseCode", "responseBody",
        "response.error_code", "response.error_msg", "response.balance",
        "response.currency_code", "response.free_bet_count", "response.trx_id",
        "trace-id", "name", "log-level", "level",
        "message", "short_message", "source_host", "type", "facility", "@timestamp", "host"
      ]
    }
  }
}
# Ship events to Elasticsearch: events of type "gelf-1" go to the fixed index
# "logstash-gelf-1", all other events to a date-stamped "-other" index.
# The conditional must live inside an `output { ... }` section, and both the
# else branch and the section need their closing braces.
output {
  if [type] == "gelf-1" {
    elasticsearch {
      hosts => [
        "172.29.14.101:9200", "172.29.14.102:9200", "172.29.14.103:9200",
        "172.29.14.104:9200", "172.29.14.105:9200", "172.29.14.106:9200",
        "172.29.14.107:9200", "172.29.14.108:9200", "172.29.14.109:9200"
      ]
      index => "logstash-gelf-1"
    }
  } else {
    elasticsearch {
      hosts => [
        "172.29.14.101:9200", "172.29.14.102:9200", "172.29.14.103:9200",
        "172.29.14.104:9200", "172.29.14.105:9200", "172.29.14.106:9200",
        "172.29.14.107:9200", "172.29.14.108:9200", "172.29.14.109:9200"
      ]
      index => "logstash-%{+YYYY.MM.dd}-other"
    }
  }
}
And the following JVM configuration:
## Heap sizing: initial and maximum heap are both fixed at 24G
-Xms24G
-Xmx24G
-XX:+UseCompressedOops

## GC configuration (G1)
-XX:+UseG1GC
-XX:MaxGCPauseMillis=100
-XX:MaxTenuringThreshold=8

## String deduplication
-XX:+UseStringDeduplication
-XX:StringDeduplicationAgeThreshold=3

## Miscellaneous tuning flags
## NOTE(review): presumably legacy tuning flags - verify they are still wanted on the target JDK
-XX:+AggressiveOpts
-XX:+UseFastAccessorMethods

## GC logging to /tmp/gc.log
-verbose:gc
-Xloggc:/tmp/gc.log
-XX:+PrintGCDetails
-XX:+PrintGCDateStamps
I suspected that some of the filters I use have memory leaks, but it looks like they are all updated to the latest version. Please help me with any advice; I have already spent a lot of time trying to solve this problem.