Duplicated log messages

Hello!

I've recently found that there are duplicate messages in Elasticsearch. Sometimes the same message is duplicated 20 times, but sometimes only 2 times. Of course, in the original log file it only appears once.

I saw a workaround for this that uses the fingerprint plugin to generate a hash for every message. However, I guess that's not a very efficient method. Moreover, I can imagine scenarios where the messages are duplicated in the original log file, and of course in those cases I don't want to ignore them.

Isn't the sincedb file's purpose to avoid loading a line multiple times?

Just in case, I'm pasting the Logstash config file here:

input {
    file {
        # Tail server.* and frontend.* logs for every environment/application
        # directory under /data/logs; the two path wildcards are parsed back
        # out into fields by the grok on "path" in the filter section.
        path => "/data/logs/*/*/{server.*.log,frontend.*.log}"
        # Read pre-existing files from the top the first time they are seen;
        # afterwards the persisted sincedb offset takes precedence.
        start_position => "beginning"
        # Persist read offsets so lines are not re-ingested across restarts.
        sincedb_path => "/data/logs/sincedb/.sincedb_superappstack"

        # Join continuation lines (stack traces, wrapped output) onto the
        # preceding line that begins with an ISO8601 timestamp.
        # NOTE(review): one multiline codec instance is shared by ALL files
        # matched by this input, so interleaved reads from multiple files can
        # merge or re-emit lines across files -- verify whether this explains
        # the duplicated events before blaming sincedb.
        codec => multiline {
            pattern => "^%{TIMESTAMP_ISO8601}"
            negate => true
            what => "previous"
        }
    }
}

filter{
    # Derive [environment] and [application] from the file's location:
    # /data/logs/<environment>/<application>/<file>.
    grok {
        match => {
            "path" => "/data/logs/%{NOTSPACE:environment}/%{NOTSPACE:application}/"
        }
    }

    if [application] == "frontend" {
        # Frontend line layout:
        #   <timestamp> [<thread>] <LEVEL> <sessionid>? <clientip>? <class> - <message>
        # (?m) lets GREEDYDATA span the extra lines joined by the multiline
        # codec; loglevel alternatives are space-padded to a fixed width and
        # trimmed by the mutate/strip below.
        grok {
            match => {
                "message" => "(?m)%{TIMESTAMP_ISO8601:timestamp} \[%{DATA:thread}\] (?<loglevel>(DEBUG|INFO |WARN |ERROR)) (%{NOTSPACE:sessionid})? (%{IP:clientip})? %{NOTSPACE:class} \- %{GREEDYDATA:message}"
            }

            # Replace the raw multiline event with just the message payload.
            overwrite => [
                "message"
            ]
        }

        # Pull optional callback details (type, target host, retry counter)
        # out of the message. The whole pattern is optional, so non-callback
        # messages do not get tagged with _grokparsefailure here.
        grok {
            match => {
                "message" => "((?<callbacktype>.+) via callback \(((http|https):\/\/(?<callbackurl>[^\/]+).+)?\).+\(try: (?<callbacktry>\d))?"
            }
        }
        
        # Callback events without a parsable URL get a placeholder so the
        # field is always present on callback events.
        if [callbacktype] {
            if ![callbackurl] {
                mutate {
                    add_field => { "callbackurl" => "NA" }
                }
            }
        }

        # if exists... prevent geoip parse failure
        # Only run geoip when clientip looks like a dotted-quad IPv4 address.
        if [clientip] =~ /^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$/ {
            geoip {
                source => "clientip"
            }
        }
    } else {
        # Server line layout:
        #   <timestamp> [<thread>] <LEVEL> <sessionid>? (<username>|()) <class> - <message>
        grok {
            match => {
                "message" => "(?m)%{TIMESTAMP_ISO8601:timestamp} \[%{DATA:thread}\] (?<loglevel>(DEBUG|INFO |WARN |ERROR)) (%{WORD:sessionid})? (%{USERNAME:username}|\(\))? %{DATA:class} \- %{GREEDYDATA:message}"
            }

            overwrite => [
                "message"
            ]
        }
    }

    # Trim the padding space captured with "INFO " / "WARN " above.
    mutate {
        strip => [ "loglevel" ]
    }

    # Use the log line's own timestamp as the event time, then drop the
    # temporary field.
    # NOTE(review): the pattern carries no timezone, so the Logstash host's
    # local timezone is assumed -- confirm the log files are written in it.
    date {
        match => [
            "timestamp" , "YYYY-MM-dd HH:mm:ss.SSS"
        ]
        remove_field => [ "timestamp" ]
    }
}

output {
    # Events that failed grok parsing are written to disk for inspection
    # instead of being indexed into Elasticsearch.
    if "_grokparsefailure" in [tags] {
        file {
            path => "/data/logs/grok_failures"
        }
    } else {
        # One daily index per environment.
        elasticsearch {
            hosts => ["localhost:9200"]
            index => "superappstack_%{environment}-%{+YYYY.MM.dd}"
        }
    }
}

Thank you in advance!

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.