Hello!
I've recently found that there are duplicate messages in Elasticsearch. Sometimes the same message is duplicated 20 times, sometimes only twice. In the original log file, of course, it appears only once.
I saw a workaround for this that uses the fingerprint plugin to generate a hash for every message. However, I don't think that is a very efficient method. Moreover, I can imagine scenarios in which a message is genuinely repeated in the original log file, and in those cases I don't want the repeats to be dropped.
Isn't the purpose of the sincedb file to avoid loading the same line multiple times?
In case it helps, here is my Logstash config file:
# Input stage: read the server.* and frontend.* log files that sit two
# directory levels below /data/logs.
input {
  file {
    path => "/data/logs/*/*/{server.*.log,frontend.*.log}"
    # Read positions are persisted in this explicit sincedb file.
    sincedb_path => "/data/logs/sincedb/.sincedb_superappstack"
    # Read files from their first line instead of from the end.
    start_position => "beginning"
    # Join continuation lines: any line that does NOT begin with an ISO8601
    # timestamp (negate => true) is appended to the previous line.
    codec => multiline {
      pattern => "^%{TIMESTAMP_ISO8601}"
      negate => true
      what => "previous"
    }
  }
}
# Filter stage: derive environment/application from the file path, parse the
# log line with an application-specific grok, then normalise level and date.
filter {
  # The two directory names after /data/logs/ become the "environment" and
  # "application" fields.
  grok {
    match => { "path" => "/data/logs/%{NOTSPACE:environment}/%{NOTSPACE:application}/" }
  }

  if [application] == "frontend" {
    # Frontend line layout: timestamp [thread] level sessionid? clientip? class - text.
    # The parsed text replaces the original "message" field (overwrite).
    grok {
      match => { "message" => "(?m)%{TIMESTAMP_ISO8601:timestamp} \[%{DATA:thread}\] (?<loglevel>(DEBUG|INFO |WARN |ERROR)) (%{NOTSPACE:sessionid})? (%{IP:clientip})? %{NOTSPACE:class} \- %{GREEDYDATA:message}" }
      overwrite => [ "message" ]
    }

    # Callback details. The whole expression is wrapped in an optional group,
    # so lines without callback information still match and are not tagged
    # as grok failures.
    # NOTE(review): callbacktry captures a single digit only - confirm that
    # retry counts never reach 10.
    grok {
      match => { "message" => "((?<callbacktype>.+) via callback \(((http|https):\/\/(?<callbackurl>[^\/]+).+)?\).+\(try: (?<callbacktry>\d))?" }
    }

    # A callback line without a URL gets the placeholder value "NA".
    if [callbacktype] {
      if ![callbackurl] {
        mutate {
          add_field => { "callbackurl" => "NA" }
        }
      }
    }

    # Run geoip only when clientip is present and looks like a dotted-quad
    # address; this prevents geoip parse failures.
    if [clientip] =~ /^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$/ {
      geoip {
        source => "clientip"
      }
    }
  } else {
    # Layout for every other application:
    # timestamp [thread] level sessionid? (username | "()")? class - text.
    grok {
      match => { "message" => "(?m)%{TIMESTAMP_ISO8601:timestamp} \[%{DATA:thread}\] (?<loglevel>(DEBUG|INFO |WARN |ERROR)) (%{WORD:sessionid})? (%{USERNAME:username}|\(\))? %{DATA:class} \- %{GREEDYDATA:message}" }
      overwrite => [ "message" ]
    }
  }

  # "INFO " and "WARN " are captured with their trailing padding space;
  # strip it here.
  mutate {
    strip => [ "loglevel" ]
  }

  # Parse the extracted timestamp, then drop the helper field.
  date {
    match => [ "timestamp", "YYYY-MM-dd HH:mm:ss.SSS" ]
    remove_field => [ "timestamp" ]
  }
}
# Output stage: events tagged as grok failures go to a local file for
# inspection; all other events are indexed into Elasticsearch.
output {
  if "_grokparsefailure" in [tags] {
    file {
      "path" => "/data/logs/grok_failures"
    }
  } else {
    elasticsearch {
      hosts => [ "localhost:9200" ]
      # One index per environment and per day.
      index => "superappstack_%{environment}-%{+YYYY.MM.dd}"
    }
  }
}
Thank you in advance!