We generate an _id field, which we set as the document_id in our elasticsearch output, so that if we ever load the same log twice we don't get duplicate events.
However, we are now seeing lots of duplicate _id fields (see below).
Our id consists of the logfile name, the timestamp from the log line, and the Filebeat offset field, combined as %{logfile}_%{timestamp}_%{offset}. That combination should surely be unique.
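For example, for a line from access-app01-prod with timestamp 2019-02-26T01:10:25.000Z at offset 26546834 (the second document below), we expect exactly one document with the _id:

access-app01-prod_2019-02-26T01:10:25.000Z_26546834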
filter {
  ...
  mutate {
    rename => { "@timestamp" => "timestamp" }
    update => {
      "source" => "%{[fields][source]}"
    }
    add_field => {
      "server" => "%{[fields][server]}"
      "logfile" => "%{[fields][source]}-%{[fields][server]}-%{[fields][system]}"
      "[@metadata][system]" => "%{[fields][system]}"
      "[@metadata][offset]" => "%{offset}"
      "[@metadata][id]" => "%{logfile}_%{timestamp}_%{offset}"
    }
  }
}
output {
  elasticsearch {
    hosts => ["localhost"]
    index => "%{[@metadata][index]}"
    document_id => "%{[@metadata][id]}"
    document_type => "_doc"
  }
}
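Note: the [@metadata] fields are stripped before the event is sent to the output, so they never appear in _source. A minimal sketch for inspecting them while testing (not part of our production config) is a stdout output with the rubydebug codec's metadata option enabled:

output {
  stdout { codec => rubydebug { metadata => true } }
}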
We are fairly sure we're not processing the same logs twice, but even if we were, we'd expect a recurring _id to overwrite the existing document rather than create a new one.
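That overwrite behaviour is what we rely on. A minimal sketch of it against a scratch index (the index name docs-test and the id are made up for illustration):

PUT docs-test/_doc/my-id
{ "field": "first write" }
# => "result": "created", "_version": 1

PUT docs-test/_doc/my-id
{ "field": "second write" }
# => "result": "updated", "_version": 2

The second request updates the existing document in place; a search for that _id still returns a single hit.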
As you can see below, we have two documents that come from exactly the same logfile line. We even have further documents from that same line, sometimes with the _id field set to a comma-joined list of three values:
"_id": "access-app01-prod_2019-02-26T01:10:25.000Z_26546834,access-app01-prod,access-app01-prod_2019-02-26T01:10:25.000Z_26546834",
It's chaos...
{
  "took": 10,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 2,
    "max_score": null,
    "hits": [
      {
        "_index": "access-prod1",
        "_type": "_doc",
        "_id": "access-app01-prod_2019-02-26T01:10:25.000Z_26546834,access-app01-prod,access-app01-prod_2019-02-26T01:10:25.000Z_26546834",
        "_score": null,
        "_source": {
          "timestamp": "2019-02-26T01:10:25.000Z",
          "category": "XX3",
          "sessionId": "XXX1",
          "status": "200",
          "tags": [
            "beats_input_codec_plain_applied"
          ],
          "userAgent": "FF1",
          "path": [
            "XX2",
            "XX4"
          ],
          "timeTaken": "31",
          "logfile": [
            "access-app01-prod",
            "access-app01-prod"
          ],
          "method": "GET",
          "subcategory": "XX4",
          "beat": {
            "name": "uznc74",
            "version": "6.6.1",
            "hostname": "uznc74"
          },
          "server": [
            "app01",
            "app01"
          ],
          "prospector": {
            "type": "log"
          },
          "type": "INFO",
          "host": {
            "name": "uznc74"
          },
          "source": "access",
          "@version": "1",
          "bytesSent": "59579",
          "log": {
            "file": {
              "path": "D:\\logs\\tomcat\\access.20190226.log"
            }
          },
          "language": "pl",
          "ip": "62.111.143.82",
          "url": "/XX2/XX4",
          "message": "[26/Feb/2019:02:10:25 +0100] 194.9.122.200 62.111.143.82 200 \"GET /XX2/XX4 HTTP/1.1\" 59579 31 XXX1 \"FF1\" \"en\"",
          "input": {
            "type": "log"
          },
          "offset": 26546834
        },
        "sort": [
          1551143425000
        ]
      },
      {
        "_index": "access-prod1",
        "_type": "_doc",
        "_id": "access-app01-prod_2019-02-26T01:10:25.000Z_26546834",
        "_score": null,
        "_source": {
          "ip": "62.111.143.82",
          "prospector": {
            "type": "log"
          },
          "method": "GET",
          "url": "/XX2/XX4",
          "@version": "1",
          "path": [
            "XX2",
            "XX4"
          ],
          "offset": 26546834,
          "log": {
            "file": {
              "path": "D:\\logs\\tomcat\\access.20190226.log"
            }
          },
          "host": {
            "name": "uznc74"
          },
          "source": "access",
          "bytesSent": "59579",
          "status": "200",
          "server": "app01",
          "message": "[26/Feb/2019:02:10:25 +0100] 194.9.122.200 62.111.143.82 200 \"GET /XX2/XX4 HTTP/1.1\" 59579 31 XXX1 \"FF1\" \"en\"",
          "logfile": "access-app01-prod",
          "timestamp": "2019-02-26T01:10:25.000Z",
          "userAgent": "FF1",
          "category": "XX3",
          "subcategory": "XX4",
          "sessionId": "XXX1",
          "beat": {
            "version": "6.6.1",
            "hostname": "uznc74",
            "name": "uznc74"
          },
          "timeTaken": "31",
          "tags": [
            "beats_input_codec_plain_applied"
          ],
          "language": "pl",
          "type": "INFO",
          "input": {
            "type": "log"
          }
        },
        "sort": [
          1551143425000
        ]
      }
    ]
  }
}