Hi, guys. Sorry to interrupt, but maybe someone can help me with a search query I'm trying to figure out.
Here goes. I have an index to which Filebeat sends logs, for example from /var/log/messages. Documents look like this:
{
"_index": "log-infra-2023.02",
"_id": "IaX7MYYBlZu06yQiL1Vh",
"_version": 1,
"_score": 0,
"_ignored": [
"message.keyword",
"event.original.keyword"
],
"_source": {
"fingerprint": "feaa1ab964941b1d86fa695e8f2a01dd97803b3bd398acebb2f953a971bac53a",
"agent": {
"ephemeral_id": "cf0c7c36-e721-46ed-9409-c54942940186",
"version": "8.4.1",
"id": "306e9554-3384-4ddf-9b0c-fc6d0785e65c",
"name": "localhost.localdomain"
},
"input": {
"type": "log"
},
"@timestamp": "2023-02-08T17:03:04.552Z",
"message": "2023-02-08T19:02:54.960888+02:00 localhost.localdomain kernel: [2039643.113272] IPTables-Dropped: IN=eth0 OUT= MAC=00:50:56:ab:d4:48:00:50:56:ab:60:c8:08:00 SRC=10.30.30.30 DST=10.10.10.10 LEN=60 TOS=0x00 PREC=0x00 TTL=64 ID=13097 DF PROTO=TCP SPT=52512 DPT=443 WINDOW=29200 RES=0x00 SYN URGP=0 ",
"event": {
"original": "2023-02-08T19:02:54.960888+02:00 localhost.localdomain kernel: [2039643.113272] IPTables-Dropped: IN=eth0 OUT= MAC=00:50:56:ab:d4:48:00:50:56:ab:60:c8:08:00 SRC=10.30.30.30 DST=10.10.10.10 LEN=60 TOS=0x00 PREC=0x00 TTL=64 ID=13097 DF PROTO=TCP SPT=52512 DPT=443 WINDOW=29200 RES=0x00 SYN URGP=0 ",
"timezone": "+02:00",
"module": "system",
"dataset": "system.syslog"
},
"@version": "1",
"host": {
"ip": "10.20.20.20",
"name": "localhost.localdomain"
},
"logstash_server": "logstash.localdomain",
"sk_service": "infra",
"log": {
"offset": 75241,
"file": {
"path": "/var/log/messages"
}
},
"tags": [
"beats_input_codec_plain_applied"
],
"ecs": {
"version": "1.12.0"
},
"service": {
"type": "system"
},
"fileset": {
"name": "syslog"
}
},
"fields": {
"agent.version.keyword": [
"8.4.1"
],
"service.type.keyword": [
"system"
],
"input.type.keyword": [
"log"
],
"host.name.keyword": [
"localhost.localdomain"
],
"event.dataset.keyword": [
"system.syslog"
],
"tags.keyword": [
"beats_input_codec_plain_applied"
],
"logstash_server": [
"logstash.localdomain"
],
"service.type": [
"system"
],
"host.ip": [
"10.20.20.20"
],
"ecs.version.keyword": [
"1.12.0"
],
"host.ip.keyword": [
"10.20.20.20"
],
"indexingDate": [
"2023-02-08"
],
"event.module": [
"system"
],
"fingerprint": [
"feaa1ab964941b1d86fa695e8f2a01dd97803b3bd398acebb2f953a971bac53a"
],
"@version": [
"1"
],
"agent.name": [
"localhost.localdomain"
],
"logstash_server.keyword": [
"logstash.localdomain"
],
"host.name": [
"localhost.localdomain"
],
"sk_service.keyword": [
"infra"
],
"log.file.path.keyword": [
"/var/log/messages"
],
"event.timezone": [
"+02:00"
],
"agent.ephemeral_id.keyword": [
"cf0c7c36-e721-46ed-9409-c54942940186"
],
"event.original": [
"2023-02-08T19:02:54.960888+02:00 localhost.localdomain kernel: [2039643.113272] IPTables-Dropped: IN=eth0 OUT= MAC=00:50:56:ab:d4:48:00:50:56:ab:60:c8:08:00 SRC=10.30.30.30 DST=10.10.10.10 LEN=60 TOS=0x00 PREC=0x00 TTL=64 ID=13097 DF PROTO=TCP SPT=52512 DPT=443 WINDOW=29200 RES=0x00 SYN URGP=0 "
],
"sk_service": [
"infra"
],
"agent.name.keyword": [
"localhost.localdomain"
],
"agent.id.keyword": [
"306e9554-3384-4ddf-9b0c-fc6d0785e65c"
],
"fileset.name": [
"syslog"
],
"@version.keyword": [
"1"
],
"input.type": [
"log"
],
"log.offset": [
75241
],
"message": [
"2023-02-08T19:02:54.960888+02:00 localhost.localdomain kernel: [2039643.113272] IPTables-Dropped: IN=eth0 OUT= MAC=00:50:56:ab:d4:48:00:50:56:ab:60:c8:08:00 SRC=10.30.30.30 DST=10.10.10.10 LEN=60 TOS=0x00 PREC=0x00 TTL=64 ID=13097 DF PROTO=TCP SPT=52512 DPT=443 WINDOW=29200 RES=0x00 SYN URGP=0 "
],
"fingerprint.keyword": [
"feaa1ab964941b1d86fa695e8f2a01dd97803b3bd398acebb2f953a971bac53a"
],
"tags": [
"beats_input_codec_plain_applied"
],
"fileset.name.keyword": [
"syslog"
],
"@timestamp": [
"2023-02-08T17:03:04.552Z"
],
"agent.id": [
"306e9554-3384-4ddf-9b0c-fc6d0785e65c"
],
"ecs.version": [
"1.12.0"
],
"log.file.path": [
"/var/log/messages"
],
"event.module.keyword": [
"system"
],
"agent.ephemeral_id": [
"cf0c7c36-e721-46ed-9409-c54942940186"
],
"agent.version": [
"8.4.1"
],
"event.dataset": [
"system.syslog"
],
"event.timezone.keyword": [
"+02:00"
]
},
"ignored_field_values": {
"message.keyword": [
"2023-02-08T19:02:54.960888+02:00 localhost.localdomain kernel: [2039643.113272] IPTables-Dropped: IN=eth0 OUT= MAC=00:50:56:ab:d4:48:00:50:56:ab:60:c8:08:00 SRC=10.30.30.30 DST=10.10.10.10 LEN=60 TOS=0x00 PREC=0x00 TTL=64 ID=13097 DF PROTO=TCP SPT=52512 DPT=443 WINDOW=29200 RES=0x00 SYN URGP=0 "
],
"event.original.keyword": [
"2023-02-08T19:02:54.960888+02:00 localhost.localdomain kernel: [2039643.113272] IPTables-Dropped: IN=eth0 OUT= MAC=00:50:56:ab:d4:48:00:50:56:ab:60:c8:08:00 SRC=10.30.30.30 DST=10.10.10.10 LEN=60 TOS=0x00 PREC=0x00 TTL=64 ID=13097 DF PROTO=TCP SPT=52512 DPT=443 WINDOW=29200 RES=0x00 SYN URGP=0 "
]
}
}
From the document JSON I can see that we are hitting the ignore_above limit, which is why the message.keyword field is not indexed for these documents. That makes searching for them a challenge.
The goal is to retrieve documents from the index where:
- host.name.keyword : "arc11.test.sk.sise"
- log.file.path.keyword : "/var/log/messages"
- message (the analyzed text field, not message.keyword) : starts with 2023-02-02 (I think the hyphen is treated as a special character)
So far I have tried:
POST /log-infra-2023.01/_search?ignore_throttled=false
{
"_source": ["message", "host.name", "log.file.path"],
"query": {
"bool": {
"must": [
{ "term": { "log.file.path.keyword": { "value": "/var/log/messages" } } },
{ "term": { "host.name.keyword": { "value": "localhost.localdomain" } } }
],
"filter": [
{"match_phrase_prefix": { "message": { "query": "2023-01-03", "operator" : "AND" }}}
]
}
}
}
But it seems that the match_phrase_prefix filter treats 2023-01-03 as three separate terms. Is there any way to change this filter so that it matches the date as a single string at the beginning of the message field?
P.S.
It is possible that a pattern analyzer could help, but I cannot figure out the correct pattern.