Need help with search query

Hi, guys. Sorry to interrupt your beautiful day, but maybe someone can help me with a search query I'm trying to figure out.

Here goes: I have an index where Filebeat sends logs, for example from /var/log/messages. Documents look like this:

{
  "_index": "log-infra-2023.02",
  "_id": "IaX7MYYBlZu06yQiL1Vh",
  "_version": 1,
  "_score": 0,
  "_ignored": [
    "message.keyword",
    "event.original.keyword"
  ],
  "_source": {
    "fingerprint": "feaa1ab964941b1d86fa695e8f2a01dd97803b3bd398acebb2f953a971bac53a",
    "agent": {
      "ephemeral_id": "cf0c7c36-e721-46ed-9409-c54942940186",
      "version": "8.4.1",
      "id": "306e9554-3384-4ddf-9b0c-fc6d0785e65c",
      "name": "localhost.localdomain"
    },
    "input": {
      "type": "log"
    },
    "@timestamp": "2023-02-08T17:03:04.552Z",
    "message": "2023-02-08T19:02:54.960888+02:00 localhost.localdomain kernel: [2039643.113272] IPTables-Dropped: IN=eth0 OUT= MAC=00:50:56:ab:d4:48:00:50:56:ab:60:c8:08:00 SRC=10.30.30.30 DST=10.10.10.10 LEN=60 TOS=0x00 PREC=0x00 TTL=64 ID=13097 DF PROTO=TCP SPT=52512 DPT=443 WINDOW=29200 RES=0x00 SYN URGP=0 ",
    "event": {
      "original": "2023-02-08T19:02:54.960888+02:00 localhost.localdomain kernel: [2039643.113272] IPTables-Dropped: IN=eth0 OUT= MAC=00:50:56:ab:d4:48:00:50:56:ab:60:c8:08:00 SRC=10.30.30.30 DST=10.10.10.10 LEN=60 TOS=0x00 PREC=0x00 TTL=64 ID=13097 DF PROTO=TCP SPT=52512 DPT=443 WINDOW=29200 RES=0x00 SYN URGP=0 ",
      "timezone": "+02:00",
      "module": "system",
      "dataset": "system.syslog"
    },
    "@version": "1",
    "host": {
      "ip": "10.20.20.20",
      "name": "localhost.localdomain"
    },
    "logstash_server": "logstash.localdomain",
    "sk_service": "infra",
    "log": {
      "offset": 75241,
      "file": {
        "path": "/var/log/messages"
      }
    },
    "tags": [
      "beats_input_codec_plain_applied"
    ],
    "ecs": {
      "version": "1.12.0"
    },
    "service": {
      "type": "system"
    },
    "fileset": {
      "name": "syslog"
    }
  },
  "fields": {
    "agent.version.keyword": [
      "8.4.1"
    ],
    "service.type.keyword": [
      "system"
    ],
    "input.type.keyword": [
      "log"
    ],
    "host.name.keyword": [
      "localhost.localdomain"
    ],
    "event.dataset.keyword": [
      "system.syslog"
    ],
    "tags.keyword": [
      "beats_input_codec_plain_applied"
    ],
    "logstash_server": [
      "logstash.localdomain"
    ],
    "service.type": [
      "system"
    ],
    "host.ip": [
      "10.20.20.20"
    ],
    "ecs.version.keyword": [
      "1.12.0"
    ],
    "host.ip.keyword": [
      "10.20.20.20"
    ],
    "indexingDate": [
      "2023-02-08"
    ],
    "event.module": [
      "system"
    ],
    "fingerprint": [
      "feaa1ab964941b1d86fa695e8f2a01dd97803b3bd398acebb2f953a971bac53a"
    ],
    "@version": [
      "1"
    ],
    "agent.name": [
      "localhost.localdomain"
    ],
    "logstash_server.keyword": [
      "logstash.localdomain"
    ],
    "host.name": [
      "localhost.localdomain"
    ],
    "sk_service.keyword": [
      "infra"
    ],
    "log.file.path.keyword": [
      "/var/log/messages"
    ],
    "event.timezone": [
      "+02:00"
    ],
    "agent.ephemeral_id.keyword": [
      "cf0c7c36-e721-46ed-9409-c54942940186"
    ],
    "event.original": [
      "2023-02-08T19:02:54.960888+02:00 localhost.localdomain kernel: [2039643.113272] IPTables-Dropped: IN=eth0 OUT= MAC=00:50:56:ab:d4:48:00:50:56:ab:60:c8:08:00 SRC=10.30.30.30 DST=10.10.10.10 LEN=60 TOS=0x00 PREC=0x00 TTL=64 ID=13097 DF PROTO=TCP SPT=52512 DPT=443 WINDOW=29200 RES=0x00 SYN URGP=0 "
    ],
    "sk_service": [
      "infra"
    ],
    "agent.name.keyword": [
      "localhost.localdomain"
    ],
    "agent.id.keyword": [
      "306e9554-3384-4ddf-9b0c-fc6d0785e65c"
    ],
    "fileset.name": [
      "syslog"
    ],
    "@version.keyword": [
      "1"
    ],
    "input.type": [
      "log"
    ],
    "log.offset": [
      75241
    ],
    "message": [
      "2023-02-08T19:02:54.960888+02:00 localhost.localdomain kernel: [2039643.113272] IPTables-Dropped: IN=eth0 OUT= MAC=00:50:56:ab:d4:48:00:50:56:ab:60:c8:08:00 SRC=10.30.30.30 DST=10.10.10.10 LEN=60 TOS=0x00 PREC=0x00 TTL=64 ID=13097 DF PROTO=TCP SPT=52512 DPT=443 WINDOW=29200 RES=0x00 SYN URGP=0 "
    ],
    "fingerprint.keyword": [
      "feaa1ab964941b1d86fa695e8f2a01dd97803b3bd398acebb2f953a971bac53a"
    ],
    "tags": [
      "beats_input_codec_plain_applied"
    ],
    "fileset.name.keyword": [
      "syslog"
    ],
    "@timestamp": [
      "2023-02-08T17:03:04.552Z"
    ],
    "agent.id": [
      "306e9554-3384-4ddf-9b0c-fc6d0785e65c"
    ],
    "ecs.version": [
      "1.12.0"
    ],
    "log.file.path": [
      "/var/log/messages"
    ],
    "event.module.keyword": [
      "system"
    ],
    "agent.ephemeral_id": [
      "cf0c7c36-e721-46ed-9409-c54942940186"
    ],
    "agent.version": [
      "8.4.1"
    ],
    "event.dataset": [
      "system.syslog"
    ],
    "event.timezone.keyword": [
      "+02:00"
    ]
  },
  "ignored_field_values": {
    "message.keyword": [
      "2023-02-08T19:02:54.960888+02:00 localhost.localdomain kernel: [2039643.113272] IPTables-Dropped: IN=eth0 OUT= MAC=00:50:56:ab:d4:48:00:50:56:ab:60:c8:08:00 SRC=10.30.14.28 DST=10.30.14.25 LEN=60 TOS=0x00 PREC=0x00 TTL=64 ID=13097 DF PROTO=TCP SPT=52512 DPT=443 WINDOW=29200 RES=0x00 SYN URGP=0 "
    ],
    "event.original.keyword": [
      "2023-02-08T19:02:54.960888+02:00 localhost.localdomain kernel: [2039643.113272] IPTables-Dropped: IN=eth0 OUT= MAC=00:50:56:ab:d4:48:00:50:56:ab:60:c8:08:00 SRC=10.30.14.28 DST=10.30.14.25 LEN=60 TOS=0x00 PREC=0x00 TTL=64 ID=13097 DF PROTO=TCP SPT=52512 DPT=443 WINDOW=29200 RES=0x00 SYN URGP=0 "
    ]
  }
}

From the document JSON, yes, I can see that we are hitting the ignore_above limit, and that's why I don't have a message.keyword field. That makes searching for these documents a challenge.

Goal is to retrieve documents from index where:

  1. host.name.keyword : "arc11.test.sk.sise"
  2. log.file.path.keyword : "/var/log/messages"
  3. message(not a keyword) : starts with 2023-02-02 (I think "minus" is a special character)

So far I tried

POST /log-infra-2023.01/_search?ignore_throttled=false
{
  "_source": ["message", "host.name", "log.file.path"], 
  "query": {
    "bool": {
      "must": [
          { "term": { "log.file.path.keyword": { "value": "/var/log/messages" } } },
          { "term": { "host.name.keyword": { "value": "localhost.localdomain" } } }
      ],
      "filter": [
        {"match_phrase_prefix": { "message": { "query": "2023-01-03", "operator" : "AND" }}}
      ]
    }
  }
}

But it seems that filter match_phrase_prefix handles 2023-01-03 as 3 separate terms. Is there any way to change this filter to match date as a single string in the beginning of a message field?

P.S.
There is a possibility that a pattern analyzer can help, but I cannot figure out the correct pattern.

Maybe take a look at the wildcard field which was introduced to deal with these sorts of messages

Yeah, but changing a field type, unfortunately, is not an option. Data is already indexed and re-indexing it will take a lot of time. We are talking about 150 TB of logs.

But I cannot figure out why the filter is not working as expected. According to my logic it must return only those documents where message starts with 2023-01-03. But actually the response is empty. Why?

FYI changing analyzer usually means a reindex too.

If we can’t change the content that is indexed, we can change the queries. If the match phrase prefix query is not doing what you need, maybe take a look at the ‘intervals’ query, which gives more control but is also more complex. It will probably help to use the _analyze API to understand how your original doc content has been stored as indexed tokens.

2 Likes

Hi,
if you cannot change the mapping and analyzer for practical reasons, the match_phrase_prefix query should work anyway (provided that the message field is mapped as text, which should be the default).
Remove the "operator" : "AND" which is implicit in that query clause, and it should operate as you expect.
However, you would benefit from a better mapping or analyzers (as Mark suggests). Moreover, it looks like your document lacks structure. The date at the beginning of the message should be indexed as a date type field (possibly as the @timestamp), giving you much more query flexibility (date intervals, aggregations...)

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.