Querying syslog data to fetch unique error messages

Here is the query I have tried. Although I have set filter_duplicate_text to true, I still get messages of the form {"............June 3 time stamp ...log message.............."}. I want to filter out all these duplicate messages. Can anybody help me with this?

GET log_stash_2020.06.16/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "match_phrase": {
            "message": "Error"
          }
        },
        {
          "match_phrase": {
            "type": "lab_id"
          }
        }
      ]
    }
  },
  "aggs": {
    "log_message": {
      "significant_text": {
        "field": "message",
        "filter_duplicate_text": "true"
      }
    }
  },
  "size": 1000
}

Sample documents containing log messages:

{
        "_index" : "logstash_2020.06.16",
        "_type" : "doc",
        "_id" : "################",
        "_score" : 1.0,
        "_source" : {
          "logsource" : "router_id",
          "timestamp" : "Jun 15 20:00:00",
          "program" : "some_program",
          "host" : "#############",
          "priority" : "27",
          "@timestamp" : "2020-06-16T00:00:01.020Z",
          "type" : "lab_id",
          "pid" : "####",
          "message" : ": ############### send failed with error: ENOENT -- Item not found (No error: 0)",
          "@version" : "1"
        }
      }

{
        "_index" : "logstash_2020.06.16",
        "_type" : "doc",
        "_id" : "################",
        "_score" : 1.0,
        "_source" : {
          "host" : "################",
          "@timestamp" : "2020-06-16T00:00:02.274Z",
          "type" : "####",
          "tags" : [
            "_grokparsefailure"
          ],
          "message" : "################:Jun 15 20:00:18.908 EDT: mediasvr[2546]: %MEDIASVR-MEDIASVR-4-PARTITION_USAGE_ALERT : High disk usage alert : host ##### exceeded 100%  \n",
          "@version" : "1"
        }
      }
The second sample document contains a timestamp in the error message itself, and the message field is of type "text". How do I write a query that fetches unique error messages from Elasticsearch, i.e. one that ignores the timestamp if it is the only difference between two messages?
Here are the mapping details of the index:
{
  "logstash-2020.05.07" : {
    "mappings" : {
      "doc" : {
        "dynamic_templates" : [
          {
            "message_field" : {
              "path_match" : "message",
              "match_mapping_type" : "string",
              "mapping" : {
                "norms" : false,
                "type" : "text"
              }
            }
          },
          {
            "string_fields" : {
              "match" : "*",
              "match_mapping_type" : "string",
              "mapping" : {
                "fields" : {
                  "keyword" : {
                    "ignore_above" : 256,
                    "type" : "keyword"
                  }
                },
                "norms" : false,
                "type" : "text"
              }
            }
          }
        ],
        "properties" : {
          "@timestamp" : {
            "type" : "date"
          },
          "@version" : {
            "type" : "keyword"
          },
          "geoip" : {
            "dynamic" : "true",
            "properties" : {
              "ip" : {
                "type" : "ip"
              },
              "latitude" : {
                "type" : "half_float"
              },
              "location" : {
                "type" : "geo_point"
              },
              "longitude" : {
                "type" : "half_float"
              }
            }
          },
          "host" : {
            "type" : "text",
            "norms" : false,
            "fields" : {
              "keyword" : {
                "type" : "keyword",
                "ignore_above" : 256
              }
            }
          },
          "logsource" : {
            "type" : "text",
            "norms" : false,
            "fields" : {
              "keyword" : {
                "type" : "keyword",
                "ignore_above" : 256
              }
            }
          },
          "message" : {
            "type" : "text",
            "norms" : false
          },
          "pid" : {
            "type" : "text",
            "norms" : false,
            "fields" : {
              "keyword" : {
                "type" : "keyword",
                "ignore_above" : 256
              }
            }
          },
          "priority" : {
            "type" : "text",
            "norms" : false,
            "fields" : {
              "keyword" : {
                "type" : "keyword",
                "ignore_above" : 256
              }
            }
          },
          "program" : {
            "type" : "text",
            "norms" : false,
            "fields" : {
              "keyword" : {
                "type" : "keyword",
                "ignore_above" : 256
              }
            }
          },
          "tags" : {
            "type" : "text",
            "norms" : false,
            "fields" : {
              "keyword" : {
                "type" : "keyword",
                "ignore_above" : 256
              }
            }
          },
          "timestamp" : {
            "type" : "text",
            "norms" : false,
            "fields" : {
              "keyword" : {
                "type" : "keyword",
                "ignore_above" : 256
              }
            }
          },
          "type" : {
            "type" : "text",
            "norms" : false,
            "fields" : {
              "keyword" : {
                "type" : "keyword",
                "ignore_above" : 256
              }
            }
          }
        }
      },
      "_default_" : {
        "dynamic_templates" : [
          {
            "message_field" : {
              "path_match" : "message",
              "match_mapping_type" : "string",
              "mapping" : {
                "norms" : false,
                "type" : "text"
              }
            }
          },
          {
            "string_fields" : {
              "match" : "*",
              "match_mapping_type" : "string",
              "mapping" : {
                "fields" : {
                  "keyword" : {
                    "ignore_above" : 256,
                    "type" : "keyword"
                  }
                },
                "norms" : false,
                "type" : "text"
              }
            }
          }
        ],
        "properties" : {
          "@timestamp" : {
            "type" : "date"
          },
          "@version" : {
            "type" : "keyword"
          },
          "geoip" : {
            "dynamic" : "true",
            "properties" : {
              "ip" : {
                "type" : "ip"
              },
              "latitude" : {
                "type" : "half_float"
              },
              "location" : {
                "type" : "geo_point"
              },
              "longitude" : {
                "type" : "half_float"
              }
            }
          }
        }
      }
    }
  }
}

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.