Good day,
I am having issues when using the regexp clause in my query to identify and filter out log messages produced by the filebeat, packetbeat, metricbeat, and auditbeat processes which contain the patterns below:
filebeat[####]:
packetbeat[####]:
metricbeat[####]:
auditbeat[####]:
My objective is to KEEP log messages produced by other processes (such as systemd) that contain the beat name as presented in the following example:
Nov 9 17:55:56 ml-is7033-fall-2020-12 systemd[1]: Stopped Filebeat sends log files to Logstash or directly to Elasticsearch..
But I want to FILTER OUT the log messages produced by the beat agents, such as:
Nov 9 17:55:56 ml-is7033-fall-2020-12 filebeat[31087]: 2020-11-09T17:55:56.864-0600#011INFO#011crawler/crawler.go:139#011Stopping Crawler
My current code uses a wildcard, but this method eliminates both of the previously mentioned log messages. I have tried to change the "must_not" clause as presented below, but the response still includes all log messages with the patterns I seek to remove, as if the regexp clause were being ignored:
"must_not": [{
"bool" : {
"should" : [
{"exists": {"field": "event.module"}},
{"regexp": {"message.keyword": ".*filebeat\\[[0-9]{2,7}\\]\\:.*"}},
{"regexp": {"message.keyword": ".*packetbeat\\[[0-9]{2,7}\\]\\:.*"}},
{"regexp": {"message.keyword": ".*metricbeat\\[[0-9]{2,7}\\]\\:.*"}},
{"regexp": {"message.keyword": ".*auditbeat\\[[0-9]{2,7}\\]\\:.*"}}
]
}
}]
I have tried both "message" and "message.keyword" fields as well as the following patterns:
".*filebeat\\[[0-9]{2,7}\\]\\:.*"
"/.*filebeat\\[[0-9]{2,7}\\]\\:.*/"
"filebeat\\[[0-9]{2,7}\\]\\:"
I will greatly appreciate any help.
My current code is the following:
# Connection credentials and scroll-paging parameters used by every
# Elasticsearch request below.
elastic_user='elastic_dummy_user'
elastic_password='elastic_dummy_password'
scroll_size=1000   # documents returned per scroll page
scroll_time=10s    # how long Elasticsearch keeps the scroll context alive
# Run the initial scrolling search against the filebeat* indices and save the
# raw JSON response for the parsing step below.
# Fix: quote the credential and URL expansions (SC2086) — the original
# unquoted $elastic_user:$elastic_password would break on a password
# containing spaces or glob characters.
curl -u "$elastic_user:$elastic_password" -XGET "http://10.0.0.9:9200/filebeat*/_search?pretty&scroll=$scroll_time&size=$scroll_size" -H 'Content-Type: application/json' -d'
{
"query": {
"bool": {
"must": [
{
"query_string": {
"query": "host.name: ml-is7033-*",
"analyze_wildcard": true,
"time_zone": "America/Chicago"
}
}
],
"must_not": [{
"bool" : {
"should" : [
{"exists": {"field": "event.module"}},
{"wildcard": {"message": "*filebeat*"}},
{"wildcard": {"message": "*packetbeat*"}},
{"wildcard": {"message": "*metricbeat*"}},
{"wildcard": {"message": "*auditbeat*"}}
]
}
}
],
"filter": [
{
"match": {"log.file.path":"/var/log/syslog"}
},
{
"range": {
"@timestamp": {
"gte": "2020-11-06T18:00:00.000Z",
"lte": "2020-11-06T18:59:59.999Z",
"format": "strict_date_optional_time"
}
}
}
]
}
},
"sort" : [{ "@timestamp" : "desc" }, { "log.offset" : "desc" }]
}' > query_response.txt
# Pull the scroll id and total hit count out of the first response, then dump
# the first page of hits to a dated CSV file.
# Fixes: drop the useless-use-of-cat pipelines (jq reads the file directly),
# quote the $(date +%F).csv redirect target and the echoed variables (SC2086).
scroll_id=$(jq -r ._scroll_id query_response.txt)
total_hits=$(jq -r .hits.total.value query_response.txt)
current_hits=$total_hits
# CSV header row plus one row per hit; jq's @csv handles field quoting.
jq -r '["hostname","@timestamp","file_path","log_offset","message"], (.hits.hits[]._source | [.host.name, .["@timestamp"], .log.file.path, .log.offset, .message]) | @csv' query_response.txt > "$(date +%F).csv"
echo "$scroll_id"
echo "$total_hits"
# Page through the remaining results with the scroll API, appending each page
# of hits (as CSV rows) to a numbered scN.txt file, until the estimated
# remaining-document counter goes negative.
# NOTE(review): this GET form passes no scroll=<time> parameter, so the scroll
# context is presumably not renewed on each request and may expire before a
# long export finishes — confirm against the Elasticsearch scroll API docs.
i=0   # was never initialized in the original, so the first file was 'sc.txt'
while (( current_hits >= 0 )); do
  # Fix: the original 'i = $(($i + 1))' has spaces around '=', which makes
  # bash run a command named 'i' instead of assigning — the counter never
  # advanced and every page was appended to the same file.
  i=$((i + 1))
  curl -u "$elastic_user:$elastic_password" -XGET "http://10.0.0.9:9200/_search/scroll/$scroll_id?pretty" | jq -r '(.hits.hits[]._source | [.host.name, .["@timestamp"], .log.file.path, .log.offset, .message]) | @csv' >> "sc$i.txt" \
    && current_hits=$((current_hits - scroll_size)) \
    && echo "Remaining Documents: ${current_hits}"
done