I am using the fingerprint plugin to deduplicate data. I hope to obtain the duplicate entries of each piece of data. What should I do?This is my configuration file.
input {
beats {
port => 5044
}
}
filter{
grok{
match => {
# "message" => "%{TIMESTAMP_ISO8601:timestamp} %{HOSTNAME:host} %{DATA:process_name}(?:\[%{NUMBER:pid}\])?: %{GREEDYDATA:log_message}"
message => "(?:%{TIMESTAMP_ISO8601:timestamp})? (?:%{DATA:hostname})? (?:%{DATA:process_name})?(?:\[%{NUMBER:pid}\])?:(?:%{GREEDYDATA:log_message})?"
}
}
# 标记缺失的 pid 字段
if ![timestamp]{
mutate{
add_field => { "timestamp" => "N/A"}
}
mutate{
add_tag => ["missing_timestamp"]
}
}
if ![hostname]{
mutate{
add_field => { "hostname" => "N/A"}
}
mutate{
add_tag => ["missing_hostname"]
}
}
if ![process_name]{
mutate{
add_field => { "process_name" => "N/A"}
}
mutate{
add_tag => ["missing_process_name"]
}
}
if ![pid] {
mutate {
add_field => { "pid" => "N/A" }
# 给缺失的 pid 赋默认值
}
mutate {
add_tag => ["missing_pid"]
# 添加标记,指示该条日志缺少 pid
}
}
if ![log_message] {
mutate{
add_field => { "log_message" => "N/A"}
}
mutate{
add_tag => ["missing_log_message"]
}
}
mutate{
remove_field => ["event", "log", "@version", "@timestamp", "message", "host"]
}
fingerprint{
source => ["hostname", "process_name", "pid", "log_message"]
target => "[@metadata][generated_id]"
method => "SHA256"
concatenate_sources => true
}
}
output {
elasticsearch {
hosts => ["http://192.168.52.130:9200"]
# index => "%{[@metadata][beat]}-%{[@metadata][version]}-%{+YYYY.MM.dd}"
index => "%{[fields][node]}-%{+YYYY.MM.dd}"
document_id => "%{[@metadata][generated_id]}"
#user => "elastic"
#password => "changeme"
}
}
Why is the indexed date I configured not taking effect?
index => "%{[fields][node]}-%{+YYYY.MM.dd}"
The log format is as follows.
2024-10-15T00:00:03.172528+08:00 yp-VMware-Virtual-Platform [1]: rsyslog.service: Sent signal SIGHUP to main process 1346 (rsyslogd) on client request.
Is the following method a good method? Do I need to change to another way to handle deduplication?
filter {
# 初始化 fingerprints 字段为空数组
mutate {
add_field => { "fingerprints" => [] }
}
# 假设已经有了其他的 filter 配置
mutate {
add_field => {
"[duplicate_count]" => "%{[@metadata][total_duplicates]}" # 初始值设置为0
}
}
# 生成指纹
fingerprint {
source => ["hostname", "process_name", "pid", "log_message"]
target => "[@metadata][generated_id]"
method => "SHA256"
concatenate_sources => true
}
# 检查是否是重复事件
if "[@metadata][generated_id]" in [fingerprints] {
# 如果是重复,增加 duplicate_count
mutate {
add_field => {
"[duplicate_count]" => "%{[@metadata][total_duplicates]} + 1"
}
}
mutate {
remove_field => ["[@metadata][total_duplicates]"]
}
} else {
# 如果是新事件,将 generated_id 添加到 fingerprints 数组中
mutate {
push => { "fingerprints" => "%{[@metadata][generated_id]}" }
}
}
}