Some logs are missing in Elasticsearch

Hi,

We have a 3-node cluster (2 data nodes and 1 ingest node) running CentOS 7.3, OpenJDK 1.8.0_121, Logstash 5.2, and Elasticsearch 5.2. We receive the log files every 5 minutes. We have found that some logs are sometimes missing in Elasticsearch. If we duplicate a file, all of its logs are imported into the cluster. We don't see any error messages in either the Logstash or Elasticsearch logs. We have tried upgrading Logstash to 5.4.1, but the problem still exists. We also tried adding a file output to Logstash: the same logs are missing from the output file, and the total number of logs in that file matches what is in Elasticsearch. So the problem seems to be caused by Logstash, but we have no idea how or when it happens.
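For reference, we compare the counts roughly like this (the host is one of our Elasticsearch nodes from the config below, and tshark-* is our daily index pattern):

# total lines received in the capture files
wc -l /var/log/xxxxxx/*_pcap.json

# total documents indexed in Elasticsearch
curl 'http://10.192.0.178:9200/tshark-*/_count?pretty'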

[Screen captures omitted]

Regards,

Ricky

Adding the configuration for reference.

Logstash config file:

input {
  file {
    path => "/var/log/xxxxxx/*_pcap.json"
    sincedb_path => "/var/log/logstash/tshark-sincedb"
    #interval => 300
    type => "Check_OK"
    start_position => "beginning"
  }

  file {
    path => "/var/log/xxxxxx/*_err.json"
    sincedb_path => "/var/log/logstash/tshark-err-sincedb"
    #interval => 300
    type => "Check_Fail"
    start_position => "beginning"
  }
}

filter {
  json {
    source => "message"
  }

  fingerprint {
    concatenate_sources => true
    method => "SHA1"
    key => "xxxxxx-elasticsearch"
    source => [ "timestamp", "Source_IP", "Destination_IP", "Application_ID", "Command_Code", "Flags_Request", "Session_Id", "Origin_Realm", "Origin_Host", "Destination_Realm", "Destination_Host", "User_Name", "Result_Code", "Experimental_Result_Code", "RAT_Type", "CC_Request_Type", "CC_Request_Number", "Service_Context_Id", "User_Equipment_Info_Value", "PDP_Address_IPv4", "Rule_Space_Decision", "MME_Name", "missing" ]
  }

  date {
    match => [ "timestamp", "MMM  dd, YYYY HH:mm:ss.SSSSSSSSS ZZ", "MMM dd, YYYY HH:mm:ss.SSSSSSSSS ZZ" ]
  }

  if [type] == "Check_OK" {
    mutate {
      add_field => {
        "Command_Name_temp" => "%{Command_Code}%{Flags_Request}"
      }
    }

    translate {
      field => "Application_ID"
      destination => "Application_Name"
      fallback => "Unknown"
      dictionary_path => "/var/log/logstash/xxxxxx_dictionary.yml"
    }

    translate {
      field => "Command_Name_temp"
      destination => "Command_Name"
      fallback => "Unknown"
      dictionary_path => "/var/log/logstash/xxxxxx_dictionary.yml"
      remove_field => [ "Command_Name_temp" ]
    }

    translate {
      exact => true
      regex => true
      field => "Origin_Realm"
      destination => "Origin_Provider"
      fallback => "Unknown"
      dictionary_path => "/var/log/logstash/MNC-MCC-PLNM.yaml"
    }

    translate {
      exact => true
      regex => true
      field => "Destination_Realm"
      destination => "Destination_Provider"
      fallback => "Unknown"
      dictionary_path => "/var/log/logstash/MNC-MCC-PLNM.yaml"
    }

    translate {
      exact => true
      regex => true
      field => "Origin_Realm"
      destination => "Origin_Country"
      fallback => "Unknown"
      dictionary_path => "/var/log/logstash/MCC-Country.yaml"
    }

    translate {
      exact => true
      regex => true
      field => "Destination_Realm"
      destination => "Destination_Country"
      fallback => "Unknown"
      dictionary_path => "/var/log/logstash/MCC-Country.yaml"
    }

    translate {
      exact => true
      field => "Origin_Country"
      destination => "Origin_Location_temp"
      fallback => "Unknown"
      dictionary_path => "/var/log/logstash/Country-LatLong.json"
    }

    translate {
      exact => true
      field => "Destination_Country"
      destination => "Destination_Location_temp"
      fallback => "Unknown"
      dictionary_path => "/var/log/logstash/Country-LatLong.json"
    }

    translate {
      exact => true
      regex => true
      field => "Source_IP"
      destination => "Source_Host_temp"
      fallback => "Unknown,Unknown"
      dictionary_path => "/var/log/logstash/xxxxxx-hosts.yaml"
    }

    translate {
      exact => true
      regex => true
      field => "Destination_IP"
      destination => "Destination_Host_temp"
      fallback => "Unknown,Unknown"
      dictionary_path => "/var/log/logstash/xxxxxx-hosts.yaml"
    }

    if ("" in [Source_Host_temp]) {
      mutate {
        split => { "Source_Host_temp" => "," }
        add_field => {
          "Source_Host_Name" => "%{[Source_Host_temp][0]}"
          "Source_Host_Type" => "%{[Source_Host_temp][1]}"
        }
        remove_field => [ "Source_Host_temp" ]
      }
    }

    if [Source_Host_Type] == "IPX" {
      translate {
        exact => true
        regex => true
        field => "Source_Host_Name"
        destination => "Source_IPX"
        fallback => "Unknown"
        dictionary_path => "/var/log/logstash/xxxxxx-ipx.yaml"
      }
    }

    if ("" in [Destination_Host_temp]) {
      mutate {
        split => { "Destination_Host_temp" => "," }
        add_field => {
          "Destination_Host_Name" => "%{[Destination_Host_temp][0]}"
          "Destination_Host_Type" => "%{[Destination_Host_temp][1]}"
        }
        remove_field => [ "Destination_Host_temp" ]
      }
    }

    if [Destination_Host_Type] == "IPX" {
      translate {
        exact => true
        regex => true
        field => "Destination_Host_Name"
        destination => "Destination_IPX"
        fallback => "Unknown"
        dictionary_path => "/var/log/logstash/xxxxxx-ipx.yaml"
      }
    }

    if ("" in [Origin_Country]) {
      mutate {
        split => { "Origin_Location_temp" => "," }
        add_field => {
          "[Origin_Location][lat]" => "%{[Origin_Location_temp][0]}"
          "[Origin_Location][lon]" => "%{[Origin_Location_temp][1]}"
        }
        remove_field => [ "Origin_Location_temp" ]
      }
    }

    if ("" in [Destination_Country]) {
      mutate {
        split => { "Destination_Location_temp" => "," }
        add_field => {
          "[Destination_Location][lat]" => "%{[Destination_Location_temp][0]}"
          "[Destination_Location][lon]" => "%{[Destination_Location_temp][1]}"
        }
        remove_field => [ "Destination_Location_temp" ]
      }
      # remove_field => [ "Origin_Location_temp", "Destination_Location_temp" ]
    }
  }
}

output {
  if [type] == "Check_OK" {
    elasticsearch {
      action => "index"
      hosts => [ "10.192.0.178:9200", "10.192.0.147:9200" ]
      index => "tshark-%{+YYYYMMdd}"
      document_id => "%{fingerprint}"
      flush_size => 5000
    }

    file {
      path => "/var/log/xxxxxx/output_test_%{+YYYYMMdd}.txt"
      codec => "json_lines"
    }
  }

  if [type] == "Check_Fail" {
    elasticsearch {
      action => "index"
      hosts => [ "10.192.0.178:9200", "10.192.0.147:9200" ]
      index => "tshark-err-%{+YYYYMMdd}"
      document_id => "%{fingerprint}"
      flush_size => 5000
    }

    file {
      path => "/var/log/xxxxxx/output_err_test_%{+YYYYMMdd}.txt"
      codec => "json_lines"
    }
  }
}
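For what it's worth, the sincedb files referenced in the input section are where the file input keeps its per-file read offsets (keyed by inode), so they can be inspected directly:

# each line is: inode, major device, minor device, byte offset read so far
cat /var/log/logstash/tshark-sincedb /var/log/logstash/tshark-err-sincedb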

What does _cat/indices show? Do you have any deletes from a single load that may indicate hash collisions? How many logs are missing?
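For example, using one of the hosts from your config, this will show the document and deleted-document counts per index:

curl 'http://10.192.0.178:9200/_cat/indices/tshark-*?v'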

All indices are green and open. Some logs are duplicated between files, around 1,600 lines. We configured Logstash to generate the fingerprint from all fields of the event, so those duplicated logs are overwritten by the newer log files. We didn't see any messages about hash collisions. When the problem occurs, over 80% of the logs are missing.

fingerprint {
  concatenate_sources => true
  method => "SHA1"
  key => "xxxxxx-elasticsearch"
  source => [ "timestamp", "Source_IP", "Destination_IP", "Application_ID", "Command_Code", "Flags_Request", "Session_Id", "Origin_Realm", "Origin_Host", "Destination_Realm", "Destination_Host", "User_Name", "Result_Code", "Experimental_Result_Code", "RAT_Type", "CC_Request_Type", "CC_Request_Number", "Service_Context_Id", "User_Equipment_Info_Value", "PDP_Address_IPv4", "Rule_Space_Decision", "MME_Name", "missing" ]
}

What does _cat/indices show for the index where documents are missing?

The following are the results from 'wc -l' for the total number of logs:

[screenshot of 'wc -l' output omitted]

_cat/indices:

[screenshot of _cat/indices output omitted]

Please do not post screenshots of text, as they are very hard to read. It looks like you have a significant number of duplicates that have resulted in updates (shown as deleted documents). Is it perhaps possible that the fingerprint calculation fails for a portion of records, so that a document with the literal id %{fingerprint} (no variable substitution) ends up being updated by lots of different records? Can you search and see if you have a record with %{fingerprint} as an id?
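Something like the following should find such a document if it exists (adjust the host and index pattern to your setup):

curl 'http://10.192.0.178:9200/tshark-*/_search?pretty' -d '
{
  "query": {
    "ids": { "values": [ "%{fingerprint}" ] }
  }
}'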

Those deleted documents are the duplicated entries between log files. The log files are network packet captures, and consecutive files have a 12-second overlap window; that produces about 300,000 duplicated documents a day.

I have searched in Kibana; there is no record with %{fingerprint} as _id.

Is there anything in the logs?

No errors in either the Logstash or Elasticsearch logs.

May I know whether there is any limitation on the total number of documents? We currently have 541,829,108 documents. We first noticed this problem when the cluster reached 300,000,000 documents, and we normally index about 11,000,000 documents a day. It seems that the more data is stored, the more often the problem occurs.
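In case it matters, the per-shard document counts can be listed with:

curl 'http://10.192.0.178:9200/_cat/shards/tshark-*?v'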

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.