I am trying to parse a big JSON file. Here is a sample with 5 records; the original file contains 100k records:
{"dgdocs": [{"doc id": "0", "docowner": "0", "documentType": "cte1_0", "docver": "1", "ogsys": "fc", "docstat1": "smartsys", "claimId": ""}, {"doc id": "1", "docowner": "1", "documentType": "cte1_0", "docver": "1", "ogsys": "fc", "docstat1": "smartsys", "claimId": ""}, {"doc id": "2", "docowner": "2", "documentType": "cte1_0", "docver": "1", "ogsys": "fc", "docstat1": "smartsys", "claimId": ""}, {"doc id": "3", "docowner": "3", "documentType": "cte1_0", "docver": "1", "ogsys": "fc", "docstat1": "smartsys", "claimId": ""}, {"doc id": "4", "docowner": "4", "documentType": "cte1_0", "docver": "1", "ogsys": "fc", "docstat1": "smartsys", "claimId": ""}], "totalCount": 5}
Here is my config script:
input
{
  file
  {
    path => "${LS_XML_PATH}/*.json"
    start_position => "beginning"
    sincedb_path => "NUL"
    exclude => "*.gz"
    mode => "read"
    # codec => json_lines   # tried this while commenting out the other codecs
    # codec => json         # tried this while commenting out the other codecs
    codec => multiline      # tried this while commenting out the other codecs
    {
      pattern => '^\{'
      # pattern => "^}"     # also tried these patterns
      # pattern => "dgdocs"
      negate => true
      what => "previous"
      max_lines => 2000000
      auto_flush_interval => 3
    }
  }
}
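As I understand the multiline codec (my assumption from the docs, not verified), pattern => '^\{' with negate => true and what => "previous" appends every line that does not start with { to the previous event, so a pretty-printed file gets stitched back into one event per top-level {. Since my sample file is all on one line anyway, the whole document should arrive as a single message field after the 3-second auto_flush, roughly:

"message" => "{\"dgdocs\": [{\"doc id\": \"0\", ...}], \"totalCount\": 5}"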
filter
{
  json
  {
    source => "message"
  }
  if [dgdocs]
  {
    split
    {
      id => "cte1main_split_field"
      field => "[dgdocs]"
    }
  }
  mutate { gsub => ["message", "^(.*)dgdocs: ", ""] }
  mutate
  {
    # field references must match the JSON keys exactly ("doc id", "docowner", ...)
    add_field =>
    {
      "creationDate"    => "%{[@metadata][creationDate]}"
      "docId"           => "%{[dgdocs][doc id]}"
      "docOwner"        => "%{[dgdocs][docowner]}"
      "docType"         => "%{[dgdocs][documentType]}"
      "docVersion"      => "%{[dgdocs][docver]}"
      "ogsys"           => "%{[dgdocs][ogsys]}"
      "docStatus1"      => "%{[dgdocs][docstat1]}"
      "event1Timestamp" => "%{[@metadata][creationDate]}"
      "claimId"         => "%{[dgdocs][claimId]}"
    }
  }
  mutate
  {
    remove_field => ["message", "tags", "@timestamp", "host", "@version", "path", "dgdocs"]
  }
}
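For reference, this is what I expect one event to look like in the rubydebug output after the split and the mutates (a sketch based on the first sample record, assuming the field references resolve):

{
       "creationDate" => "%{[@metadata][creationDate]}",
              "docId" => "0",
           "docOwner" => "0",
            "docType" => "cte1_0",
         "docVersion" => "1",
              "ogsys" => "fc",
         "docStatus1" => "smartsys",
    "event1Timestamp" => "%{[@metadata][creationDate]}",
            "claimId" => "",
         "totalCount" => 5
}

Note that creationDate and event1Timestamp would come out as the literal sprintf string here, because [@metadata][creationDate] is never set anywhere in this config.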
output
{
  # for debugging purposes
  stdout { codec => rubydebug }
}
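One workaround I am considering but have not tested: pre-flattening the file with jq so each record becomes one line, which the json_lines codec could then read directly without any multiline handling (file names here are just placeholders):

jq -c '.dgdocs[]' input.json > records.ndjson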