Help with parsing xml data

Hi everyone,

I'm new to using ELK stack. I'm trying to parse an xml audit file. Even though I get the output as I need, because of the length of code I'm assuming there could be a simpler solution than what I've written. The XML filter doesn't just give values, instead I'm getting the tags along with it which I'm removing with gsub. Can someone please help me with this?

Thanks,
Krishna

Sample of data I'm using:

<engineAuditEvent encrypted="n" signed="n">
    <localTime>2018-03-09 16:25:21.520 -0500</localTime>
    <time>2018-03-09 21:25:21.520</time>
    <utc>2018-03-09 21:25:21.520</utc>
    <engineName>TESTENG1</engineName>
    <msg>
        <msgID>AUD0205I</msgID>
        <msgText>Session 34f32f38-12fd-4907-abfe-3036cd328847 [SFTP]: Outbound plain socket connection with primary node ftpp.test established.</msgText>
        <condCode>0</condCode>
        <auditData>
            <sessionId>34f32f38-12fd-4907-abfe-3036cd328847</sessionId>
            <protocol>SFTP</protocol>
            <clientAddress>8.8.8.8</clientAddress>
            <clientPort>46723</clientPort>
            <clientIdent>
                <userid>TEST</userid>
            </clientIdent>
            <serverAddress>ftpp.test</serverAddress>
            <serverPort>10022</serverPort>
            <outboundPort>34203</outboundPort>
            <serverIdent>
                <userid>TEST</userid>
            </serverIdent>
        </auditData>
    </msg>
</engineAuditEvent>

test-pipeline.conf

input {
 file {
  path => "/home/testdev/ELK/test-logs/testlog.xml"
  start_position => "beginning"
  sincedb_path => "/dev/null"
  type => xml
	codec => multiline {
		pattern => '</engineAuditEvent>'
		negate => true
		what => next
		}   
	}
}

# ---------------- Filter ----------------
filter {
if [type] == "xml" {
 
	xml {
#		remove_namespaces => true
		store_xml => "false" 
		source => "message"
		xpath => [ 
			"/engineAuditEvent/localTime", "est",
			"/engineAuditEvent/msg/msgText", "msg",
			"/engineAuditEvent/msg/msgText", "msgID",
			"/engineAuditEvent/engineName", "engine",			
			"/engineAuditEvent/msg/auditData/sessionId", "sessionid",
			"/engineAuditEvent/msg/auditData/sessionId", "clientAddress",
			"/engineAuditEvent/msg/auditData/clientIdent/userid", "userid" 
			]
		}
	mutate {
     		gsub => [ "est", "<localTime>", "" ]
			gsub => [ "msg", "<msgText>", "" ]
			gsub => [ "msg", "</msgText>", "" ]
			gsub => [ "sessionid", "<sessionId>", "" ]
			gsub => [ "sessionid", "</sessionId>", "" ]
			gsub => [ "userid", "<userid>", "" ]
			gsub => [ "userid", "</userid>", "" ]
			gsub => [ "msgID", "<msgid>", "" ]
			gsub => [ "msgID", "</msgid>", "" ]
			gsub => [ "clientAddress", "<clientip>", "" ]
			gsub => [ "clientAddress", "</clientip>", "" ]
		}
	grok {
			match => { "est" => "%{TIMESTAMP_ISO8601:localtime}%{GREEDYDATA:extradata}" }
			remove_field => [ "extradata", "est" ]			
			}
	date {
			match => [ "localtime", "yyyy-MM-dd HH:mm:ss.SSS" ]
			target => "@timestamp"
          } 
		  
	grok {
			match => { "msg" => "%{GREEDYDATA:msgtxt}" }
#			remove_field => [ "msg", "tags", "message" ]			
	}
	grok {
			match => { "userid" => "%{GREEDYDATA:loginid}" }	
	}
	grok {
			match => { "sessionid" => "%{GREEDYDATA:session-id}" }			
	}
		
	if "_grokparsefailure" in [tags] {
                drop { }
        }

 }
}

output {
	stdout { codec => rubydebug }
}

OK. There are better xpath expressions that you can use. Instead of fetching the XML node, using /engineAuditEvent/localTime, you should pull the text out of each node. Something like this will work better.

      xpath => {
        "/engineAuditEvent/localTime/text()" => "est"
        "/engineAuditEvent/msg/msgText/text()" => "msg"
        "/engineAuditEvent/msg/msgID/text()" => "msgID"
        "/engineAuditEvent/engineName/text()" => "engine"
        "/engineAuditEvent/msg/auditData/sessionId/text()" => "sessionid"
        "/engineAuditEvent/msg/auditData/clientAddress/text()" => "clientAddress"
        "/engineAuditEvent/msg/auditData/clientIdent/userid/text()" => "userid"
      }

The next problem you will have is that xpath ignores force_array => false, and still puts everything into arrays. So you then have to do this to them all.

if [fieldx] { mutate { replace => { "fieldx" => "%{[fieldx][0]}" } } }

You may find it easier to set store_xml => true, and add target => theXML, then mutate+add_field to pull stuff out of theXML, and then delete that field.

BTW, mutate+add_field or mutate+rename would be a more traditional approach than

	grok {
			match => { "sessionid" => "%{GREEDYDATA:session-id}" }			
	}
1 Like

Thank you very much Badger.

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.