XML Filter: How import data from XML file

Hello everyone,

I'm trying to upload my xml file to logstash/elasticsearch to improve my research capabilites.
I already tried a lot of ways but I just can import all the file with the xml tags (very confusing). I would like to select some fields from my source (xml files) and upload them to the elasticsearch.
For example I would like to upload, from the XML, the fields "DateAndTime", "HeadLine" and "DataContent" (I attached a xml example).
I think my filter is not correct but I don't know why.
How can I do that?
Where can I find more information about XML filters and logstash/elasticsearch?
Do you know any online simulator for XML filters?
Thank you for your help.
Regards.

XML file:

<?xml version="1.0" encoding="UTF-8"?>
<NewsML>
   <TopicSet FormalName="TopicSet" />
   <NewsEnvelope>
      <DateAndTime>20200226T210709</DateAndTime>
      <NewsProduct FormalName="Service/>
      <Priority FormalName="5" />
   </NewsEnvelope>
   <NewsItem>
      <Identification>
         <NewsIdentifier>
            <ProviderId>prov.com</ProviderId>
            <DateId>20210427</DateId>
            <NewsItemId>28056701</NewsItemId>
            <RevisionId Update="N" PreviousRevision="0">1</RevisionId>
            <PublicIdentifier>urn:newsml:prov.com:20200427:28056701:1</PublicIdentifier>
         </NewsIdentifier>
         <DateLabel>21/01/2020 21:07:09</DateLabel>
      </Identification>
      <NewsManagement>
       ...
      </NewsManagement>
      <NewsComponent Essential="no" EquivalentsList="no">
         <TopicSet FormalName="NewsSubjects">
            <Topic>
              ...
            </Topic>
         </TopicSet>
         <NewsLines>
            <HeadLine xml:lang="x-default"><![CDATA[Title]]></HeadLine>
            <SubHeadLine><![CDATA[Sub title]]></SubHeadLine>
            <ByLine>Author</ByLine>
            <DateLine>13/02/2020</DateLine>
            <CreditLine>Press</CreditLine>
         </NewsLines>
         <AdministrativeMetadata>
            ...
         </AdministrativeMetadata>
         <DescriptiveMetadata>
            ...
         </DescriptiveMetadata>
         <ContentItem>           
            <DataContent><![CDATA[<P>Text1.</P>
<P>Text2...</P>
<P></P>
<P>ER // JMR</P>
</DataContent>
         </ContentItem>
      </NewsComponent>
   </NewsItem>
</NewsML>

My logstash.conf

input {

file {
    path => "/home/user/file.xml"
    start_position => "beginning"

    type => "xml"
    codec => multiline {
      pattern => "/NewsML/NewsItem"
      negate => true
      what => "previous"
    }
  }
}

filter {
  xml {
    source => "message"
    store_xml => false
    xpath => [                                                                                                                                                                            '/NewsML/NewsItem/Identification/NewsIdentifier/ProviderId/text()',"provider_Id"]
      '/NewsML/NewsItem/Identification/DateLabel/text()', "Identification_DateLabel",
      '/NewsML/NewsItem/NewsComponent/NewsLines/HeadLine/text()', "NewsLines_HeadLine",
      '/NewsML/NewsItem/NewsComponent/ContentItem/DataContent/text()', "DataContent"
    ]                                                                                                                                                                                                                                                                                                                                                   
}


output {
    elasticsearch {
        hosts => "http://localhost:9200"
        index => "myindex"
    }
    stdout { codec => rubydebug  }
}

I do not see anything in your data that matches that, so I would expect the multiline codec to never flush an event.

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.