Indexing a local xml-file


#1

Hi,

I have experience with indexing csv-files into elasticsearch but not with local xml-files. I am following this tutorial which explains how to index xml-data from a website with Logstash.

My environment does not have access to the internet so I've downloaded the xml-file. I copy/pasted the conf, edited the conf, edited and posted the mapping and ran the conf in Logstash. Unfortunately nothing happens. Logstash starts the pipeline and is waiting to pick up an xml-file but it does not collect the data even though the dataset is available at the given location. Any idea what might go wrong?

This is the conf:

input
{
    file
    {
        path => "/home/DSAdmin/stations.xml"
        start_position => "beginning"
        ## BUG FIX: the original had `ignore_older => 0`, which tells the
        ## file input to ignore any file older than 0 seconds — i.e. every
        ## pre-existing file — so the pipeline started but never read the
        ## data. Removing it (or setting a large value) lets Logstash pick
        ## up the already-downloaded file.
        ##
        ## The file input emits one event per LINE, but the xml filter
        ## below needs the whole document in a single event (the tutorial's
        ## http_poller input delivered the complete body at once). The
        ## multiline codec joins every line that does not start the
        ## document onto the previous event, so the xml filter receives
        ## one complete document. auto_flush_interval forces the final
        ## (still-open) event out after 1s of inactivity.
        ## NOTE(review): pattern assumes the document starts with an XML
        ## declaration or root element at column 0 — confirm against the
        ## actual stations.xml.
        codec => multiline
        {
            pattern => "^<\?xml|^<stations"
            negate => "true"
            what => "previous"
            auto_flush_interval => 1
        }
        ## /dev/null sincedb => re-read the file from the start on every run
        sincedb_path => "/dev/null"
     }
}

filter {

  ## interpret the message payload as XML; the parsed tree lands under
  ## the "parsed" field (so [parsed][station] is an array of stations)
  xml {
    source => "message"
    target => "parsed"
  }

  ## Split out each "station" record in the XML into a different event
  split {
    field => "[parsed][station]"
    add_field => {
      ## generate a unique id for the station combined with the sensor time to prevent duplicates
      id                  => "%{[parsed][station][id]}-%{[parsed][station][lastCommWithServer]}"
      stationName                => "%{[parsed][station][name]}"
      lastCommWithServer  => "%{[parsed][station][lastCommWithServer]}"
      lat                 => "%{[parsed][station][lat]}"
      long                => "%{[parsed][station][long]}"
      numBikes             => "%{[parsed][station][nbBikes]}"
      numEmptyDocks        => "%{[parsed][station][nbEmptyDocks]}"
    }
  }


  mutate {
    ## Convert the numeric fields to the appropriate data type from strings
    convert => {
      "numBikes"       => "integer"
      "numEmptyDocks"  => "integer"
      "lat"           => "float"
      "long"          => "float"
    }
    ## put the geospatial value in the correct [ longitude, latitude ] format
    ## (Elasticsearch geo_point arrays are [lon, lat], the reverse of lat/lon)
    add_field => { "location" => [ "%{[long]}", "%{[lat]}" ]}
    ## get rid of the extra fields we don't need
    ## (http_poller_metadata is left over from the tutorial's http_poller input;
    ## harmless to keep in the list for a file input)
    remove_field => [ "message", "parsed", "lat", "long", "host", "http_poller_metadata"]
  }

## use the embedded Unix timestamp (milliseconds) as the event @timestamp,
## then drop the raw field since it is no longer needed
 date {
    match => ["lastCommWithServer", "UNIX_MS"]
    remove_field => ["lastCommWithServer"]
  }

}

output {
    elasticsearch {
        action => "index"
        hosts => "localhost"
        ## daily indices; matches the "bikestatus-*" template pattern below
        index => "bikestatus-dc-%{+YYYY.MM.dd}"
        ## NOTE(review): document_type was deprecated in ES 6 and removed
        ## in ES 8 — confirm it is valid for the target cluster version
        document_type => "bikestatus"
        ## use the station-id + timestamp id built in the filter so
        ## re-running the pipeline overwrites instead of duplicating
        document_id => "%{[id]}"
    }

    ## also print each event to the console for debugging
    stdout {}
}

Mapping:

PUT _template/bikestatus
{
  "template": "bikestatus-*",
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0
  },
  "mappings": {
    "_default_": {
      "dynamic_templates": [
        {
          "string_fields": {
            "match": "*",
            "match_mapping_type": "string",
            "mapping": {
              "type": "string",
              "index": "not_analyzed",
              "omit_norms": true,
              "doc_values": true
            }
          }
        }
      ],
      "_all": {
        "enabled": false
      },
      "properties": {
        "@timestamp": {
          "type": "date",
          "format": "dateOptionalTime",
          "doc_values": true
        },
        "location": {
          "type": "geo_point",
          "geohash": true,
          "fielddata": {
            "format": "compressed",
            "precision": "20m"
          }
        },
        "numBikes": { "type": "integer", "doc_values": true },
        "numEmptyDocks": { "type": "integer", "doc_values": true }
      }
    }
  }
}

(Andrew Cholakian) #2

Have you tried doing the same with the stdin input plus cat as a test?

I would try making a minimal config, just your input and output, then incrementally add filters until something breaks. That's a good way of troubleshooting problems.


(system) #3

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.