Ruby script to quickly parse URLs for easier filtering or use in visualizations

Hi folks,

just wanted to share the Ruby script that we use to parse the URLs that we receive in our CDN logs, so that we can analyze them further. It combines what you can do with the Grok plugin (https://www.elastic.co/guide/en/logstash/current/plugins-filters-grok.html) to parse the URL and the TLD plugin (https://www.elastic.co/guide/en/logstash/current/plugins-filters-tld.html) to extract the top-, second- and third-level domains. For a given request URL, the result looks as follows:

    {
      ...
      "request": {
          "path": "/shaded+bags+&+backpacks",
          "file": "shaded+bags+&+backpacks",
          "query": "version=1529452800769",
          "url": "https://www.spreadshirt.ca/shaded+bags+&+backpacks?version=1529452800769",      
          "proto": "https",
          "host": {
            "domain": "spreadshirt.ca",
            "sld": "spreadshirt",
            "subdomain": "www.spreadshirt.ca",
            "trd": "www",
            "tld": "ca"
          },
          "method": "GET"
       }
      ...
    }

The processed URLs come in handy for easier filtering of data or in case you want to use the data for visualizations.

Just wanted to post the Ruby script here, in case someone else needs it:

# Script set-up hook: loads the libraries the filter relies on and remembers
# which event field holds the URL hash.
#
# @param params [Hash] script parameters; 'source_field' names the event field
#   whose 'url' entry is parsed by `filter`
def register(params)
  # Loaded here rather than at file level so the script brings in its own
  # dependencies when it is registered.
  %w[uri public_suffix].each { |library| require library }

  @source_field = params['source_field']
end

# Splits the URL stored under `<source_field>['url']` into its components and
# writes them back into the same field of the event.
#
# Keys written (when available): 'proto', 'host', 'path', 'query', 'fragment',
# 'ctx1'..'ctx10' (leading path segments) and 'file' (last path segment).
#
# Special cases:
# * field missing / not a Hash, or 'url' missing / not a String: the event is
#   returned untouched (previously a nil URL crashed with NoMethodError);
# * 'url' is '' or '-': nothing is parsed; the whole field is removed when the
#   source field is 'referrer'.
#
# @param event [#get, #set, #remove] the event being processed
# @return [Array] single-element array holding the (possibly modified) event
def filter(event)
  field = event.get(@source_field)
  return [event] unless field.is_a?(Hash)

  url = field['url']
  return [event] unless url.is_a?(String)

  if url == '' || url == '-'
    event.remove(@source_field) if @source_field == 'referrer'
    return [event]
  end

  field.update(split_url(url))

  host = field['host']
  expanded_host = host && host_parts(host['subdomain'])
  field['host'] = expanded_host if expanded_host

  field.update(path_parts(field['path'])) unless field['path'].nil?

  event.set(@source_field, field)
  [event]
end

# Parses +url+ with the standard URI parser, falling back to a lenient manual
# split when the parser rejects it (e.g. unescaped spaces).
#
# @return [Hash] the components found; 'proto' and 'host' are always present
#   (possibly nil) after a successful strict parse
def split_url(url)
  uri = URI.parse(url)
  parts = { 'proto' => uri.scheme, 'host' => { 'subdomain' => uri.host } }
  parts['path'] = uri.path unless uri.path.nil?
  parts['query'] = uri.query unless uri.query.nil?
  parts['fragment'] = uri.fragment unless uri.fragment.nil?
  parts
rescue URI::InvalidURIError
  split_url_leniently(url)
end

# Manual "scheme://host/path?query#fragment" split for URLs URI.parse rejects.
# Returns an empty hash when the URL has no "://" separator.
def split_url_leniently(url)
  scheme, separator, remainder = url.partition('://')
  return {} if separator.empty?

  parts = { 'proto' => scheme }

  # Peel off fragment, then query, then path, so that each is found even when
  # the component before it is absent (e.g. "http://h/a b#top" has a fragment
  # but no query).
  remainder, hash_mark, fragment = remainder.partition('#')
  remainder, question_mark, query = remainder.partition('?')

  slash = remainder.index('/')
  if slash
    parts['host'] = { 'subdomain' => remainder[0, slash] }
    parts['path'] = remainder[slash..-1]
  else
    parts['host'] = { 'subdomain' => remainder }
  end

  parts['query'] = query unless question_mark.empty?
  parts['fragment'] = fragment unless hash_mark.empty?
  parts
end

# Breaks a host name into its public-suffix components.
#
# Keys are strings, like every other key this script writes (the original used
# symbols here). NOTE(review): Logstash is expected to store symbol and string
# keys identically - confirm.
#
# @return [Hash, nil] nil when +host_name+ is nil or PublicSuffix rejects it,
#   in which case the caller keeps the plain { 'subdomain' => host } hash
def host_parts(host_name)
  return nil unless host_name && PublicSuffix.valid?(host_name)

  domain = PublicSuffix.parse(host_name)
  {
    'tld' => domain.tld,
    'sld' => domain.sld,
    'trd' => domain.trd,
    'domain' => domain.domain,
    'subdomain' => domain.subdomain
  }
end

# Derives 'ctx1'..'ctx10' (every segment except the last, at most ten) and
# 'file' (the last segment) from +path+.
def path_parts(path)
  # The -1 limit keeps a trailing empty segment, so "/a/" yields ctx1 = "a"
  # and no file.
  segments = path.sub(%r{\A/}, '').split('/', -1)

  parts = {}
  segments[0...-1].first(10).each_with_index do |segment, index|
    parts["ctx#{index + 1}"] = segment
  end

  # An empty path has no segments at all; do not write a nil 'file'.
  file = segments.last
  parts['file'] = file unless file.nil? || file.empty?
  parts
end

# Inline script test: runs the filter on one fully populated URL with
# source_field set to 'request' and checks every extracted component.
test 'when url https://www.spreadshirt.com' do
  # Shared accessors so each expectation states only the value under test.
  request_of = ->(events) { events.first.get('request') }
  host_of = ->(events) { request_of.call(events)['host'] }

  parameters do
    { "source_field" => 'request' }
  end

  in_event do
    { :request => { :url => 'https://www.spreadshirt.com/t-shirts?version=123#test=a' } }
  end

  # Scheme
  expect('request proto is https') { |events| request_of.call(events)['proto'] == 'https' }

  # Host breakdown
  expect('request host is www.spreadshirt.com') { |events| host_of.call(events)['subdomain'] == 'www.spreadshirt.com' }
  expect('request tld is com') { |events| host_of.call(events)['tld'] == 'com' }
  expect('request sld is spreadshirt') { |events| host_of.call(events)['sld'] == 'spreadshirt' }
  expect('request trd is www') { |events| host_of.call(events)['trd'] == 'www' }
  expect('request domain is spreadshirt.com') { |events| host_of.call(events)['domain'] == 'spreadshirt.com' }

  # Path, query string and fragment
  expect('request path is /t-shirts') { |events| request_of.call(events)['path'] == '/t-shirts' }
  expect('request query is version=123') { |events| request_of.call(events)['query'] == 'version=123' }
  expect('request fragment is test=a') { |events| request_of.call(events)['fragment'] == 'test=a' }
end

...

You can use it in your logstash config as follows:

# Run the URL-extraction script from a file through the ruby filter.
ruby {
  # Location of the script file on the Logstash host.
  path => "/etc/logstash/scripts/cdn-url-extract.rb"
  # Passed to the script's register(params); "source_field" names the event
  # field whose 'url' entry gets parsed.
  script_params => { "source_field" => "request" }
 }

Cheers,
Martin

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.