Hi folks,
just wanted to share the Ruby script that we use for parsing the URLs that we receive in our CDN logs, so that we can do further analysis on them. It is a mix of what you can do with the Grok plugin (https://www.elastic.co/guide/en/logstash/current/plugins-filters-grok.html) to parse the URL and the TLD plugin (https://www.elastic.co/guide/en/logstash/current/plugins-filters-tld.html) to extract the top-, second- and third-level domains. For a given request URL, the result looks as follows:
{
...
"request": {
"path": "/shaded+bags+&+backpacks",
"file": "shaded+bags+&+backpacks",
"query": "version=1529452800769",
"url": "https://www.spreadshirt.ca/shaded+bags+&+backpacks?version=1529452800769",
"proto": "https",
"host": {
"domain": "spreadshirt.ca",
"sld": "spreadshirt",
"subdomain": "www.spreadshirt.ca",
"trd": "www",
"tld": "ca"
},
"method": "GET"
}
...
}
The processed URLs come in handy for easier filtering of data or in case you want to use the data for visualizations.
Just wanted to post the Ruby script here, in case someone else needs it:
# Setup hook of the script: loads the libraries the filter relies on and
# remembers which event field holds the URL data.
#
# @param params [Hash] script parameters; only 'source_field' is read
# @return [Object] the configured source field name (nil when not supplied)
def register(params)
  # URI does the strict parsing, PublicSuffix the domain decomposition.
  %w[uri public_suffix].each { |library| require library }
  @source_field = params['source_field']
end
# Filter hook of the script: splits the URL stored under
# event[@source_field]['url'] into its components and writes them back into
# the same field.
#
# Keys written into the field (only when the corresponding part exists):
#   proto, host, path, query, fragment, ctx1..ctx10, file
# `host` is {'subdomain' => <host>} and, when PublicSuffix accepts the host,
# is replaced by {'tld', 'sld', 'trd', 'domain', 'subdomain'}.
# `ctx1..ctx10` are the leading path segments (everything but the last one),
# `file` is the last path segment when it is not empty.
#
# An empty or '-' URL leaves the event untouched, except that a field named
# 'referrer' is removed entirely. A field that is not a Hash, or whose 'url'
# is not a String, is left untouched instead of raising.
#
# @param event [#get, #set, #remove] the event being processed
# @return [Array] a one-element array containing the (possibly updated) event
def filter(event)
  field = event.get(@source_field)
  return [event] unless field.is_a?(Hash)

  url = field['url']
  # Nothing to parse: a missing URL used to crash in the fallback parser.
  return [event] unless url.is_a?(String)

  if url == '' || url == '-'
    event.remove(@source_field) if @source_field == 'referrer'
    return [event]
  end

  field.merge!(url_components(url))
  expand_host(field)
  expand_path(field)
  event.set(@source_field, field)
  [event]
end

# Parses +url+ strictly with URI and falls back to a lenient splitter for
# URLs that URI rejects (e.g. unescaped spaces).
def url_components(url)
  uri = URI.parse(url)
  parts = { 'proto' => uri.scheme, 'host' => { 'subdomain' => uri.host } }
  %w[path query fragment].each do |name|
    value = uri.public_send(name)
    parts[name] = value if value
  end
  parts
rescue URI::InvalidURIError
  loose_url_components(url)
end

# Lenient splitter for URLs URI cannot parse. Returns {} when the URL has no
# "://". The fragment and the query are cut off first, so they are found even
# when the URL has no path or no query in front of them.
def loose_url_components(url)
  proto, rest = split_once(url, '://')
  return {} unless rest

  rest, fragment = split_once(rest, '#')
  rest, query = split_once(rest, '?')
  slash = rest.index('/')

  parts = { 'proto' => proto, 'host' => { 'subdomain' => slash ? rest[0, slash] : rest } }
  parts['path'] = rest[slash..-1] if slash
  parts['query'] = query if query
  parts['fragment'] = fragment if fragment
  parts
end

# Splits +text+ at the first +separator+; the tail is nil when it is absent.
def split_once(text, separator)
  at = text.index(separator)
  at ? [text[0, at], text[(at + separator.length)..-1]] : [text, nil]
end

# Replaces field['host'] by its public-suffix breakdown when the host is a
# valid domain name. String keys keep the hash consistent with the rest of
# the field (the original mixed String and Symbol keys).
def expand_host(field)
  host = field['host']
  return unless host && PublicSuffix.valid?(host['subdomain'])

  domain = PublicSuffix.parse(host['subdomain'])
  field['host'] = {
    'tld' => domain.tld,
    'sld' => domain.sld,
    'trd' => domain.trd,
    'domain' => domain.domain,
    'subdomain' => domain.subdomain
  }
end

# Derives ctx1..ctx10 and file from field['path'].
def expand_path(field)
  path = field['path']
  return unless path

  trimmed = path.start_with?('/') ? path[1..-1] : path
  # limit -1 keeps a trailing empty segment, so "/a/" has no file.
  segments = trimmed.split('/', -1)
  segments[0...-1].first(10).each_with_index do |segment, index|
    field["ctx#{index + 1}"] = segment
  end

  # An empty path yields no segments at all; do not store a nil file.
  last = segments.last
  field['file'] = last if last && !last.empty?
end
# Inline script test: feeds one event whose request.url carries a path, a
# query and a fragment, and checks every component the filter extracts.
test 'when url https://www.spreadshirt.com' do
  parameters do
    { "source_field" => 'request' }
  end

  in_event do
    { :request => { :url => 'https://www.spreadshirt.com/t-shirts?version=123#test=a' } }
  end

  # Scheme
  expect('request proto is https') { |results| results.first.get('request')['proto'] == 'https' }

  # Host breakdown
  expect('request host is www.spreadshirt.com') { |results| results.first.get('request')['host']['subdomain'] == 'www.spreadshirt.com' }
  expect('request tld is com') { |results| results.first.get('request')['host']['tld'] == 'com' }
  expect('request sld is spreadshirt') { |results| results.first.get('request')['host']['sld'] == 'spreadshirt' }
  expect('request trd is www') { |results| results.first.get('request')['host']['trd'] == 'www' }
  expect('request domain is spreadshirt.com') { |results| results.first.get('request')['host']['domain'] == 'spreadshirt.com' }

  # Path, query string and fragment
  expect('request path is /t-shirts') { |results| results.first.get('request')['path'] == '/t-shirts' }
  expect('request query is version=123') { |results| results.first.get('request')['query'] == 'version=123' }
  expect('request fragment is test=a') { |results| results.first.get('request')['fragment'] == 'test=a' }
end
...
You can use it in your logstash config as follows:
# Ruby filter that runs the URL-extraction script shown above.
ruby {
# Location of the script file that defines register(params) and filter(event).
path => "/etc/logstash/scripts/cdn-url-extract.rb"
# Handed to the script's register(params); "source_field" names the event
# field whose 'url' entry the script parses and enriches.
script_params => { "source_field" => "request" }
}
Cheers,
Martin