Optimizing Grok patterns

I have the following Grok pattern for parsing nginx's error log file:

# File input: reads the nginx error log and labels every event so the
# filter stage can select it via [type].
input {
  file {
    id    => "nginx_error_file_input"
    path  => [ "/var/log/nginx/error.log" ]
    codec => "plain"
    # [type] is what the filter section's conditional tests; tags are extra labels.
    type  => "nginx_error_log"
    tags  => [ "nginx", "nginx_errors" ]
  }
}

# Filter stage for nginx error-log events.
#
# Step 1: split the raw line into timestamp, level, pid#tid, optional
#         "*<connection id>" and the free-text remainder (error_message),
#         then use the parsed timestamp as the event's @timestamp.
# Step 2: for events that parsed successfully, pull the optional
#         ", key: value" pairs (client, server, host, request, upstream,
#         referrer) out of error_message, one grok per key.
filter {
  if [type] == "nginx_error_log" {
    grok {
      id => "nginx_error_grok_filter"
      # NGINXTIMESTAMP is not a stock grok pattern; it is expected to be
      # defined in this directory.
      patterns_dir => ["/etc/logstash/patterns"]
      # tid uses NONNEGINT rather than POSINT: POSINT rejects "0", and nginx
      # writes "pid#0" when it is built without thread support, which would
      # make the whole line fail to parse.
      match => { message => "^%{NGINXTIMESTAMP:timestamp}%{SPACE}\[%{WORD:error_level}\]%{SPACE}%{POSINT:pid}#%{NONNEGINT:tid}:%{SPACE}(\*(?<cid>\d+))?%{SPACE}%{GREEDYDATA:error_message}$" }
      # Marker tag: the key/value extraction below only runs when this is set.
      add_tag => [ "_nginx_error_grok_filter" ]
      tag_on_failure => [ "_grokparsefailure_nginx_error_grok_filter" ]
    }
    date {
      id => "nginx_error_date_filter"
      match => [ "timestamp", "yyyy/MM/dd HH:mm:ss" ]
      locale => "en-US"
      # The log timestamp carries no zone information, so it is pinned here.
      timezone => "Europe/Berlin"
      target => "@timestamp"
    }
  }
  if "_nginx_error_grok_filter" in [tags] {
    # Each key is optional in the log line, so every grok is guarded by a
    # cheap regex test; an unguarded grok would tag every line that lacks
    # the key as a parse failure.
    if [error_message] =~ /,\s+client:\s+/ {
      grok {
        id => "nginx_error_client_grok_filter"
        patterns_dir => ["/etc/logstash/patterns"]
        match => { "error_message" => ",%{SPACE}client:%{SPACE}%{IPORHOST:client}" }
        tag_on_failure => [ "_grokparsefailure_nginx_error_client_grok_filter" ]
      }
    }
    if [error_message] =~ /,\s+server:\s+/ {
      grok {
        id => "nginx_error_server_grok_filter"
        patterns_dir => ["/etc/logstash/patterns"]
        match => { "error_message" => ",%{SPACE}server:%{SPACE}%{IPORHOST:server}" }
        add_tag => [ "server_%{server}" ]
        tag_on_failure => [ "_grokparsefailure_nginx_error_server_grok_filter" ]
      }
    }
    if [error_message] =~ /,\s+host:\s+/ {
      grok {
        id => "nginx_error_host_grok_filter"
        patterns_dir => ["/etc/logstash/patterns"]
        # nginx writes the Host header quoted (host: "example.com"); the
        # optional quote accepts both the quoted and the bare form.
        # NOTE(review): the event may already carry a `host` field set by the
        # input; grok appends to an existing field rather than replacing it
        # unless `overwrite` is configured - confirm this is intended.
        match => { "error_message" => ',%{SPACE}host:%{SPACE}"?%{IPORHOST:host}' }
        add_tag => [ "host_%{host}" ]
        tag_on_failure => [ "_grokparsefailure_nginx_error_host_grok_filter" ]
      }
    }
    # The three values below are double-quoted in the log line. %{QS} already
    # consumes the surrounding quotes itself, so wrapping it in additional
    # literal quotes can never match; an explicit capture between the quotes
    # is used instead, which also keeps the quotes out of the stored value.
    if [error_message] =~ /,\s+request:\s+/ {
      grok {
        id => "nginx_error_request_grok_filter"
        patterns_dir => ["/etc/logstash/patterns"]
        match => { "error_message" => ',%{SPACE}request:%{SPACE}"(?<request>[^"]*)"' }
        tag_on_failure => [ "_grokparsefailure_nginx_error_request_grok_filter" ]
      }
    }
    if [error_message] =~ /,\s+upstream:\s+/ {
      grok {
        id => "nginx_error_upstream_grok_filter"
        patterns_dir => ["/etc/logstash/patterns"]
        match => { "error_message" => ',%{SPACE}upstream:%{SPACE}"(?<upstream>[^"]*)"' }
        tag_on_failure => [ "_grokparsefailure_nginx_error_upstream_grok_filter" ]
      }
    }
    if [error_message] =~ /,\s+referrer:\s+/ {
      grok {
        id => "nginx_error_referrer_grok_filter"
        patterns_dir => ["/etc/logstash/patterns"]
        match => { "error_message" => ',%{SPACE}referrer:%{SPACE}"(?<referrer>[^"]*)"' }
        tag_on_failure => [ "_grokparsefailure_nginx_error_referrer_grok_filter" ]
      }
    }
  }
}

As you can see, the configuration relies heavily on conditional Grok filters, which I assume carry a performance penalty (I have not measured it). The readability is also not great.

Is there a way to optimize this Grok pattern to be more readable and (possibly) more performant?

@burnersk Can you try using just the first grok filter with or (|) conditions in the match pattern for the various error messages? Something similar to below:

  # Single-grok variant: one pattern parses the line header, the error text
  # and every optional ", key: value" pair, so no follow-up conditionals or
  # per-key grok filters are needed.
  filter {
    if [type] == "nginx_error_log" {
      grok {
        id => "nginx_error_grok_filter"
        # NGINXTIMESTAMP is expected to be defined in this directory.
        patterns_dir => ["/etc/logstash/patterns"]
        # The pattern is single-quoted because it contains literal double quotes.
        # Layout:
        #   header        timestamp [level] pid#tid: optional *cid
        #   error_message lazy %{DATA}, so it stops before the first known key
        #   key/values    a repeated alternation, so any subset of the keys may
        #                 appear; quoted values are captured without the quotes
        # tid uses NONNEGINT because POSINT rejects the "0" that nginx logs
        # when built without thread support.
        match => { "message" => '^%{NGINXTIMESTAMP:timestamp}%{SPACE}\[%{WORD:error_level}\]%{SPACE}%{POSINT:pid}#%{NONNEGINT:tid}:%{SPACE}(\*(?<cid>\d+))?%{SPACE}%{DATA:error_message}(?:,%{SPACE}(?:client:%{SPACE}%{IPORHOST:client}|server:%{SPACE}%{IPORHOST:server}|host:%{SPACE}"?%{IPORHOST:host}(?::%{POSINT})?"?|request:%{SPACE}"(?<request>[^"]*)"|upstream:%{SPACE}"(?<upstream>[^"]*)"|referrer:%{SPACE}"(?<referrer>[^"]*)"))*$' }
      }
      date {
        id => "nginx_error_date_filter"
        match => [ "timestamp", "yyyy/MM/dd HH:mm:ss" ]
        locale => "en-US"
        # The log timestamp carries no zone information, so it is pinned here.
        timezone => "Europe/Berlin"
        target => "@timestamp"
      }
    }
  }

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.