Please help with grok pattern

Hi!
Log string:
"12/Mar/2018:07:45:23 +0000"; "91.198.36.14"; "UA:-:-"; "10.10.10.1:80"; "200"; "0.000 sec"; "0.003 sec"; "www.some-site.ua"; "GET /services/ HTTP/1.1"; "327"; "https://www.some-site.ua/services/"; "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36"

Logstash pattern:
NGINX_ACCESS_UPSTREAM "%{HTTPDATE:timestamp}"; "%{IPORHOST:clientip}"; "%{DATA}"; "(?:(?:%{IPORHOST:upstream_addr}:%{NUMBER:upstream_port} : %{IPORHOST:upstreamnext}:%{NUMBER:upstreamnext_port}|%{IPORHOST:upstream_addr}:%{NUMBER:upstream_port})|(?:%{IPORHOST:upstream_addr}:%{NUMBER:upstream_port}, %{IPORHOST:upstreamnext}:%{NUMBER:upstreamnext_port}|%{IPORHOST:upstream_addr}:%{NUMBER:upstream_port})|-)"; "(?:(?:%{NUMBER:upstream_status}|-) : (?:%{NUMBER:upstreamnext_status}|-)|(?:%{NUMBER:upstream_status}|-), (?:%{NUMBER:upstreamnext_status}|-)|(?:%{NUMBER:upstream_status}|-))"; "(?:(?:%{NUMBER:upstream_connect_time}|-) : (?:%{NUMBER:upstreamnext_connect_time}|-) sec|(?:%{NUMBER:upstream_connect_time}|-), (?:%{NUMBER:upstreamnext_connect_time}|-) sec|(?:%{NUMBER:upstream_connect_time}|-) sec)"; "(?:(?:%{NUMBER:upstream_response_time}|-) : (?:%{NUMBER:upstreamnext_response_time}|-) sec|(?:%{NUMBER:upstream_response_time}|-), (?:%{NUMBER:upstreamnext_response_time}|-) sec|(?:%{NUMBER:upstream_response_time}|-) sec)"; "(?:%{HOSTPORT:http_host}|%{IPORHOST:http_host}|-)"; "(?:%{WORD:nginx_http_request} %{URIPATHPARAM:nginx_request_desc}(?: HTTP/%{NUMBER:nginx_http_version})?|-)"; "%{NUMBER:bytes}"; (?:"(?:%{URI:referrer}|-)"|%{QS:referrer}); %{QS:agent}

This pattern was tested at http://grokconstructor.appspot.com/do/match#result

But it does not work — events come out tagged with _grokparsefailure

What is wrong?

How is the grok pattern being configured in your pipeline? Can you paste the relevant plugin configuration? If referencing additional named grok patterns, can you include those separately?

cat /etc/logstash/conf.d/patterns/nginx

NGINX_ACCESS_UPSTREAM \"%{HTTPDATE:timestamp}\"; \"%{IPORHOST:clientip}\"; \"%{DATA}\"; \"(?:(?:%{IPORHOST:upstream_addr}:%{NUMBER:upstream_port} : %{IPORHOST:upstreamnext}:%{NUMBER:upstreamnext_port}|%{IPORHOST:upstream_addr}:%{NUMBER:upstream_port})|(?:%{IPORHOST:upstream_addr}:%{NUMBER:upstream_port}, %{IPORHOST:upstreamnext}:%{NUMBER:upstreamnext_port}|%{IPORHOST:upstream_addr}:%{NUMBER:upstream_port})|-)\"; \"(?:(?:%{NUMBER:upstream_status}|-) : (?:%{NUMBER:upstreamnext_status}|-)|(?:%{NUMBER:upstream_status}|-), (?:%{NUMBER:upstreamnext_status}|-)|(?:%{NUMBER:upstream_status}|-))\"; \"(?:(?:%{NUMBER:upstream_connect_time}|-) : (?:%{NUMBER:upstreamnext_connect_time}|-) sec|(?:%{NUMBER:upstream_connect_time}|-), (?:%{NUMBER:upstreamnext_connect_time}|-) sec|(?:%{NUMBER:upstream_connect_time}|-) sec)\"; \"(?:(?:%{NUMBER:upstream_response_time}|-) : (?:%{NUMBER:upstreamnext_response_time}|-) sec|(?:%{NUMBER:upstream_response_time}|-), (?:%{NUMBER:upstreamnext_response_time}|-) sec|(?:%{NUMBER:upstream_response_time}|-) sec)\"; \"(?:%{HOSTPORT:http_host}|%{IPORHOST:http_host}|-)\"; \"(?:%{WORD:nginx_http_request} %{URIPATHPARAM:nginx_request_desc}(?: HTTP/%{NUMBER:nginx_http_version})?|-)\"; \"%{NUMBER:bytes}\"; (?:\"(?:%{URI:referrer}|-)\"|%{QS:referrer}); %{QS:agent}

cat /etc/logstash/conf.d/01-beats-5044.conf

input {
  beats {
    # Keep idle Beats connections open for a day (default is 60s).
    client_inactivity_timeout => 86400
    port => 5044
  }
}

cat /etc/logstash/conf.d/22-nginx-upstream.conf

filter {

  if [type] == "nginx_upstream" {
    mutate {
      # One add_tag with both values: repeating the add_tag key inside a
      # single hash relies on duplicate-key merging and is rejected by
      # newer Logstash releases.
      add_tag => ["nginx_upstream", "nginx"]
    }

    grok {
      # Must run BEFORE the date filter below, since date reads the
      # `timestamp` field this grok extracts.
      patterns_dir => ["/etc/logstash/conf.d/patterns"]
      match => { "message" => "%{NGINX_ACCESS_UPSTREAM}" }
      # Distinct failure tag so a parse failure here is distinguishable
      # from failures in any other grok filter in the pipeline.
      tag_on_failure => ["_grokparsefailure_nginx_upstream"]
    }

    date {
      # The log timestamp is "12/Mar/2018:07:45:23 +0000" — note the space
      # before the zone, so the pattern needs "ss Z", not "ssZ".
      match => [ "timestamp" , "dd/MMM/yyyy:HH:mm:ss Z" ]
      target => "@timestamp"
    }

    geoip {
      source => "clientip"
      database => "/etc/logstash/GeoLiteCity.dat"
    }

    useragent {
      # grok stores the user-agent string in `agent` (the %{QS:agent}
      # capture); no filter in this pipeline produces a `browser-name`
      # field, so the previous source could never resolve.
      source => "agent"
    }

  }

}

cat /etc/logstash/conf.d/24-nginx-output.conf

output {
    if "nginx" in [tags] {

        elasticsearch {
            hosts => ["localhost:9200"]
            index => "nginx-%{+YYYY.MM.dd}"
            # template/template_name are only honored when manage_template
            # is true; with false, nginx_template.json was never installed
            # into Elasticsearch.
            manage_template => true
            template => "/etc/logstash/nginx_template.json"
            template_name => "nginx"
        }

    }
}

cat /etc/logstash/nginx_template.json

{
  "mappings": {
    "_default_": {
      "_all": {
        "enabled": true,
        "norms": {
          "enabled": false
        }
      },
      "dynamic_templates": [
        {
          "template1": {
            "mapping": {
              "doc_values": true,
              "ignore_above": 1024,
              "index": "not_analyzed",
              "type": "{dynamic_type}"
            },
            "match": "*"
          }
        }
      ],
      "properties": {
        "@timestamp": {
          "type": "date"
        },
        "message": {
          "type": "string",
          "index": "analyzed"
        },
        "offset": {
          "type": "long",
          "doc_values": true
        },
        "geoip"  : {
          "type" : "object",
          "dynamic": true,
          "properties" : {
            "location" : { "type" : "geo_point" }
          }
        }
      }
    }
  },
  "settings": {
    "index.refresh_interval": "5s"
  },
  "template": "nginx-*"
}

This pattern works fine on another nginx log format

NGINX_ACCESS_UPSTREAM \[%{HTTPDATE:timestamp}\] %{IPORHOST:clientip} \[%{DATA}\] -> (?:(?:%{IPORHOST:upstream_addr}:%{NUMBER:upstream_port} : %{IPORHOST:upstreamnext}:%{NUMBER:upstreamnext_port}|%{IPORHOST:upstream_addr}:%{NUMBER:upstream_port})|(?:%{IPORHOST:upstream_addr}:%{NUMBER:upstream_port}, %{IPORHOST:upstreamnext}:%{NUMBER:upstreamnext_port}|%{IPORHOST:upstream_addr}:%{NUMBER:upstream_port})|-) (?:"(?:%{NUMBER:upstream_status}|-) : (?:%{NUMBER:upstreamnext_status}|-)"|"(?:%{NUMBER:upstream_status}|-), (?:%{NUMBER:upstreamnext_status}|-)"|"(?:%{NUMBER:upstream_status}|-)") (?:\[(?:%{NUMBER:upstream_connect_time}|-) : (?:%{NUMBER:upstreamnext_connect_time}|-) sec\]|\[(?:%{NUMBER:upstream_connect_time}|-), (?:%{NUMBER:upstreamnext_connect_time}|-) sec\]|\[(?:%{NUMBER:upstream_connect_time}|-) sec\]) (?:\[(?:%{NUMBER:upstream_response_time}|-) : (?:%{NUMBER:upstreamnext_response_time}|-) sec\]|\[(?:%{NUMBER:upstream_response_time}|-), (?:%{NUMBER:upstreamnext_response_time}|-) sec\]|\[(?:%{NUMBER:upstream_response_time}|-) sec\]) "(?:%{HOSTPORT:http_host}|%{IPORHOST:http_host}|-)" "(?:%{WORD:nginx_http_request} %{URIPATHPARAM:nginx_request_desc}(?: HTTP/%{NUMBER:nginx_http_version})?|-)" %{NUMBER:bytes} (?:"(?:%{URI:referrer}|-)"|%{QS:referrer}) %{QS:agent}

Any ideas?

In the simplest case, I can't replicate with Logstash 6.2.2:

"12/Mar/2018:07:45:23 +0000"; "91.198.36.14"; "UA:-:-"; "10.10.10.1:80"; "200"; "0.000 sec"; "0.003 sec"; "www.some-site.ua"; "GET /services/ HTTP/1.1"; "327"; "https://www.some-site.ua/services/"; "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36"

-- file offending-line.log

NGINX_ACCESS_UPSTREAM \"%{HTTPDATE:timestamp}\"; \"%{IPORHOST:clientip}\"; \"%{DATA}\"; \"(?:(?:%{IPORHOST:upstream_addr}:%{NUMBER:upstream_port} : %{IPORHOST:upstreamnext}:%{NUMBER:upstreamnext_port}|%{IPORHOST:upstream_addr}:%{NUMBER:upstream_port})|(?:%{IPORHOST:upstream_addr}:%{NUMBER:upstream_port}, %{IPORHOST:upstreamnext}:%{NUMBER:upstreamnext_port}|%{IPORHOST:upstream_addr}:%{NUMBER:upstream_port})|-)\"; \"(?:(?:%{NUMBER:upstream_status}|-) : (?:%{NUMBER:upstreamnext_status}|-)|(?:%{NUMBER:upstream_status}|-), (?:%{NUMBER:upstreamnext_status}|-)|(?:%{NUMBER:upstream_status}|-))\"; \"(?:(?:%{NUMBER:upstream_connect_time}|-) : (?:%{NUMBER:upstreamnext_connect_time}|-) sec|(?:%{NUMBER:upstream_connect_time}|-), (?:%{NUMBER:upstreamnext_connect_time}|-) sec|(?:%{NUMBER:upstream_connect_time}|-) sec)\"; \"(?:(?:%{NUMBER:upstream_response_time}|-) : (?:%{NUMBER:upstreamnext_response_time}|-) sec|(?:%{NUMBER:upstream_response_time}|-), (?:%{NUMBER:upstreamnext_response_time}|-) sec|(?:%{NUMBER:upstream_response_time}|-) sec)\"; \"(?:%{HOSTPORT:http_host}|%{IPORHOST:http_host}|-)\"; \"(?:%{WORD:nginx_http_request} %{URIPATHPARAM:nginx_request_desc}(?: HTTP/%{NUMBER:nginx_http_version})?|-)\"; \"%{NUMBER:bytes}\"; (?:\"(?:%{URI:referrer}|-)\"|%{QS:referrer}); %{QS:agent}

-- file patterns/nginx

╭─{ yaauie@castrovel:~/src/elastic/discuss-scratch/123497-grok-works-in-debugger-but-not-in-logstash }
╰─○ cat offending-line.log | ~/src/elastic/releases/logstash-6.2.2/bin/logstash -e 'input { stdin {} } filter { grok { patterns_dir => "'"$(pwd)/patterns"'" match => { "message" => "%{NGINX_ACCESS_UPSTREAM}"}} } output { stdout { codec => rubydebug }}'
Sending Logstash's logs to /Users/yaauie/src/elastic/releases/logstash-6.2.2/logs which is now configured via log4j2.properties
[2018-03-13T21:12:11,273][INFO ][logstash.modules.scaffold] Initializing module {:module_name=>"fb_apache", :directory=>"/Users/yaauie/src/elastic/releases/logstash-6.2.2/modules/fb_apache/configuration"}
[2018-03-13T21:12:11,297][INFO ][logstash.modules.scaffold] Initializing module {:module_name=>"netflow", :directory=>"/Users/yaauie/src/elastic/releases/logstash-6.2.2/modules/netflow/configuration"}
[2018-03-13T21:12:11,529][WARN ][logstash.config.source.multilocal] Ignoring the 'pipelines.yml' file because modules or command line options are specified
[2018-03-13T21:12:12,163][INFO ][logstash.runner          ] Starting Logstash {"logstash.version"=>"6.2.2"}
[2018-03-13T21:12:12,590][INFO ][logstash.agent           ] Successfully started Logstash API endpoint {:port=>9600}
[2018-03-13T21:12:16,023][INFO ][logstash.pipeline        ] Starting pipeline {:pipeline_id=>"main", "pipeline.workers"=>8, "pipeline.batch.size"=>125, "pipeline.batch.delay"=>50}
[2018-03-13T21:12:16,359][INFO ][logstash.pipeline        ] Pipeline started succesfully {:pipeline_id=>"main", :thread=>"#<Thread:0x2113a8f0 run>"}
[2018-03-13T21:12:16,450][INFO ][logstash.agent           ] Pipelines running {:count=>1, :pipelines=>["main"]}
{
                      "host" => "castrovel.local",
                     "agent" => "\"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36\"",
                   "message" => "\"12/Mar/2018:07:45:23 +0000\"; \"91.198.36.14\"; \"UA:-:-\"; \"10.10.10.1:80\"; \"200\"; \"0.000 sec\"; \"0.003 sec\"; \"www.some-site.ua\"; \"GET /services/ HTTP/1.1\"; \"327\"; \"https://www.some-site.ua/services/\"; \"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36\"",
             "upstream_port" => "80",
                  "clientip" => "91.198.36.14",
                 "timestamp" => "12/Mar/2018:07:45:23 +0000",
           "upstream_status" => "200",
    "upstream_response_time" => "0.003",
                  "referrer" => "https://www.some-site.ua/services/",
     "upstream_connect_time" => "0.000",
             "upstream_addr" => "10.10.10.1",
                 "http_host" => "www.some-site.ua",
                "@timestamp" => 2018-03-13T21:12:16.417Z,
        "nginx_http_request" => "GET",
        "nginx_request_desc" => "/services/",
        "nginx_http_version" => "1.1",
                  "@version" => "1",
                     "bytes" => "327"
}
[2018-03-13T21:12:17,111][INFO ][logstash.pipeline        ] Pipeline has terminated {:pipeline_id=>"main", :thread=>"#<Thread:0x2113a8f0 run>"}%
[success]

A couple of issues I do notice though:

  • I see that you're running a date filter to extract the event's timestamp from its timestamp field potentially before timestamp is populated in the following grok filter.
  • I don't see where you have the type being populated (perhaps in the Filebeat's configuration?); is it possible that this grok parse failure is coming from some other grok filter? Setting a specific tag_on_failure for each instance of the grok filter can help us debug.

[root@up01 filebeat]# cat /etc/filebeat/filebeat.yml

# Filebeat: ship the nginx upstream log to Logstash on elk.server:5044.
filebeat.modules:
filebeat.prospectors:
- input_type: log
  paths:
    - /var/log/upstream_log
  # NOTE(review): document_type is what sets the [type] field tested by the
  # Logstash conditional (`if [type] == "nginx_upstream"`). It was removed
  # in Filebeat 6.x — confirm this host runs Filebeat 5.x, otherwise the
  # conditional never matches and the filter chain is skipped entirely.
  document_type: nginx_upstream
output.logstash:
  enabled: true
  hosts: ["elk.server:5044"]
logging.level: info

Well... I got it!

It does not work with patterns_dir => "/etc/logstash/conf.d/patterns" (the directory):

[root@de2log01p ~]# cat /tmp/test.log | sudo -u logstash /usr/share/logstash/bin/logstash -e 'input { stdin {} } filter { grok { patterns_dir => "'"/etc/logstash/conf.d/patterns"'" match => { "message" => "%{NGINX_ACCESS_UPSTREAM}"}} } output { stdout { codec => rubydebug }}'

{
      "@version" => "1",
          "host" => "elk.server",
    "@timestamp" => 2018-03-14T10:38:48.324Z,
       "message" => "\"12/Mar/2018:07:45:23 +0000\"; \"91.198.36.14\"; \"UA:-:-\"; \"10.10.10.1:80\"; \"200\"; \"0.000 sec\"; \"0.003 sec\"; \"www.some-site.ua\"; \"GET /services/ HTTP/1.1\"; \"327\"; \"https://www.some-site.ua/services/\"; \"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36\"",
          "tags" => [
        [0] "_grokparsefailure"
    ]
}

Everything is OK with patterns_dir => "/etc/logstash/conf.d/patterns/nginx" (the single patterns file):

[root@de2log01p ~]# cat /tmp/test.log | sudo -u logstash /usr/share/logstash/bin/logstash -e 'input { stdin {} } filter { grok { patterns_dir => "'"/etc/logstash/conf.d/patterns/nginx"'" match => { "message" => "%{NGINX_ACCESS_UPSTREAM}"}} } output { stdout { codec => rubydebug }}'

{
                     "agent" => "\"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36\"",
             "upstream_addr" => "10.10.10.1",
        "nginx_http_request" => "GET",
        "nginx_request_desc" => "/services/",
        "nginx_http_version" => "1.1",
                   "message" => "\"12/Mar/2018:07:45:23 +0000\"; \"91.198.36.14\"; \"UA:-:-\"; \"10.10.10.1:80\"; \"200\"; \"0.000 sec\"; \"0.003 sec\"; \"www.some-site.ua\"; \"GET /services/ HTTP/1.1\"; \"327\"; \"https://www.some-site.ua/services/\"; \"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36\"",
                 "http_host" => "www.some-site.ua",
           "upstream_status" => "200",
                  "referrer" => "https://www.some-site.ua/services/",
                "@timestamp" => 2018-03-14T10:39:13.827Z,
     "upstream_connect_time" => "0.000",
                     "bytes" => "327",
                  "clientip" => "91.198.36.14",
                  "@version" => "1",
                      "host" => "elk.server",
    "upstream_response_time" => "0.003",
                 "timestamp" => "12/Mar/2018:07:45:23 +0000",
             "upstream_port" => "80"
}

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.