Elasticsearch Ingest node gsub processor escape character


#1

I am trying to use the gsub processor to replace characters such as [, ] and .

This is the example given in the elastic site (https://www.elastic.co/guide/en/elasticsearch/reference/current/gsub-processor.html):

> {
>   "gsub": {
>     "field": "field1",
>     "pattern": "\.",
>     "replacement": "-"
>   }
> }

However, when I tried to use it, I got the following error:

{"error":{"root_cause":[{"type":"parse_exception","reason":"Failed to parse content to map"}],"type":"parse_exception","reason":"Failed to parse content to map","caused_by":{"type":"json_parse_exception","reason":"Unrecognized character escape '.' (code 46)\n at [Source: org.elasticsearch.transport.netty4.ByteBufStreamInput@46502ea9; line: 1, column: 133]"}},"status":400}


(Alexander Reelsen) #2

Hey,

how about this

POST _ingest/pipeline/_simulate
{
  "pipeline": {
    "description": "describe pipeline",
    "processors": [
      {
        "gsub": {
          "field": "message",
          "pattern": "\\.",
          "replacement": "-"
        }
      }
    ]
  },
  "docs": [
    {
      "_source": {
        "message": "I am a dot..."
      }
    }
  ]
}

#3

Doesn't work either.

What I used:

		"gsub": {
				"field": "message",
				"pattern": "\\.",
				"replacement": "###"
		}

Error message returned:

{"error":{"root_cause":[{"type":"parse_exception","reason":"Failed to parse content to map"}],"type":"parse_exception","reason":"Failed to parse content to map","caused_by":{"type":"json_parse_exception","reason":"Unexpected character (']' (code 93)): expected a value\n at [Source: org.elasticsearch.transport.netty4.ByteBufStreamInput@70472a68; line: 1, column: 1043]"}},"status":400}


(Alexander Reelsen) #4

Hey,

this works for me in the elastic dev-tools

POST _ingest/pipeline/_simulate
{
  "pipeline": {
    "description": "describe pipeline",
    "processors": [
      {
        "gsub": {
          "field": "message",
          "pattern": "\\.",
          "replacement": "###"
        }
      }
    ]
  },
  "docs": [
    {
      "_source": {
        "message": "I am a dot..."
      }
    }
  ]
}


{
  "docs": [
    {
      "doc": {
        "_type": "_type",
        "_index": "_index",
        "_id": "_id",
        "_source": {
          "message": "I am a dot#########"
        },
        "_ingest": {
          "timestamp": "2016-12-05T10:14:11.941+0000"
        }
      }
    }
  ]
}

So, let's see what is different with your setup. Do you use Kibana dev-tools to test this out? How do you send the data to Elasticsearch?

--Alex


#5

I saved the data in a .json file and sent it using the following command:

curl -XPUT localhost:9200/_ingest/pipeline/pipeline-name -d@pipeline.json


(Alexander Reelsen) #6

Hey,

can you please provide a fully reproducible example instead of snippets?

$ cat /tmp/pipeline-simulate.json
{
  "pipeline": {
    "description": "describe pipeline",
    "processors": [
      {
        "gsub": {
          "field": "message",
          "pattern": "\\.",
          "replacement": "###"
        }
      }
    ]
  },
  "docs": [
    {
      "_source": {
        "message": "I am a dot..."
      }
    }
  ]
}

$ /usr/bin/curl -X POST localhost:9200/_ingest/pipeline/_simulate -d @/tmp/pipeline-simulate.json
{"docs":[{"doc":{"_type":"_type","_index":"_index","_id":"_id","_source":{"message":"I am a dot#########"},"_ingest":{"timestamp":"2016-12-05T10:24:51.209+0000"}}}]}

--Alex


#7
{
	"description" : "Filter for logs",
	"processors": [
		{
			"gsub": {
					"field": "message",
					"pattern": "\\[ | \\]",
					"replacement": "###"
			}
		},

		{
			"grok": {
	    		"field": "message",
	    		"patterns": [
	    			"(?m)[%{DATA:loglevel}] %{TIMESTAMP_ISO8601:logTime} [%{DATA:handler}] - %{WORD:flow} message:\n<%{DATA}><%{DATA}>%{WORD:transactionId}<%{DATA}><%{DATA}><%{DATA}><%{DATA}><request>\"OPCODE=%{NUMBER:opCode}::CALLER=%{DATA}::RACE=%{DATA:raceId}::CLUBMEETING=%{DATA:clubMeeting}::FIXTUREID=%{DATA:fixtureId}::FIXTUREDATE=%{NUMBER:fixtureDate}::STATUS=%{NUMBER:status}::TIMESTAMP=%{TIME:requestTime}::PROTEST=%{NUMBER}\"</request></(cus|ns2):%{WORD:requestType}>%{GREEDYDATA}",
	    			"(?m)[%{DATA:loglevel}] %{TIMESTAMP_ISO8601:logTime} [%{DATA:handler}] - %{WORD:flow} message:\n<%{DATA}><%{DATA}><%{DATA}><%{DATA}><return>%{DATA:content}</return></(cus|ns2):%{WORD:requestType}>%{GREEDYDATA}",
	    			"(?m)[%{DATA:loglevel}] %{TIMESTAMP_ISO8601:logTime} [%{DATA:handler}] - %{GREEDYDATA:errorMsg}"
	    		]
	  		}
		},
		{
			"date": {
				"field": "logTime",
				"formats": ["ISO8601"]
			}
		},
		{
			"date_index_name" : {
	        	"field" : "@timestamp",
	        	"index_name_prefix" : "index-prefix-",
	        	"date_rounding" : "M"
	    	}
  		}
	],
	"on_failure" : [
    	{
      		"set" : {
        		"field" : "_index",
        		"value" : "failed-{{ _index }}"
      		}
    	}
    ]
}

My main objective is actually to obtain values within square brackets (as you can see from my previous topic, which you have also replied in). As no one replied during that time, I thought a workaround was to replace the brackets with something else, but it seems like the same problem persists in the gsub processor.


(Alexander Reelsen) #8

Hey,

once I fixed the broken JSON (the array of the grok patterns ends with a comma), this worked for me as well

/usr/bin/curl -XPUT localhost:9200/_ingest/pipeline/pipeline-name -d@pipeline.json
{"acknowledged":true}

I am using Elasticsearch 5.0.3 btw

--Alex


#9

Yes, I also realised that I had an extra comma after I replied to you. I have edited my previous reply.

But I still can't get it to work as intended. This is what I got in Elasticsearch:

{
"_index": "failed-filebeat-2016.12.05",
"_type": "log",
"_id": "AVjOklvjjRULAGPVrY91",
"_version": 1,
"_score": 1,
"_source": {
"@timestamp": "2016-12-05T10:37:43.542Z",
"offset": 118,
"beat": {
"hostname": "CPX-I54LIOPVP3L",
"name": "CPX-I54LIOPVP3L",
"version": "5.0.1"
},
"input_type": "log",
"source": "<path of log file>",
"message": "[ERROR] 2016-11-20 09:47:03.059 [Handler] - Error msg",
"type": "log"
}
}

Seems like it did not replace my square brackets with ###.

P.S. I am using elasticsearch and filebeat 5.0.1


(Alexander Reelsen) #10

Hey,

your example document does not use an ISO8601 date, it seems.

POST _ingest/pipeline/_simulate
{
  "pipeline": {
    "description": "Filter for logs",
    "processors": [
      {
        "gsub": {
          "field": "message",
          "pattern": "\\[ | \\]",
          "replacement": "###"
        }
      },
      {
        "grok": {
          "field": "message",
          "patterns": [
            "(?m)\\[%{DATA:loglevel}\\] %{TIMESTAMP_ISO8601:logTime} \\[%{DATA:handler}\\] - %{GREEDYDATA:errorMsg}"
          ]
        }
      },
      {
        "date": {
          "field": "logTime",
          "formats": [
            "ISO8601"
          ]
        }
      },
      {
        "date_index_name": {
          "field": "@timestamp",
          "index_name_prefix": "index-prefix-",
          "date_rounding": "M"
        }
      }
    ],
    "on_failure": [
      {
        "set": {
          "field": "_index",
          "value": "failed-{{ _index }}"
        }
      }
    ]
  },
  "docs": [
    {
      "_source": {
        "message": "[ERROR] 2016-11-20 09:47:03.059 [Handler] - Error msg"
      }
    }
  ]
}

#11

That's odd, are you sure it does not use iso8601? Cause I am able to get an output in the grok debugger.


(Alexander Reelsen) #12

I think omitting the T is a special case, where both parties have to agree on... can you test with providing the T or change the dateformat?


#13

I was actually previously using Logstash to filter the documents sent by Filebeat. This worked fine in Logstash, but I'll test it out with your suggestions again.


#14

I am still getting failures, though a small number of documents actually passed.

This is one such failure

{
"_index": "failed-filebeat-2016.12.06",
"_type": "log",
"_id": "AVjSAWgR6UaCJE7zkPWr",
"_version": 1,
"_score": 1,
"_source": {
"@timestamp": "2016-12-06T02:38:42.154Z",
"offset": 354,
"beat": {
"hostname": "CPX-I54LIOPVP3L",
"name": "CPX-I54LIOPVP3L",
"version": "5.0.1"
},
"input_type": "log",
"source": "<source>",
"message": "[ERROR] 2016-11-20 09:47:03.062 [Handler] - Unable to get Transaction Id: java.util.NoSuchElementException",
"type": "log",
"errorMsg": "Unable to get Transaction Id: java.util.NoSuchElementException"
}
}

However, i seem to be getting some documents which passed too, an example:

{
"_index": "log-index-2016-11-01",
"_type": "log",
"_id": "AVjSTDL-eEOgLibY-E5L",
"_version": 1,
"_score": 1,
"_source": {
"handler": "Handler",
"offset": 472,
"input_type": "log",
"source": "",
"message": "[ERROR] 2016-11-20 09:47:03.063 [Handler] - Unable to get Transaction Id: java.util.NoSuchElementException",
"type": "log",
"logTime": "2016-11-20 09:47:03.063",
"errorMsg": "Unable to get Transaction Id: java.util.NoSuchElementException",
"@timestamp": "2016-11-20T09:47:03.063Z",
"loglevel": "ERROR",
"beat": {
"hostname": "CPX-I54LIOPVP3L",
"name": "CPX-I54LIOPVP3L",
"version": "5.0.1"
}
}
}

This is my .json file:

{
	"description" : "Filter for logs",
	"processors": [


		{
			"grok": {
	    		"field": "message",
	    		"patterns": [
	    			"(?m)\\[%{DATA:loglevel}\\] %{DATETIME:logTime} \\[%{DATA:handler}\\] - %{GREEDYDATA:errorMsg}"
	    		],
	    		"pattern_definitions" : {
		        	"DATETIME" : "%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}(?::?%{SECOND})"
		        }
	  		}
		},
		{
			"date": {
				"field": "logTime",
				"formats": ["yyyy-MM-dd hh:mm:ss.SSS"]
			}
		},
		{
			"date_index_name" : {
	        	"field" : "@timestamp",
	        	"index_name_prefix" : "log-index-",
	        	"date_rounding" : "M"
	    	}
  		}
	],
	"on_failure" : [
    	{
      		"set" : {
        		"field" : "_index",
        		"value" : "failed-{{ _index }}"
      		}
    	}
    ]
}

(system) #15

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.