Now it is just a regex exercise. I am not a regex expert, but a quick Google search turned up this:
# Simulate a one-processor pipeline against a sample document.
# The gsub processor replaces every run of one or more newline characters in
# `message` with a single newline. The pattern and replacement are written as a
# plain JSON "\n" escape (a literal newline character), which is equivalent to
# the backslash-escaped newline form but easier to read.
# If gsub fails, the on_failure handler appends an error description to
# `_ingestion_errors`; the processor type and tag in that text are taken from
# the failure metadata, so they always name the processor that actually failed.
POST _ingest/pipeline/_simulate
{
  "pipeline": {
    "processors": [
      {
        "gsub": {
          "field": "message",
          "pattern": "\n+",
          "replacement": "\n",
          "ignore_missing": false,
          "tag": "replace_multiple_newlines",
          "description": "Replace multiple newlines",
          "on_failure": [
            {
              "append": {
                "description": "Record error information",
                "field": "_ingestion_errors",
                "value": "Processor '{{ _ingest.on_failure_processor_type }}' with tag '{{ _ingest.on_failure_processor_tag }}' in pipeline '{{ _ingest.on_failure_pipeline }}' failed with message '{{ _ingest.on_failure_message }}'"
              }
            }
          ]
        }
      }
    ]
  },
  "docs": [
    {
      "_source": {
        "message": """String with
1 newline
Then 2 newlines
Then 3 Newlines
The End """
      }
    }
  ]
}
# Result
# Response to the simulate request above: one entry in "docs" per input
# document, with "_source.message" holding the value after the pipeline ran.
# NOTE(review): as shown here the sample input and this output look identical;
# presumably the consecutive blank lines of the original sample were lost when
# the post was formatted - confirm against a real run.
{
"docs": [
{
"doc": {
"_index": "_index",
"_version": "-3",
"_id": "_id",
"_source": {
"message": """String with
1 newline
Then 2 newlines
Then 3 Newlines
The End """
},
"_ingest": {
"timestamp": "2023-12-29T16:58:37.484845559Z"
}
}
}
]
}