I'm trying to write an Elasticsearch ingest pipeline that strips HTML from a field with the `html_strip` processor. I can't get it to work, and it's so simple that I don't know what could be wrong.
GET /_ingest/pipeline/simple
{
"simple": {
"processors": [
{
"html_strip": {
"field": "column"
}
}
]
}
}
POST /simple/_doc
{
"column": "LOREM<br/>IPSUM"
}
[...The request succeeds and the document is indexed...]
GET simple/_search
{
"query": {
"match_all": {}
}
}
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1,
"hits": [
{
"_index": "simple",
"_id": "nHpm8IIBBoxb0mlM5UtC",
"_score": 1,
"_source": {
"column": "LOREM<br/>IPSUM"
}
}
]
}
}
But if I run the same document through the simulate API, the HTML is stripped as expected:
GET /_ingest/pipeline/simple/_simulate
{
"docs": [
{
"_source": {
"column": "LOREM<br/>IPSUM"
}
}
]
}
{
"docs": [
{
"doc": {
"_index": "_index",
"_id": "_id",
"_version": "-3",
"_source": {
"column": """LOREM
IPSUM"""
},
"_ingest": {
"timestamp": "2022-08-30T20:18:59.183445298Z"
}
}
}
]
}
What am I doing wrong?