I'm currently using the thai tokenizer and want to preserve hashtag words, but the tokenizer keeps removing the hashtag symbol.
ES version: 7.16.2
GET /_analyze
{
"tokenizer": "thai",
"text": "#รายการพ #hashtag"
}
Response
{
"tokens": [
{
"token": "รายการ",
"start_offset": 1,
"end_offset": 7,
"type": "word",
"position": 0
},
{
"token": "พ",
"start_offset": 7,
"end_offset": 8,
"type": "word",
"position": 1
},
{
"token": "hashtag",
"start_offset": 10,
"end_offset": 17,
"type": "word",
"position": 2
}
]
}
I expect it to be like
{
"tokens": [
{
"token": "#รายการพ",
"start_offset": 0,
"end_offset": 8,
"type": "word",
"position": 0
},
{
"token": "#hashtag",
"start_offset": 9,
"end_offset": 17,
"type": "word",
"position": 1
}
]
}
I tried using a char_filter to replace each hashtag symbol with a placeholder before tokenization, and then a token filter to replace the placeholder back with the hashtag symbol. But the result is not what I expected.
{
"tokenizer": "thai",
"filter": [
{
"pattern": "hashtagplaceholder([^\\s*]+)",
"type": "pattern_replace",
"replacement": "#$1"
}
],
"char_filter": [
{
"pattern": "#([^\\s*]+)",
"type": "pattern_replace",
"replacement": "hashtagplaceholder$1"
}
],
"text": "#รายการพ #hashtag"
}
Response:
{
"tokens": [
{
"token": "hashtagplaceholder",
"start_offset": 0,
"end_offset": 7,
"type": "word",
"position": 0
},
{
"token": "รายการ",
"start_offset": 7,
"end_offset": 7,
"type": "word",
"position": 1
},
{
"token": "พ",
"start_offset": 7,
"end_offset": 8,
"type": "word",
"position": 2
},
{
"token": "#hashtag",
"start_offset": 9,
"end_offset": 17,
"type": "word",
"position": 3
}
]
}