Hi @smhoeks,
Is there any reason you are not using the built-in `danish` analyzer? That would probably be the simplest option.
You can use the `_analyze` API to test your analyzers. For example:
PUT /sample-index
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0,
    "analysis": {
      "analyzer": {
        "std": {
          "tokenizer": "standard",
          "char_filter": "html_strip",
          "filter": [
            "standard",
            "elision",
            "asciifolding",
            "lowercase",
            "length"
          ]
        },
        "keyword": {
          "tokenizer": "keyword",
          "filter": [
            "asciifolding",
            "lowercase"
          ]
        },
        "keyword_prefix": {
          "tokenizer": "keyword",
          "filter": [
            "asciifolding",
            "lowercase",
            "edge_ngram_front"
          ]
        },
        "text_prefix": {
          "tokenizer": "standard",
          "char_filter": "html_strip",
          "filter": [
            "standard",
            "elision",
            "asciifolding",
            "lowercase",
            "edge_ngram_front"
          ]
        },
        "text_suffix": {
          "tokenizer": "standard",
          "char_filter": "html_strip",
          "filter": [
            "standard",
            "elision",
            "asciifolding",
            "lowercase",
            "edge_ngram_back"
          ]
        }
      },
      "filter": {
        "edge_ngram_front": {
          "type": "edgeNGram",
          "min_gram": 2,
          "max_gram": 10,
          "side": "front"
        },
        "edge_ngram_back": {
          "type": "edgeNGram",
          "min_gram": 2,
          "max_gram": 10,
          "side": "back"
        },
        "length": {
          "type": "length",
          "min": 1
        }
      }
    }
  }
}
POST /sample-index/_analyze
{
  "analyzer": "std",
  "text": "Nål"
}
produces:
{
  "tokens": [
    {
      "token": "nal",
      "start_offset": 0,
      "end_offset": 3,
      "type": "<ALPHANUM>",
      "position": 0
    }
  ]
}
but
POST /sample-index/_analyze
{
  "analyzer": "danish",
  "text": "Nål"
}
produces:
{
  "tokens": [
    {
      "token": "nål",
      "start_offset": 0,
      "end_offset": 3,
      "type": "<ALPHANUM>",
      "position": 0
    }
  ]
}
Daniel