The truncate token filter splits a character in the middle (it cuts a UTF-16 surrogate pair in half) when truncating the string "𝕕𝕣𝕖𝕒𝕞𝕔𝕒𝕥𝕔𝕙𝕖𝕣".
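Is it possible to use 32-bit characters (code points outside the Basic Multilingual Plane, which are encoded as two UTF-16 code units) in ES? Thank you.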
DELETE test
PUT test
{
"settings" : {
"analysis" : {
"analyzer" : {
"analyzer_1" : {
"tokenizer" : "standard",
"filter" : ["my_truncate"]
}
},
"filter" : {
"my_truncate" : {
"type" : "truncate",
"length" : 5
}
}
}
}
}
GET test/_analyze
{
"analyzer": "analyzer_1",
"text": "\ud835\udd55\ud835\udd63\ud835\udd56\ud835\udd52\ud835\udd5e\ud835\udd54\ud835\udd52\ud835\udd65\ud835\udd54\ud835\udd59\ud835\udd56\ud835\udd63"
}
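The response returns only 2.5 characters (five UTF-16 code units: two complete surrogate pairs followed by a lone high surrogate) instead of the expected 5 characters: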
{
"tokens":[{"token":"\uD835\uDD55\uD835\uDD63\uD835","start_offset":0,"end_offset":24,"type":"","position":0}]
}
GET /
{
"name" : "develop",
"cluster_name" : "elasticsearch",
"cluster_uuid" : "MY0ymU7BTnWBRRqUprZkHw",
"version" : {
"number" : "7.2.0",
"build_flavor" : "default",
"build_type" : "rpm",
"build_hash" : "508c38a",
"build_date" : "2019-06-20T15:54:18.811730Z",
"build_snapshot" : false,
"lucene_version" : "8.0.0",
"minimum_wire_compatibility_version" : "6.8.0",
"minimum_index_compatibility_version" : "6.0.0-beta1"
},
"tagline" : "You Know, for Search"
}