Elasticsearch version: 6.6.2
Issue: Text that is not listed in the configured synonyms is being replaced by a synonym token.
Steps to reproduce
Step 1: Create Index
PUT /test1
{
"settings": {
"index": {
"refresh_interval": "5m",
"number_of_shards": "1",
"analysis": {
"filter": {
"word_delimiter_with_number_filter": {
"split_on_numerics": "true",
"generate_word_parts": "true",
"preserve_original": "false",
"catenate_words": "true",
"generate_number_parts": "true",
"catenate_all": "false",
"split_on_case_change": "false",
"type": "word_delimiter",
"stem_english_possessive": "true"
},
"my_misspelled_synonym": {
"type": "synonym",
"synonyms": [ "a6x => xyz" ]
}
},
"analyzer": {
"my_analyzer": {
"filter": [
"word_delimiter_with_number_filter",
"my_misspelled_synonym"
],
"tokenizer": "whitespace"
}
}
},
"number_of_replicas": "0"
}
}
}
Step 2: Run the analyze query
POST /test1/_analyze
{
"analyzer": "my_analyzer",
"text": "a 6 x"
}
Output (not expected, since "a 6 x" is not one of the configured synonyms):
{
"tokens": [
{
"token": "xyz",
"start_offset": 0,
"end_offset": 5,
"type": "SYNONYM",
"position": 0
}
]
}
Step 3: Create the same index and run the same analyze query on Elasticsearch 2.3.4. The output is different.
Output with Elasticsearch 2.3.4 (expected):
{
"tokens": [
{
"token": "a",
"start_offset": 0,
"end_offset": 1,
"type": "word",
"position": 0
},
{
"token": "6",
"start_offset": 2,
"end_offset": 3,
"type": "word",
"position": 1
},
{
"token": "x",
"start_offset": 4,
"end_offset": 5,
"type": "word",
"position": 2
}
]
}