Hi, I'm analyzing Korean keywords with the nori tokenizer and a synonym_graph filter, and I found that a field is duplicated in the _analyze response object.
Elasticsearch version: 7.1.1
Index settings
PUT /test-index
{
  "settings": {
    "analysis": {
      "filter": {
        "search_synonym": {
          "type": "synonym_graph",
          "synonyms": ["ㅇㅇㅇ => ㅇㅇㅇ,ㅁㅇㄹ,ㅇㄹㅇ"]
        }
      },
      "analyzer": {
        "search_synonym": {
          "filter": [
            "search_synonym"
          ],
          "type": "custom",
          "tokenizer": "nori_tokenizer"
        }
      }
    }
  }
}
Analyze Request
GET /test-index/_analyze
{
  "text": "ㅇㅇㅇ",
  "analyzer": "search_synonym",
  "explain": true
}
Analyze Response
{
  "detail": {
    "custom_analyzer": true,
    "tokenizer": {
      "name": "nori_tokenizer",
      "tokens": [
        {
          "token": "ㅇㅇㅇ",
          "start_offset": 0,
          "end_offset": 3,
          "type": "word",
          "position": 0,
          "bytes": "[e3 85 87 e3 85 87 e3 85 87]",
          "leftPOS": "UNKNOWN(Unknown)",
          "morphemes": null,
          "posType": "MORPHEME",
          "positionLength": 1,
          "reading": null,
          "rightPOS": "UNKNOWN(Unknown)",
          "termFrequency": 1
        }
      ]
    },
    "tokenfilters": [
      {
        "name": "search_synonym",
        "tokens": [
          {
            "token": "ㅇㅇㅇ",
            "start_offset": 0,
            "end_offset": 3,
            "type": "SYNONYM",
            "position": 0,
            "positionLength": 2,
            "bytes": "[e3 85 87 e3 85 87 e3 85 87]",
            "leftPOS": null,
            "morphemes": null,
            "posType": null,
            "positionLength": 2,
            "reading": null,
            "rightPOS": null,
            "termFrequency": 1
          },
          {
            "token": "ㅁ",
            "start_offset": 0,
            "end_offset": 3,
            "type": "SYNONYM",
            "position": 0,
            "bytes": "[e3 85 81]",
            "leftPOS": null,
            "morphemes": null,
            "posType": null,
            "positionLength": 1,
            "reading": null,
            "rightPOS": null,
            "termFrequency": 1
          },
          {
            "token": "ㅇㄹㅇ",
            "start_offset": 0,
            "end_offset": 3,
            "type": "SYNONYM",
            "position": 0,
            "positionLength": 2,
            "bytes": "[e3 85 87 e3 84 b9 e3 85 87]",
            "leftPOS": null,
            "morphemes": null,
            "posType": null,
            "positionLength": 2,
            "reading": null,
            "rightPOS": null,
            "termFrequency": 1
          },
          {
            "token": "ㅇㄹ",
            "start_offset": 0,
            "end_offset": 3,
            "type": "SYNONYM",
            "position": 1,
            "bytes": "[e3 85 87 e3 84 b9]",
            "leftPOS": null,
            "morphemes": null,
            "posType": null,
            "positionLength": 1,
            "reading": null,
            "rightPOS": null,
            "termFrequency": 1
          }
        ]
      }
    ]
  }
}
detail.tokenfilters[0].tokens[0] (and tokens[2]) contain a duplicated field: positionLength.
But if I add another field to the attributes parameter of the _analyze request, the duplicated field disappears.
Analyze Request
GET /test-index/_analyze
{
  "text": "ㅇㅇㅇ",
  "analyzer": "search_synonym",
  "explain": true,
  "attributes": ["rightPOS"] // restrict the output to another attribute
}
Response
{
  "detail": {
    "custom_analyzer": true,
    "tokenizer": {
      "name": "nori_tokenizer",
      "tokens": [
        {
          "token": "ㅇㅇㅇ",
          "start_offset": 0,
          "end_offset": 3,
          "type": "word",
          "position": 0,
          "rightPOS": "UNKNOWN(Unknown)"
        }
      ]
    },
    "tokenfilters": [
      {
        "name": "search_synonym",
        "tokens": [
          {
            "token": "ㅇㅇㅇ",
            "start_offset": 0,
            "end_offset": 3,
            "type": "SYNONYM",
            "position": 0,
            "positionLength": 2,
            "rightPOS": null
          },
          {
            "token": "ㅁ",
            "start_offset": 0,
            "end_offset": 3,
            "type": "SYNONYM",
            "position": 0,
            "rightPOS": null
          },
          {
            "token": "ㅇㄹㅇ",
            "start_offset": 0,
            "end_offset": 3,
            "type": "SYNONYM",
            "position": 0,
            "positionLength": 2,
            "rightPOS": null
          },
          {
            "token": "ㅇㄹ",
            "start_offset": 0,
            "end_offset": 3,
            "type": "SYNONYM",
            "position": 1,
            "rightPOS": null
          }
        ]
      }
    ]
  }
}
The JSON spec doesn't explicitly forbid duplicate keys, but unique keys are recommended, and a strict parser will reject the response above.
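To illustrate, here is a minimal Java sketch of how a strict Jackson parser reacts to a payload shaped like detail.tokenfilters[0].tokens[0]; I'm assuming the client's response parsing enables the same kind of duplicate detection.

import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.databind.ObjectMapper;

public class DuplicateFieldDemo {
    public static void main(String[] args) throws Exception {
        ObjectMapper mapper = new ObjectMapper();
        // Reject duplicate keys, as a strict parser would.
        mapper.enable(JsonParser.Feature.STRICT_DUPLICATE_DETECTION);

        // Same shape as the token above: "positionLength" appears twice.
        String json = "{\"positionLength\": 2, \"positionLength\": 2}";

        // Throws com.fasterxml.jackson.core.JsonParseException: Duplicate field 'positionLength'
        mapper.readTree(json);
    }
}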
I'm using the Elasticsearch REST high-level client (elasticsearch-rest-high-level-client), and this error occurs when I send the analyze request through it:
com.fasterxml.jackson.core.JsonParseException: Duplicate field 'positionLength'
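For reference, this is roughly how I issue the request with the high-level client. It's a minimal sketch: the client setup is omitted, and the index and analyzer names are the ones from the settings above.

import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.client.indices.AnalyzeRequest;
import org.elasticsearch.client.indices.AnalyzeResponse;

public class AnalyzeRepro {
    public static void analyze(RestHighLevelClient client) throws Exception {
        // Same request as the console example: analyzer "search_synonym" on "test-index".
        AnalyzeRequest request =
                AnalyzeRequest.withIndexAnalyzer("test-index", "search_synonym", "ㅇㅇㅇ");
        request.explain(true);
        // request.attributes("rightPOS"); // uncommenting this makes the call succeed

        // Fails while parsing the response with
        // com.fasterxml.jackson.core.JsonParseException: Duplicate field 'positionLength'
        AnalyzeResponse response = client.indices().analyze(request, RequestOptions.DEFAULT);
        System.out.println(response.detail());
    }
}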
Is this normal behavior, or is it a bug in the nori_tokenizer / synonym_graph combination?