Hi, I'm analyzing Korean keywords with the nori tokenizer and a synonym_graph filter, and I found that a field is duplicated in the _analyze response object.
Elasticsearch version: 7.1.1
Index settings
PUT /test-index
{
  "settings": {
    "analysis": {
      "filter": {
        "search_synonym": {
          "type": "synonym_graph",
          "synonyms": ["ㅇㅇㅇ => ㅇㅇㅇ,ㅁㅇㄹ,ㅇㄹㅇ"]
        }
      },
      "analyzer": {
        "search_synonym": {
          "filter": [
            "search_synonym"
          ],
          "type": "custom",
          "tokenizer": "nori_tokenizer"
        }
      }
    }
  }
}
Analyze Request
GET /test-index/_analyze
{
  "text": "ㅇㅇㅇ",
  "analyzer": "search_synonym",
  "explain": true
}
Analyze Response
{
  "detail": {
    "custom_analyzer": true,
    "tokenizer": {
      "name": "nori_tokenizer",
      "tokens": [
        {
          "token": "ㅇㅇㅇ",
          "start_offset": 0,
          "end_offset": 3,
          "type": "word",
          "position": 0,
          "bytes": "[e3 85 87 e3 85 87 e3 85 87]",
          "leftPOS": "UNKNOWN(Unknown)",
          "morphemes": null,
          "posType": "MORPHEME",
          "positionLength": 1,
          "reading": null,
          "rightPOS": "UNKNOWN(Unknown)",
          "termFrequency": 1
        }
      ]
    },
    "tokenfilters": [
      {
        "name": "search_synonym",
        "tokens": [
          {
            "token": "ㅇㅇㅇ",
            "start_offset": 0,
            "end_offset": 3,
            "type": "SYNONYM",
            "position": 0,
            "positionLength": 2,
            "bytes": "[e3 85 87 e3 85 87 e3 85 87]",
            "leftPOS": null,
            "morphemes": null,
            "posType": null,
            "positionLength": 2,
            "reading": null,
            "rightPOS": null,
            "termFrequency": 1
          },
          {
            "token": "ㅁ",
            "start_offset": 0,
            "end_offset": 3,
            "type": "SYNONYM",
            "position": 0,
            "bytes": "[e3 85 81]",
            "leftPOS": null,
            "morphemes": null,
            "posType": null,
            "positionLength": 1,
            "reading": null,
            "rightPOS": null,
            "termFrequency": 1
          },
          {
            "token": "ㅇㄹㅇ",
            "start_offset": 0,
            "end_offset": 3,
            "type": "SYNONYM",
            "position": 0,
            "positionLength": 2,
            "bytes": "[e3 85 87 e3 84 b9 e3 85 87]",
            "leftPOS": null,
            "morphemes": null,
            "posType": null,
            "positionLength": 2,
            "reading": null,
            "rightPOS": null,
            "termFrequency": 1
          },
          {
            "token": "ㅇㄹ",
            "start_offset": 0,
            "end_offset": 3,
            "type": "SYNONYM",
            "position": 1,
            "bytes": "[e3 85 87 e3 84 b9]",
            "leftPOS": null,
            "morphemes": null,
            "posType": null,
            "positionLength": 1,
            "reading": null,
            "rightPOS": null,
            "termFrequency": 1
          }
        ]
      }
    ]
  }
}
detail.tokenfilters[0].tokens[0] (and tokens[2]) contain a duplicated field: positionLength.
But if I add another field to the attributes parameter of the _analyze request, the duplicated field disappears.
Analyze Request
GET /test-index/_analyze
{
  "text": "ㅇㅇㅇ",
  "analyzer": "search_synonym",
  "explain": true,
  "attributes": ["rightPOS"] // restrict the output to another attribute
}
Response
{
  "detail": {
    "custom_analyzer": true,
    "tokenizer": {
      "name": "nori_tokenizer",
      "tokens": [
        {
          "token": "ㅇㅇㅇ",
          "start_offset": 0,
          "end_offset": 3,
          "type": "word",
          "position": 0,
          "rightPOS": "UNKNOWN(Unknown)"
        }
      ]
    },
    "tokenfilters": [
      {
        "name": "search_synonym",
        "tokens": [
          {
            "token": "ㅇㅇㅇ",
            "start_offset": 0,
            "end_offset": 3,
            "type": "SYNONYM",
            "position": 0,
            "positionLength": 2,
            "rightPOS": null
          },
          {
            "token": "ㅁ",
            "start_offset": 0,
            "end_offset": 3,
            "type": "SYNONYM",
            "position": 0,
            "rightPOS": null
          },
          {
            "token": "ㅇㄹㅇ",
            "start_offset": 0,
            "end_offset": 3,
            "type": "SYNONYM",
            "position": 0,
            "positionLength": 2,
            "rightPOS": null
          },
          {
            "token": "ㅇㄹ",
            "start_offset": 0,
            "end_offset": 3,
            "type": "SYNONYM",
            "position": 1,
            "rightPOS": null
          }
        ]
      }
    ]
  }
}
The JSON spec doesn't explicitly forbid duplicate keys, but unique keys are recommended, and a strict parser will reject the response above.
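To illustrate, here is a minimal Java sketch of how a strict Jackson parser reacts to a payload shaped like detail.tokenfilters[0].tokens[0]; I'm assuming the client's response parsing enables the same kind of duplicate detection.

import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.databind.ObjectMapper;

public class DuplicateFieldDemo {
    public static void main(String[] args) throws Exception {
        ObjectMapper mapper = new ObjectMapper();
        // Reject duplicate keys, as a strict parser would.
        mapper.enable(JsonParser.Feature.STRICT_DUPLICATE_DETECTION);

        // Same shape as the token above: "positionLength" appears twice.
        String json = "{\"positionLength\": 2, \"positionLength\": 2}";

        // Throws com.fasterxml.jackson.core.JsonParseException: Duplicate field 'positionLength'
        mapper.readTree(json);
    }
}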
I'm using the Elasticsearch REST high-level client (elasticsearch-rest-high-level-client), and this error occurs when I send the analyze request through it:
com.fasterxml.jackson.core.JsonParseException: Duplicate field 'positionLength'
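For reference, this is roughly how I issue the request with the high-level client. It's a minimal sketch: the client setup is omitted, and the index and analyzer names are the ones from the settings above.

import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.client.indices.AnalyzeRequest;
import org.elasticsearch.client.indices.AnalyzeResponse;

public class AnalyzeRepro {
    public static void analyze(RestHighLevelClient client) throws Exception {
        // Same request as the console example: analyzer "search_synonym" on "test-index".
        AnalyzeRequest request =
                AnalyzeRequest.withIndexAnalyzer("test-index", "search_synonym", "ㅇㅇㅇ");
        request.explain(true);
        // request.attributes("rightPOS"); // uncommenting this makes the call succeed

        // Fails while parsing the response with
        // com.fasterxml.jackson.core.JsonParseException: Duplicate field 'positionLength'
        AnalyzeResponse response = client.indices().analyze(request, RequestOptions.DEFAULT);
        System.out.println(response.detail());
    }
}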
Is this normal behavior, or is it a bug in the nori_tokenizer / synonym_graph combination?