Some Error in nori tokenizer

Hi, I'm using the Nori tokenizer in Elasticsearch (7.16.1) for analyzing Korean comments, and I think I found some errors in the Nori tokenizer.

input :

GET _analyze
{
  "tokenizer": "nori_tokenizer",
  "explain": "true",
  "text": "전체적으로 양이 적어서 배고픈 적이 있다"
}

output:

{
  "detail" : {
    "custom_analyzer" : true,
    "charfilters" : [ ],
    "tokenizer" : {
      "name" : "nori_tokenizer",
      "tokens" : [
        {
          "token" : "전체",
          "start_offset" : 0,
          "end_offset" : 2,
          "type" : "word",
          "position" : 0,
          "bytes" : "[ec a0 84 ec b2 b4]",
          "leftPOS" : "NNG(General Noun)",
          "morphemes" : null,
          "posType" : "MORPHEME",
          "positionLength" : 1,
          "reading" : null,
          "rightPOS" : "NNG(General Noun)",
          "termFrequency" : 1
        },
        {
          "token" : "적",
          "start_offset" : 2,
          "end_offset" : 3,
          "type" : "word",
          "position" : 1,
          "bytes" : "[ec a0 81]",
          "leftPOS" : "XSN(Noun Suffix)",
          "morphemes" : null,
          "posType" : "MORPHEME",
          "positionLength" : 1,
          "reading" : null,
          "rightPOS" : "XSN(Noun Suffix)",
          "termFrequency" : 1
        },
        {
          "token" : "으로",
          "start_offset" : 3,
          "end_offset" : 5,
          "type" : "word",
          "position" : 2,
          "bytes" : "[ec 9c bc eb a1 9c]",
          "leftPOS" : "J(Ending Particle)",
          "morphemes" : null,
          "posType" : "MORPHEME",
          "positionLength" : 1,
          "reading" : null,
          "rightPOS" : "J(Ending Particle)",
          "termFrequency" : 1
        },
        {
          "token" : "양",
          "start_offset" : 6,
          "end_offset" : 7,
          "type" : "word",
          "position" : 3,
          "bytes" : "[ec 96 91]",
          "leftPOS" : "NNG(General Noun)",
          "morphemes" : null,
          "posType" : "MORPHEME",
          "positionLength" : 1,
          "reading" : null,
          "rightPOS" : "NNG(General Noun)",
          "termFrequency" : 1
        },
        {
          "token" : "이",
          "start_offset" : 7,
          "end_offset" : 8,
          "type" : "word",
          "position" : 4,
          "bytes" : "[ec 9d b4]",
          "leftPOS" : "J(Ending Particle)",
          "morphemes" : null,
          "posType" : "MORPHEME",
          "positionLength" : 1,
          "reading" : null,
          "rightPOS" : "J(Ending Particle)",
          "termFrequency" : 1
        },
        {
          "token" : "적",
          "start_offset" : 9,
          "end_offset" : 10,
          "type" : "word",
          "position" : 5,
          "bytes" : "[ec a0 81]",
          "leftPOS" : "VA(Adjective)",
          "morphemes" : null,
          "posType" : "MORPHEME",
          "positionLength" : 1,
          "reading" : null,
          "rightPOS" : "VA(Adjective)",
          "termFrequency" : 1
        },
        {
          "token" : "어서",
          "start_offset" : 10,
          "end_offset" : 12,
          "type" : "word",
          "position" : 6,
          "bytes" : "[ec 96 b4 ec 84 9c]",
          "leftPOS" : "E(Verbal endings)",
          "morphemes" : null,
          "posType" : "MORPHEME",
          "positionLength" : 1,
          "reading" : null,
          "rightPOS" : "E(Verbal endings)",
          "termFrequency" : 1
        },
        {
          "token" : "배고프",
          "start_offset" : 13,
          "end_offset" : 16,
          "type" : "word",
          "position" : 7,
          "bytes" : "[eb b0 b0 ea b3 a0 ed 94 84]",
          "leftPOS" : "VA(Adjective)",
          "morphemes" : null,
          "posType" : "MORPHEME",
          "positionLength" : 1,
          "reading" : null,
          "rightPOS" : "VA(Adjective)",
          "termFrequency" : 1
        },
        {
          "token" : "ᆫ",
          "start_offset" : 13,
          "end_offset" : 16,
          "type" : "word",
          "position" : 8,
          "bytes" : "[e1 86 ab]",
          "leftPOS" : "E(Verbal endings)",
          "morphemes" : null,
          "posType" : "MORPHEME",
          "positionLength" : 1,
          "reading" : null,
          "rightPOS" : "E(Verbal endings)",
          "termFrequency" : 1
        },
        {
          "token" : "간",
          "start_offset" : 17,
          "end_offset" : 18,
          "type" : "word",
          "position" : 9,
          "bytes" : "[ea b0 84]",
          "leftPOS" : "NNB(Dependent noun)",
          "morphemes" : null,
          "posType" : "MORPHEME",
          "positionLength" : 1,
          "reading" : null,
          "rightPOS" : "NNB(Dependent noun)",
          "termFrequency" : 1
        },
        {
          "token" : "이",
          "start_offset" : 18,
          "end_offset" : 19,
          "type" : "word",
          "position" : 10,
          "bytes" : "[ec 9d b4]",
          "leftPOS" : "J(Ending Particle)",
          "morphemes" : null,
          "posType" : "MORPHEME",
          "positionLength" : 1,
          "reading" : null,
          "rightPOS" : "J(Ending Particle)",
          "termFrequency" : 1
        },
        {
          "token" : "있",
          "start_offset" : 20,
          "end_offset" : 21,
          "type" : "word",
          "position" : 11,
          "bytes" : "[ec 9e 88]",
          "leftPOS" : "VV(Verb)",
          "morphemes" : null,
          "posType" : "MORPHEME",
          "positionLength" : 1,
          "reading" : null,
          "rightPOS" : "VV(Verb)",
          "termFrequency" : 1
        },
        {
          "token" : "다",
          "start_offset" : 21,
          "end_offset" : 22,
          "type" : "word",
          "position" : 12,
          "bytes" : "[eb 8b a4]",
          "leftPOS" : "E(Verbal endings)",
          "morphemes" : null,
          "posType" : "MORPHEME",
          "positionLength" : 1,
          "reading" : null,
          "rightPOS" : "E(Verbal endings)",
          "termFrequency" : 1
        }
      ]
    },
    "tokenfilters" : [ ]
  }
}

image
There are three instances of '적' (red boxes) in the input (image above), and they are tokenized as shown in the image below.

The last '적' is tokenized as '간', which is a different word.

So I'm wondering whether there's some reason '적' is changed to '간', or if it's just an error.

thanks in advance

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.