Please teach me how to search for substrings with an Elasticsearch query

t_rin · May 14, 2025, 5:21am

DELETE content-search-test

PUT content-search-test
{
    "mappings": {
        "properties": {
            "content": {
                "type": "text",
                "analyzer": "kuromoji_analyzer",
                "fields": {
                    "ngram": {"type": "text", "analyzer": "ngram_analyzer"
                    }
                }
            }
        }
    },
    "settings": {
      "index.max_ngram_diff": 2,
      "analysis": {
          "analyzer": {
              "kuromoji_analyzer": {
                  "type": "custom",
                  "tokenizer": "kuromoji_tokenizer",
                  "char_filter": ["icu_normalizer"]
              },
              "ngram_analyzer": {
                  "type": "custom",
                  "tokenizer": "ngram_tokenizer",
                  "filter": ["lowercase"],
                  "char_filter": ["icu_normalizer"]
              }
          },
              "tokenizer": {
                "ngram_tokenizer": {
                    "type": "ngram",
                    "min_gram": 1,
                    "max_gram": 3,
                    "token_chars": ["letter", "digit"]
                }
              }
          }
      }
  
}

POST content-search-test/_analyze
{
  "analyzer": "kuromoji_analyzer",
  "text": "ウインウイリアムズのスキヤンダラスな回顧録"
}
POST content-search-test/_analyze
{
  "analyzer": "ngram_analyzer",
  "text": "ウインウイリアムズのスキヤンダラスな回顧録"
}


POST _bulk
{"index":{"_index":"content-search-test"}}
{"content":""}
{"index":{"_index":"content-search-test"}}
{"content":"ウインウイリアムズのスキヤンダラスな回顧録"}
{"index":{"_index":"content-search-test"}}
{"content":"ウイリキヤンダラスな回顧録"}


// Returns 0 hits...
// Expected to hit: ウインウイリアムズのスキヤンダラスな回顧録
POST content-search-test/_search
{
  "query": {
    "match_phrase": {
      "content.ngram": "ウインウイリア"
    }
  }
}
// Returns 0 hits...
// Expected to hit: ウインウイリアムズのスキヤンダラスな回顧録
POST content-search-test/_search
{
  "query": {
    "match_phrase": {
      "content.ngram": "イン"
    }
  }
}

t_rin · May 14, 2025, 5:56am

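With 1-2 grams (min_gram 1, max_gram 2): OK, the searches hit.
With 1-3 grams (min_gram 1, max_gram 3): NG, the searches return 0 hits.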
[context]
2-3gram would have been enough, but I also need to search for a single character.
I thought 1-2gram would be slow, so I used 1-3gram.
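However, match_phrase probably cannot be used with 1-3gram.
(Is it because the positions are not consecutive? For example, the query "イン" is analyzed into イ, イン, ン at consecutive positions, but in the indexed text the token インウ (position 5) sits between イン (position 4) and ン (position 6).)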

Analyzer output with 1-3 grams:

POST content-search-test/_analyze
{
  "analyzer": "ngram_analyzer",
  "text": "ウインウイリアムズのスキヤンダラスな回顧録"
}
{
  "tokens": [
    {
      "token": "ウ",
      "start_offset": 0,
      "end_offset": 1,
      "type": "word",
      "position": 0
    },
    {
      "token": "ウイ",
      "start_offset": 0,
      "end_offset": 2,
      "type": "word",
      "position": 1
    },
    {
      "token": "ウイン",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 2
    },
    {
      "token": "イ",
      "start_offset": 1,
      "end_offset": 2,
      "type": "word",
      "position": 3
    },
    {
      "token": "イン",
      "start_offset": 1,
      "end_offset": 3,
      "type": "word",
      "position": 4
    },
    {
      "token": "インウ",
      "start_offset": 1,
      "end_offset": 4,
      "type": "word",
      "position": 5
    },
    {
      "token": "ン",
      "start_offset": 2,
      "end_offset": 3,
      "type": "word",
      "position": 6
    },
    {
      "token": "ンウ",
      "start_offset": 2,
      "end_offset": 4,
      "type": "word",
      "position": 7
    },
    {
      "token": "ンウイ",
      "start_offset": 2,
      "end_offset": 5,
      "type": "word",
      "position": 8
    },
    {
      "token": "ウ",
      "start_offset": 3,
      "end_offset": 4,
      "type": "word",
      "position": 9
    },
    {
      "token": "ウイ",
      "start_offset": 3,
      "end_offset": 5,
      "type": "word",
      "position": 10
    },
    {
      "token": "ウイリ",
      "start_offset": 3,
      "end_offset": 6,
      "type": "word",
      "position": 11
    },
    {
      "token": "イ",
      "start_offset": 4,
      "end_offset": 5,
      "type": "word",
      "position": 12
    },
    {
      "token": "イリ",
      "start_offset": 4,
      "end_offset": 6,
      "type": "word",
      "position": 13
    },
    {
      "token": "イリア",
      "start_offset": 4,
      "end_offset": 7,
      "type": "word",
      "position": 14
    },
    {
      "token": "リ",
      "start_offset": 5,
      "end_offset": 6,
      "type": "word",
      "position": 15
    },
    {
      "token": "リア",
      "start_offset": 5,
      "end_offset": 7,
      "type": "word",
      "position": 16
    },
    {
      "token": "リアム",
      "start_offset": 5,
      "end_offset": 8,
      "type": "word",
      "position": 17
    },
    {
      "token": "ア",
      "start_offset": 6,
      "end_offset": 7,
      "type": "word",
      "position": 18
    },
    {
      "token": "アム",
      "start_offset": 6,
      "end_offset": 8,
      "type": "word",
      "position": 19
    },
    {
      "token": "アムズ",
      "start_offset": 6,
      "end_offset": 9,
      "type": "word",
      "position": 20
    },
    {
      "token": "ム",
      "start_offset": 7,
      "end_offset": 8,
      "type": "word",
      "position": 21
    },
    {
      "token": "ムズ",
      "start_offset": 7,
      "end_offset": 9,
      "type": "word",
      "position": 22
    },
    {
      "token": "ムズの",
      "start_offset": 7,
      "end_offset": 10,
      "type": "word",
      "position": 23
    },
    {
      "token": "ズ",
      "start_offset": 8,
      "end_offset": 9,
      "type": "word",
      "position": 24
    },
    {
      "token": "ズの",
      "start_offset": 8,
      "end_offset": 10,
      "type": "word",
      "position": 25
    },
    {
      "token": "ズのス",
      "start_offset": 8,
      "end_offset": 11,
      "type": "word",
      "position": 26
    },
    {
      "token": "の",
      "start_offset": 9,
      "end_offset": 10,
      "type": "word",
      "position": 27
    },
    {
      "token": "のス",
      "start_offset": 9,
      "end_offset": 11,
      "type": "word",
      "position": 28
    },
    {
      "token": "のスキ",
      "start_offset": 9,
      "end_offset": 12,
      "type": "word",
      "position": 29
    },
    {
      "token": "ス",
      "start_offset": 10,
      "end_offset": 11,
      "type": "word",
      "position": 30
    },
    {
      "token": "スキ",
      "start_offset": 10,
      "end_offset": 12,
      "type": "word",
      "position": 31
    },
    {
      "token": "スキヤ",
      "start_offset": 10,
      "end_offset": 13,
      "type": "word",
      "position": 32
    },
    {
      "token": "キ",
      "start_offset": 11,
      "end_offset": 12,
      "type": "word",
      "position": 33
    },
    {
      "token": "キヤ",
      "start_offset": 11,
      "end_offset": 13,
      "type": "word",
      "position": 34
    },
    {
      "token": "キヤン",
      "start_offset": 11,
      "end_offset": 14,
      "type": "word",
      "position": 35
    },
    {
      "token": "ヤ",
      "start_offset": 12,
      "end_offset": 13,
      "type": "word",
      "position": 36
    },
    {
      "token": "ヤン",
      "start_offset": 12,
      "end_offset": 14,
      "type": "word",
      "position": 37
    },
    {
      "token": "ヤンダ",
      "start_offset": 12,
      "end_offset": 15,
      "type": "word",
      "position": 38
    },
    {
      "token": "ン",
      "start_offset": 13,
      "end_offset": 14,
      "type": "word",
      "position": 39
    },
    {
      "token": "ンダ",
      "start_offset": 13,
      "end_offset": 15,
      "type": "word",
      "position": 40
    },
    {
      "token": "ンダラ",
      "start_offset": 13,
      "end_offset": 16,
      "type": "word",
      "position": 41
    },
    {
      "token": "ダ",
      "start_offset": 14,
      "end_offset": 15,
      "type": "word",
      "position": 42
    },
    {
      "token": "ダラ",
      "start_offset": 14,
      "end_offset": 16,
      "type": "word",
      "position": 43
    },
    {
      "token": "ダラス",
      "start_offset": 14,
      "end_offset": 17,
      "type": "word",
      "position": 44
    },
    {
      "token": "ラ",
      "start_offset": 15,
      "end_offset": 16,
      "type": "word",
      "position": 45
    },
    {
      "token": "ラス",
      "start_offset": 15,
      "end_offset": 17,
      "type": "word",
      "position": 46
    },
    {
      "token": "ラスな",
      "start_offset": 15,
      "end_offset": 18,
      "type": "word",
      "position": 47
    },
    {
      "token": "ス",
      "start_offset": 16,
      "end_offset": 17,
      "type": "word",
      "position": 48
    },
    {
      "token": "スな",
      "start_offset": 16,
      "end_offset": 18,
      "type": "word",
      "position": 49
    },
    {
      "token": "スな回",
      "start_offset": 16,
      "end_offset": 19,
      "type": "word",
      "position": 50
    },
    {
      "token": "な",
      "start_offset": 17,
      "end_offset": 18,
      "type": "word",
      "position": 51
    },
    {
      "token": "な回",
      "start_offset": 17,
      "end_offset": 19,
      "type": "word",
      "position": 52
    },
    {
      "token": "な回顧",
      "start_offset": 17,
      "end_offset": 20,
      "type": "word",
      "position": 53
    },
    {
      "token": "回",
      "start_offset": 18,
      "end_offset": 19,
      "type": "word",
      "position": 54
    },
    {
      "token": "回顧",
      "start_offset": 18,
      "end_offset": 20,
      "type": "word",
      "position": 55
    },
    {
      "token": "回顧録",
      "start_offset": 18,
      "end_offset": 21,
      "type": "word",
      "position": 56
    },
    {
      "token": "顧",
      "start_offset": 19,
      "end_offset": 20,
      "type": "word",
      "position": 57
    },
    {
      "token": "顧録",
      "start_offset": 19,
      "end_offset": 21,
      "type": "word",
      "position": 58
    },
    {
      "token": "録",
      "start_offset": 20,
      "end_offset": 21,
      "type": "word",
      "position": 59
    }
  ]
}

Analyzer output with 1-2 grams:

{
  "tokens": [
    {
      "token": "ウ",
      "start_offset": 0,
      "end_offset": 1,
      "type": "word",
      "position": 0
    },
    {
      "token": "ウイ",
      "start_offset": 0,
      "end_offset": 2,
      "type": "word",
      "position": 1
    },
    {
      "token": "イ",
      "start_offset": 1,
      "end_offset": 2,
      "type": "word",
      "position": 2
    },
    {
      "token": "イン",
      "start_offset": 1,
      "end_offset": 3,
      "type": "word",
      "position": 3
    },
    {
      "token": "ン",
      "start_offset": 2,
      "end_offset": 3,
      "type": "word",
      "position": 4
    },
    {
      "token": "ンウ",
      "start_offset": 2,
      "end_offset": 4,
      "type": "word",
      "position": 5
    },
    {
      "token": "ウ",
      "start_offset": 3,
      "end_offset": 4,
      "type": "word",
      "position": 6
    },
    {
      "token": "ウイ",
      "start_offset": 3,
      "end_offset": 5,
      "type": "word",
      "position": 7
    },
    {
      "token": "イ",
      "start_offset": 4,
      "end_offset": 5,
      "type": "word",
      "position": 8
    },
    {
      "token": "イリ",
      "start_offset": 4,
      "end_offset": 6,
      "type": "word",
      "position": 9
    },
    {
      "token": "リ",
      "start_offset": 5,
      "end_offset": 6,
      "type": "word",
      "position": 10
    },
    {
      "token": "リア",
      "start_offset": 5,
      "end_offset": 7,
      "type": "word",
      "position": 11
    },
    {
      "token": "ア",
      "start_offset": 6,
      "end_offset": 7,
      "type": "word",
      "position": 12
    },
    {
      "token": "アム",
      "start_offset": 6,
      "end_offset": 8,
      "type": "word",
      "position": 13
    },
    {
      "token": "ム",
      "start_offset": 7,
      "end_offset": 8,
      "type": "word",
      "position": 14
    },
    {
      "token": "ムズ",
      "start_offset": 7,
      "end_offset": 9,
      "type": "word",
      "position": 15
    },
    {
      "token": "ズ",
      "start_offset": 8,
      "end_offset": 9,
      "type": "word",
      "position": 16
    },
    {
      "token": "ズの",
      "start_offset": 8,
      "end_offset": 10,
      "type": "word",
      "position": 17
    },
    {
      "token": "の",
      "start_offset": 9,
      "end_offset": 10,
      "type": "word",
      "position": 18
    },
    {
      "token": "のス",
      "start_offset": 9,
      "end_offset": 11,
      "type": "word",
      "position": 19
    },
    {
      "token": "ス",
      "start_offset": 10,
      "end_offset": 11,
      "type": "word",
      "position": 20
    },
    {
      "token": "スキ",
      "start_offset": 10,
      "end_offset": 12,
      "type": "word",
      "position": 21
    },
    {
      "token": "キ",
      "start_offset": 11,
      "end_offset": 12,
      "type": "word",
      "position": 22
    },
    {
      "token": "キヤ",
      "start_offset": 11,
      "end_offset": 13,
      "type": "word",
      "position": 23
    },
    {
      "token": "ヤ",
      "start_offset": 12,
      "end_offset": 13,
      "type": "word",
      "position": 24
    },
    {
      "token": "ヤン",
      "start_offset": 12,
      "end_offset": 14,
      "type": "word",
      "position": 25
    },
    {
      "token": "ン",
      "start_offset": 13,
      "end_offset": 14,
      "type": "word",
      "position": 26
    },
    {
      "token": "ンダ",
      "start_offset": 13,
      "end_offset": 15,
      "type": "word",
      "position": 27
    },
    {
      "token": "ダ",
      "start_offset": 14,
      "end_offset": 15,
      "type": "word",
      "position": 28
    },
    {
      "token": "ダラ",
      "start_offset": 14,
      "end_offset": 16,
      "type": "word",
      "position": 29
    },
    {
      "token": "ラ",
      "start_offset": 15,
      "end_offset": 16,
      "type": "word",
      "position": 30
    },
    {
      "token": "ラス",
      "start_offset": 15,
      "end_offset": 17,
      "type": "word",
      "position": 31
    },
    {
      "token": "ス",
      "start_offset": 16,
      "end_offset": 17,
      "type": "word",
      "position": 32
    },
    {
      "token": "スな",
      "start_offset": 16,
      "end_offset": 18,
      "type": "word",
      "position": 33
    },
    {
      "token": "な",
      "start_offset": 17,
      "end_offset": 18,
      "type": "word",
      "position": 34
    },
    {
      "token": "な回",
      "start_offset": 17,
      "end_offset": 19,
      "type": "word",
      "position": 35
    },
    {
      "token": "回",
      "start_offset": 18,
      "end_offset": 19,
      "type": "word",
      "position": 36
    },
    {
      "token": "回顧",
      "start_offset": 18,
      "end_offset": 20,
      "type": "word",
      "position": 37
    },
    {
      "token": "顧",
      "start_offset": 19,
      "end_offset": 20,
      "type": "word",
      "position": 38
    },
    {
      "token": "顧録",
      "start_offset": 19,
      "end_offset": 21,
      "type": "word",
      "position": 39
    },
    {
      "token": "録",
      "start_offset": 20,
      "end_offset": 21,
      "type": "word",
      "position": 40
    }
  ]
}

t_rin · May 14, 2025, 5:59am

(I achieved my goal by using 1-2 grams, but I would like some advice on whether I can search more efficiently, or faster, using 1-3 grams.)

Topic		Replies	Views
Better effective substring query idea? Elasticsearch	13	1529	July 6, 2017
NGram search troubles(Is it possible to match entire string) Elasticsearch	5	3560	September 4, 2019
Substring search Elasticsearch	2	464	July 6, 2017
Term matching with elastic search edge n gram Elasticsearch	8	1901	March 7, 2017
Elastic search : EdgeGram, prefix, suffix Elasticsearch	2	2154	July 6, 2017

Please teach me how to search for substrings with an Elasticsearch query

Related topics