DELETE content-search-test
PUT content-search-test
{
"mappings": {
"properties": {
"content": {
"type": "text",
"analyzer": "kuromoji_analyzer",
"fields": {
"ngram": {"type": "text", "analyzer": "ngram_analyzer"
}
}
}
}
},
"settings": {
"index.max_ngram_diff": 2,
"analysis": {
"analyzer": {
"kuromoji_analyzer": {
"type": "custom",
"tokenizer": "kuromoji_tokenizer",
"char_filter": ["icu_normalizer"]
},
"ngram_analyzer": {
"type": "custom",
"tokenizer": "ngram_tokenizer",
"filter": ["lowercase"],
"char_filter": ["icu_normalizer"]
}
},
"tokenizer": {
"ngram_tokenizer": {
"type": "ngram",
"min_gram": 1,
"max_gram": 3,
"token_chars": ["letter", "digit"]
}
}
}
}
}
POST content-search-test/_analyze
{
"analyzer": "kuromoji_analyzer",
"text": "ウインウイリアムズのスキヤンダラスな回顧録"
}
POST content-search-test/_analyze
{
"analyzer": "ngram_analyzer",
"text": "ウインウイリアムズのスキヤンダラスな回顧録"
}
POST _bulk
{"index":{"_index":"content-search-test"}}
{"content":""}
{"index":{"_index":"content-search-test"}}
{"content":"ウインウイリアムズのスキヤンダラスな回顧録"}
{"index":{"_index":"content-search-test"}}
{"content":"ウイリキヤンダラスな回顧録"}
// 0 hits...
// must hits: ウインウイリアムズのスキヤンダラスな回顧録
POST content-search-test/_search
{
"query": {
"match_phrase": {
"content.ngram": "ウインウイリア"
}
}
}
// 0 hits...
// must hits: ウインウイリアムズのスキヤンダラスな回顧録
POST content-search-test/_search
{
"query": {
"match_phrase": {
"content.ngram": "イン"
}
}
}
1-2gram, OK
1-3gram, NG
[context]
2-3gram would have been enough, but I also need to search for a single character.
I thought 1-2gram would be slow, so I used 1-3gram.
However, match_pharse probably cannot be used with 1-3gram.
(Due to the positions are not consecutive?)
1-3
POST content-search-test/_analyze
{
"analyzer": "ngram_analyzer",
"text": "ウインウイリアムズのスキヤンダラスな回顧録"
}
{
"tokens": [
{
"token": "ウ",
"start_offset": 0,
"end_offset": 1,
"type": "word",
"position": 0
},
{
"token": "ウイ",
"start_offset": 0,
"end_offset": 2,
"type": "word",
"position": 1
},
{
"token": "ウイン",
"start_offset": 0,
"end_offset": 3,
"type": "word",
"position": 2
},
{
"token": "イ",
"start_offset": 1,
"end_offset": 2,
"type": "word",
"position": 3
},
{
"token": "イン",
"start_offset": 1,
"end_offset": 3,
"type": "word",
"position": 4
},
{
"token": "インウ",
"start_offset": 1,
"end_offset": 4,
"type": "word",
"position": 5
},
{
"token": "ン",
"start_offset": 2,
"end_offset": 3,
"type": "word",
"position": 6
},
{
"token": "ンウ",
"start_offset": 2,
"end_offset": 4,
"type": "word",
"position": 7
},
{
"token": "ンウイ",
"start_offset": 2,
"end_offset": 5,
"type": "word",
"position": 8
},
{
"token": "ウ",
"start_offset": 3,
"end_offset": 4,
"type": "word",
"position": 9
},
{
"token": "ウイ",
"start_offset": 3,
"end_offset": 5,
"type": "word",
"position": 10
},
{
"token": "ウイリ",
"start_offset": 3,
"end_offset": 6,
"type": "word",
"position": 11
},
{
"token": "イ",
"start_offset": 4,
"end_offset": 5,
"type": "word",
"position": 12
},
{
"token": "イリ",
"start_offset": 4,
"end_offset": 6,
"type": "word",
"position": 13
},
{
"token": "イリア",
"start_offset": 4,
"end_offset": 7,
"type": "word",
"position": 14
},
{
"token": "リ",
"start_offset": 5,
"end_offset": 6,
"type": "word",
"position": 15
},
{
"token": "リア",
"start_offset": 5,
"end_offset": 7,
"type": "word",
"position": 16
},
{
"token": "リアム",
"start_offset": 5,
"end_offset": 8,
"type": "word",
"position": 17
},
{
"token": "ア",
"start_offset": 6,
"end_offset": 7,
"type": "word",
"position": 18
},
{
"token": "アム",
"start_offset": 6,
"end_offset": 8,
"type": "word",
"position": 19
},
{
"token": "アムズ",
"start_offset": 6,
"end_offset": 9,
"type": "word",
"position": 20
},
{
"token": "ム",
"start_offset": 7,
"end_offset": 8,
"type": "word",
"position": 21
},
{
"token": "ムズ",
"start_offset": 7,
"end_offset": 9,
"type": "word",
"position": 22
},
{
"token": "ムズの",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 23
},
{
"token": "ズ",
"start_offset": 8,
"end_offset": 9,
"type": "word",
"position": 24
},
{
"token": "ズの",
"start_offset": 8,
"end_offset": 10,
"type": "word",
"position": 25
},
{
"token": "ズのス",
"start_offset": 8,
"end_offset": 11,
"type": "word",
"position": 26
},
{
"token": "の",
"start_offset": 9,
"end_offset": 10,
"type": "word",
"position": 27
},
{
"token": "のス",
"start_offset": 9,
"end_offset": 11,
"type": "word",
"position": 28
},
{
"token": "のスキ",
"start_offset": 9,
"end_offset": 12,
"type": "word",
"position": 29
},
{
"token": "ス",
"start_offset": 10,
"end_offset": 11,
"type": "word",
"position": 30
},
{
"token": "スキ",
"start_offset": 10,
"end_offset": 12,
"type": "word",
"position": 31
},
{
"token": "スキヤ",
"start_offset": 10,
"end_offset": 13,
"type": "word",
"position": 32
},
{
"token": "キ",
"start_offset": 11,
"end_offset": 12,
"type": "word",
"position": 33
},
{
"token": "キヤ",
"start_offset": 11,
"end_offset": 13,
"type": "word",
"position": 34
},
{
"token": "キヤン",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 35
},
{
"token": "ヤ",
"start_offset": 12,
"end_offset": 13,
"type": "word",
"position": 36
},
{
"token": "ヤン",
"start_offset": 12,
"end_offset": 14,
"type": "word",
"position": 37
},
{
"token": "ヤンダ",
"start_offset": 12,
"end_offset": 15,
"type": "word",
"position": 38
},
{
"token": "ン",
"start_offset": 13,
"end_offset": 14,
"type": "word",
"position": 39
},
{
"token": "ンダ",
"start_offset": 13,
"end_offset": 15,
"type": "word",
"position": 40
},
{
"token": "ンダラ",
"start_offset": 13,
"end_offset": 16,
"type": "word",
"position": 41
},
{
"token": "ダ",
"start_offset": 14,
"end_offset": 15,
"type": "word",
"position": 42
},
{
"token": "ダラ",
"start_offset": 14,
"end_offset": 16,
"type": "word",
"position": 43
},
{
"token": "ダラス",
"start_offset": 14,
"end_offset": 17,
"type": "word",
"position": 44
},
{
"token": "ラ",
"start_offset": 15,
"end_offset": 16,
"type": "word",
"position": 45
},
{
"token": "ラス",
"start_offset": 15,
"end_offset": 17,
"type": "word",
"position": 46
},
{
"token": "ラスな",
"start_offset": 15,
"end_offset": 18,
"type": "word",
"position": 47
},
{
"token": "ス",
"start_offset": 16,
"end_offset": 17,
"type": "word",
"position": 48
},
{
"token": "スな",
"start_offset": 16,
"end_offset": 18,
"type": "word",
"position": 49
},
{
"token": "スな回",
"start_offset": 16,
"end_offset": 19,
"type": "word",
"position": 50
},
{
"token": "な",
"start_offset": 17,
"end_offset": 18,
"type": "word",
"position": 51
},
{
"token": "な回",
"start_offset": 17,
"end_offset": 19,
"type": "word",
"position": 52
},
{
"token": "な回顧",
"start_offset": 17,
"end_offset": 20,
"type": "word",
"position": 53
},
{
"token": "回",
"start_offset": 18,
"end_offset": 19,
"type": "word",
"position": 54
},
{
"token": "回顧",
"start_offset": 18,
"end_offset": 20,
"type": "word",
"position": 55
},
{
"token": "回顧録",
"start_offset": 18,
"end_offset": 21,
"type": "word",
"position": 56
},
{
"token": "顧",
"start_offset": 19,
"end_offset": 20,
"type": "word",
"position": 57
},
{
"token": "顧録",
"start_offset": 19,
"end_offset": 21,
"type": "word",
"position": 58
},
{
"token": "録",
"start_offset": 20,
"end_offset": 21,
"type": "word",
"position": 59
}
]
}
1-2
{
"tokens": [
{
"token": "ウ",
"start_offset": 0,
"end_offset": 1,
"type": "word",
"position": 0
},
{
"token": "ウイ",
"start_offset": 0,
"end_offset": 2,
"type": "word",
"position": 1
},
{
"token": "イ",
"start_offset": 1,
"end_offset": 2,
"type": "word",
"position": 2
},
{
"token": "イン",
"start_offset": 1,
"end_offset": 3,
"type": "word",
"position": 3
},
{
"token": "ン",
"start_offset": 2,
"end_offset": 3,
"type": "word",
"position": 4
},
{
"token": "ンウ",
"start_offset": 2,
"end_offset": 4,
"type": "word",
"position": 5
},
{
"token": "ウ",
"start_offset": 3,
"end_offset": 4,
"type": "word",
"position": 6
},
{
"token": "ウイ",
"start_offset": 3,
"end_offset": 5,
"type": "word",
"position": 7
},
{
"token": "イ",
"start_offset": 4,
"end_offset": 5,
"type": "word",
"position": 8
},
{
"token": "イリ",
"start_offset": 4,
"end_offset": 6,
"type": "word",
"position": 9
},
{
"token": "リ",
"start_offset": 5,
"end_offset": 6,
"type": "word",
"position": 10
},
{
"token": "リア",
"start_offset": 5,
"end_offset": 7,
"type": "word",
"position": 11
},
{
"token": "ア",
"start_offset": 6,
"end_offset": 7,
"type": "word",
"position": 12
},
{
"token": "アム",
"start_offset": 6,
"end_offset": 8,
"type": "word",
"position": 13
},
{
"token": "ム",
"start_offset": 7,
"end_offset": 8,
"type": "word",
"position": 14
},
{
"token": "ムズ",
"start_offset": 7,
"end_offset": 9,
"type": "word",
"position": 15
},
{
"token": "ズ",
"start_offset": 8,
"end_offset": 9,
"type": "word",
"position": 16
},
{
"token": "ズの",
"start_offset": 8,
"end_offset": 10,
"type": "word",
"position": 17
},
{
"token": "の",
"start_offset": 9,
"end_offset": 10,
"type": "word",
"position": 18
},
{
"token": "のス",
"start_offset": 9,
"end_offset": 11,
"type": "word",
"position": 19
},
{
"token": "ス",
"start_offset": 10,
"end_offset": 11,
"type": "word",
"position": 20
},
{
"token": "スキ",
"start_offset": 10,
"end_offset": 12,
"type": "word",
"position": 21
},
{
"token": "キ",
"start_offset": 11,
"end_offset": 12,
"type": "word",
"position": 22
},
{
"token": "キヤ",
"start_offset": 11,
"end_offset": 13,
"type": "word",
"position": 23
},
{
"token": "ヤ",
"start_offset": 12,
"end_offset": 13,
"type": "word",
"position": 24
},
{
"token": "ヤン",
"start_offset": 12,
"end_offset": 14,
"type": "word",
"position": 25
},
{
"token": "ン",
"start_offset": 13,
"end_offset": 14,
"type": "word",
"position": 26
},
{
"token": "ンダ",
"start_offset": 13,
"end_offset": 15,
"type": "word",
"position": 27
},
{
"token": "ダ",
"start_offset": 14,
"end_offset": 15,
"type": "word",
"position": 28
},
{
"token": "ダラ",
"start_offset": 14,
"end_offset": 16,
"type": "word",
"position": 29
},
{
"token": "ラ",
"start_offset": 15,
"end_offset": 16,
"type": "word",
"position": 30
},
{
"token": "ラス",
"start_offset": 15,
"end_offset": 17,
"type": "word",
"position": 31
},
{
"token": "ス",
"start_offset": 16,
"end_offset": 17,
"type": "word",
"position": 32
},
{
"token": "スな",
"start_offset": 16,
"end_offset": 18,
"type": "word",
"position": 33
},
{
"token": "な",
"start_offset": 17,
"end_offset": 18,
"type": "word",
"position": 34
},
{
"token": "な回",
"start_offset": 17,
"end_offset": 19,
"type": "word",
"position": 35
},
{
"token": "回",
"start_offset": 18,
"end_offset": 19,
"type": "word",
"position": 36
},
{
"token": "回顧",
"start_offset": 18,
"end_offset": 20,
"type": "word",
"position": 37
},
{
"token": "顧",
"start_offset": 19,
"end_offset": 20,
"type": "word",
"position": 38
},
{
"token": "顧録",
"start_offset": 19,
"end_offset": 21,
"type": "word",
"position": 39
},
{
"token": "録",
"start_offset": 20,
"end_offset": 21,
"type": "word",
"position": 40
}
]
}
(I achieved my goal by using 1-2 grams, but I would like some advice on whether I can search more effectively using 1-3 grams. or more faster.)