Elasticsearch Version: 5.3.1
`kuromoji_stemmer` token filterと`kuromoji_readingform` token filterを使って、語末の長音の除去とカタカナ読みへの変換を同時に設定しようとしたのですが、`kuromoji_stemmer`の後に`kuromoji_readingform`を設定すると、`kuromoji_stemmer`で除去されたはずの長音が`kuromoji_readingform`で再び追加されてしまいます。
設定例:
PUT http://localhost:9200/my_index
{
"settings": {
"analysis": {
"filter": {
"katakana_readingform": {
"type": "kuromoji_readingform",
"use_romaji": false
}
},
"analyzer": {
"stemmer": {
"type": "custom",
"tokenizer": "kuromoji_tokenizer",
"filter": [
"kuromoji_stemmer"
]
},
"reading": {
"type": "custom",
"tokenizer": "kuromoji_tokenizer",
"filter": [
"katakana_readingform"
]
},
"stemmer_reading": {
"type": "custom",
"tokenizer": "kuromoji_tokenizer",
"filter": [
"kuromoji_stemmer",
"katakana_readingform"
]
},
"reading_stemmer": {
"type": "custom",
"tokenizer": "kuromoji_tokenizer",
"filter": [
"katakana_readingform",
"kuromoji_stemmer"
]
}
}
}
}
}
クエリ例:
POST http://localhost:9200/my_index/_analyze
{
"analyzer": "stemmer_reading",
"text": "パーカー"
}
クエリ結果:
{
"tokens": [
{
"token": "パーカー",
"start_offset": 0,
"end_offset": 4,
"type": "word",
"position": 0
}
]
}
クエリ例(explain=true):
POST http://localhost:9200/my_index/_analyze
{
"analyzer": "stemmer_reading",
"text": "パーカー",
"explain": true
}
クエリ結果(explain=true):
{
"detail": {
"custom_analyzer": true,
"charfilters": [],
"tokenizer": {
"name": "kuromoji_tokenizer",
"tokens": [
{
"token": "パーカー",
"start_offset": 0,
"end_offset": 4,
"type": "word",
"position": 0,
"baseForm": null,
"bytes": "[e3 83 91 e3 83 bc e3 82 ab e3 83 bc]",
"inflectionForm": null,
"inflectionForm (en)": null,
"inflectionType": null,
"inflectionType (en)": null,
"partOfSpeech": "名詞-一般",
"partOfSpeech (en)": "noun-common",
"positionLength": 1,
"pronunciation": "パーカー",
"pronunciation (en)": "paka",
"reading": "パーカー",
"reading (en)": "paka"
}
]
},
"tokenfilters": [
{
"name": "kuromoji_stemmer",
"tokens": [
{
"token": "パーカ",
"start_offset": 0,
"end_offset": 4,
"type": "word",
"position": 0,
"baseForm": null,
"bytes": "[e3 83 91 e3 83 bc e3 82 ab]",
"inflectionForm": null,
"inflectionForm (en)": null,
"inflectionType": null,
"inflectionType (en)": null,
"keyword": false,
"partOfSpeech": "名詞-一般",
"partOfSpeech (en)": "noun-common",
"positionLength": 1,
"pronunciation": "パーカー",
"pronunciation (en)": "paka",
"reading": "パーカー",
"reading (en)": "paka"
}
]
},
{
"name": "katakana_readingform",
"tokens": [
{
"token": "パーカー",
"start_offset": 0,
"end_offset": 4,
"type": "word",
"position": 0,
"baseForm": null,
"bytes": "[e3 83 91 e3 83 bc e3 82 ab e3 83 bc]",
"inflectionForm": null,
"inflectionForm (en)": null,
"inflectionType": null,
"inflectionType (en)": null,
"keyword": false,
"partOfSpeech": "名詞-一般",
"partOfSpeech (en)": "noun-common",
"positionLength": 1,
"pronunciation": "パーカー",
"pronunciation (en)": "paka",
"reading": "パーカー",
"reading (en)": "paka"
}
]
}
]
}
}
この問題は、`kuromoji_stemmer`と`kuromoji_readingform`の順番を逆にする(上記設定の`reading_stemmer`アナライザーを使う)ことでひとまず回避できそうですが、もし原因がわかれば教えていただきたいです。
POST http://localhost:9200/my_index/_analyze
{
"analyzer": "reading_stemmer",
"text": "パーカー"
}
{
"tokens": [
{
"token": "パーカ",
"start_offset": 0,
"end_offset": 4,
"type": "word",
"position": 0
}
]
}
よろしくお願いいたします。