I tried using synonyms, but that doesn't solve the problem.
Synonym expansion doesn't work when a synonym contains spaces.
Like this:
analysis/synonym.txt
thiopental allergy,penthiobarbital allergy,pentothiobarbital allergy
d-mannitol allergy,mannitol allergy
cefotaxime allergy
cephalosporin allergy
amodiaquine allergy,camoquin allergy,flavoquine allergy
The result I want is
GET test_index/_analyze
{
"analyzer":"my_analyzer",
"text": "thiopental allergy is a test"
}
result:
{
"tokens": [
{
"token": "thiopental allergy",
"start_offset": 0,
"end_offset": 18,
"type": "word",
"position": 0
},
{
"token": "penthiobarbital allergy",
"start_offset": 0,
"end_offset": 18,
"type": "word",
"position": 0
},
{
"token": "pentothiobarbital allergy",
"start_offset": 0,
"end_offset": 18,
"type": "word",
"position": 0
},
{
"token": "is",
"start_offset": 19,
"end_offset": 21,
"type": "word",
"position": 0
},
{
"token": "a",
"start_offset": 22,
"end_offset": 23,
"type": "word",
"position": 0
},
{
"token": "test",
"start_offset": 24,
"end_offset": 28,
"type": "word",
"position": 0
}
]
}
But in reality, here is what actually happens:
1. when I use "," tokenizer and synonyms
PUT test_index
{
"settings": {
"index": {
"number_of_shards": 5,
"number_of_replicas": 1,
"analysis": {
"analyzer": {
"index_analyzer": {
"tokenizer": "standard",
"filter": ["lowercase"],
"type": "custom"
},
"my_analyzer": {
"tokenizer": "comma",
"filter": ["my_synonym","lowercase"],
"type": "custom"
}
},
"filter": {
"my_synonym": {
"ignore_case": "true",
"expand": "true",
"type": "synonym",
"synonyms_path": "analysis/synonym.txt"
}
},
"tokenizer":{
"comma":{
"type": "pattern",
"pattern":",|,"
}
}
}
}
},
"mappings": {
"properties": {
"abstract": {
"type": "text",
"analyzer": "index_analyzer",
"search_analyzer": "my_analyzer"
}
}
}
}
GET test_index/_analyze
{
"analyzer":"my_analyzer",
"text": "thiopental allergy is a test"
}
result:
{
"tokens": [
{
"token": "thiopental allergy is a test",
"start_offset": 0,
"end_offset": 28,
"type": "word",
"position": 0
}
]
}
2. when I use standard tokenizer and synonyms
PUT test_index2
{
"settings": {
"index": {
"number_of_shards": 5,
"number_of_replicas": 1,
"analysis": {
"analyzer": {
"index_analyzer": {
"tokenizer": "standard",
"filter": ["lowercase"],
"type": "custom"
},
"my_analyzer": {
"tokenizer": "standard",
"filter": ["my_synonym","lowercase"],
"type": "custom"
}
},
"filter": {
"my_synonym": {
"ignore_case": "true",
"expand": "true",
"type": "synonym",
"synonyms_path": "analysis/synonymOld.txt"
}
}
}
}
},
"mappings": {
"properties": {
"abstract": {
"type": "text",
"analyzer": "index_analyzer",
"search_analyzer": "my_analyzer"
}
}
}
}
GET test_index2/_analyze
{
"analyzer":"my_analyzer",
"text": "thiopental allergy is a test"
}
result:
{
"tokens": [
{
"token": "thiopental",
"start_offset": 0,
"end_offset": 10,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "penthiobarbital",
"start_offset": 0,
"end_offset": 10,
"type": "SYNONYM",
"position": 0
},
{
"token": "pentothiobarbital",
"start_offset": 0,
"end_offset": 10,
"type": "SYNONYM",
"position": 0
},
{
"token": "allergy",
"start_offset": 11,
"end_offset": 18,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "allergy",
"start_offset": 11,
"end_offset": 18,
"type": "SYNONYM",
"position": 1
},
{
"token": "allergy",
"start_offset": 11,
"end_offset": 18,
"type": "SYNONYM",
"position": 1
},
{
"token": "is",
"start_offset": 19,
"end_offset": 21,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "a",
"start_offset": 22,
"end_offset": 23,
"type": "<ALPHANUM>",
"position": 3
},
{
"token": "test",
"start_offset": 24,
"end_offset": 28,
"type": "<ALPHANUM>",
"position": 4
}
]
}
Neither of these results meets my needs.