I feel this is supposed to be a very straightforward problem, but for some reason I can't wrap my head around it.
I want to build a product search engine using Elasticsearch. I have a problem when it comes to concatenated words. For example, I want to search for a smart watch, so I am running two different queries: (1) "Smart watch" and (2) "smartwatch".
In (1) I get results that have either "smartwatch" or "smart watch" in their product title. However, in (2) I only get products that have "smartwatch" in the title; I do not get any variation with whitespace between "smart" and "watch".
This is my index config:
# Elasticsearch index definition (settings + mappings) for the product index.
#
# Analysis overview:
#   * `nGram_analyzer` is the analyzer wired to the `title` field, for both
#     indexing and searching. It strips HTML, rewrites "$" to "dollar",
#     inserts spaces at letter/digit and alphanumeric/non-alphanumeric
#     boundaries, splits on whitespace, lowercases, ASCII-folds and finally
#     emits edge n-grams of 2..20 characters per token.
#   * `whitespace_analyzer` applies synonyms and a stop list instead of
#     n-grams. It is defined here but no field mapping below references it.
config = {
    "settings": {
        "analysis": {
            "analyzer": {
                "nGram_analyzer": {
                    "type": "custom",
                    "tokenizer": "whitespace",
                    "char_filter": [
                        "html_strip",
                        "custom_char_filter",
                        "space_maker_2",
                        "space_maker_3"
                    ],
                    "filter": [
                        "lowercase",
                        "asciifolding",
                        "nGram_filter"
                    ]
                },
                "whitespace_analyzer": {
                    "type": "custom",
                    "tokenizer": "whitespace",
                    "char_filter": [
                        "space_maker_2",
                        "space_maker_3"
                    ],
                    "filter": [
                        "lowercase",
                        "asciifolding",
                        "synonym_apply",
                        "special_stopwards"
                    ]
                }
            },
            "char_filter": {
                # Spell out the currency symbol so it survives tokenization.
                "custom_char_filter": {
                    "type": "mapping",
                    "mappings": [
                        "$ => dollar"
                    ]
                },
                # Inserts a space at lower-case/upper-case boundaries.
                # Defined but not referenced by either analyzer above.
                "space_maker_1": {
                    "type": "pattern_replace",
                    "pattern": "(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[a-z])",
                    "replacement": " "
                },
                # Inserts a space between a digit and an adjacent letter
                # (in either order).
                "space_maker_2": {
                    "type": "pattern_replace",
                    "pattern": "(?<=\\p{Digit})(?=\\p{Alpha})|(?<=\\p{Alpha})(?=\\p{Digit})",
                    "replacement": " "
                },
                # Inserts a space between an ASCII alphanumeric character and
                # an adjacent non-alphanumeric character (in either order).
                "space_maker_3": {
                    "type": "pattern_replace",
                    "pattern": "(?<=[a-zA-Z0-9])(?=[^a-zA-Z0-9])|(?<=[^a-zA-Z0-9])(?=[a-zA-Z0-9])",
                    "replacement": " "
                }
            },
            "filter": {
                # Edge n-grams (prefixes) of each token, 2 to 20 characters.
                # `token_chars` is an option of the edge_ngram *tokenizer*,
                # not of the token filter, so it is intentionally not set here.
                "nGram_filter": {
                    "type": "edge_ngram",
                    "min_gram": 2,
                    "max_gram": 20
                },
                "synonym_apply": {
                    "type": "synonym",
                    # Real boolean rather than the string "true".
                    "lenient": True,
                    "synonyms": [
                        "kilo, kilogram => kg",
                        "buck, dollar => usd"
                    ]
                },
                "special_stopwards": {
                    "type": "stop",
                    "stopwords": ["ass", "butt"]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "brand": {
                "type": "keyword"
            },
            "category": {
                "type": "keyword"
            },
            "tags": {
                "type": "keyword"
            },
            "domain": {
                "type": "keyword"
            },
            "image": {
                "type": "text"
            },
            "purchases": {
                "type": "double"
            },
            "views": {
                "type": "double"
            },
            "price": {
                "type": "double"
            },
            "product_id": {
                "type": "text"
            },
            "product_url": {
                "type": "text"
            },
            # The same edge-n-gram analyzer runs at index time and at query
            # time, so query terms are expanded into prefixes as well.
            # NOTE(review): a non-n-gram search analyzer is the usual pairing
            # for an edge-n-gram index analyzer -- confirm this is intended.
            "title": {
                "type": "text",
                "analyzer": "nGram_analyzer",
                "search_analyzer": "nGram_analyzer"
            },
            "description": {
                "type": "text"
            },
            "country": {
                "type": "integer"
            },
            # NOTE(review): mapped as text rather than date -- confirm.
            "last_seen_date": {
                "type": "text"
            }
        }
    }
}
I'm currently using a simple match query against the product title only.
How can I change my query or index to solve this issue? Or is it even solvable?