Dealing with concatenated words in Elasticsearch

I feel this is supposed to be a very straightforward problem, but for some reason I can't wrap my head around it.

I want to build a product search engine using Elasticsearch. I have a problem when it comes to concatenated words. For example, I want to search for smart watches, so I am running two different queries: (1) "Smart watch" and (2) "smartwatch".

In (1) I get results that have either "smartwatch" or "smart watch" in their product title. However, in (2) I only get products that have "smartwatch"; I do not get any variation with whitespace between "smart" and "watch".

This is my index config:

# Index definition: custom analysis settings followed by the field mappings.
config = {
    "settings": {
        "analysis": {
            "analyzer": {
                # Whitespace-tokenized analyzer whose last token filter is the
                # edge n-gram filter. The "title" mapping below uses it for
                # both indexing and searching.
                "nGram_analyzer": {
                    "type": "custom",
                    "tokenizer": "whitespace",
                    "char_filter": [
                        "html_strip",
                        "custom_char_filter",
                        "space_maker_2",
                        "space_maker_3",
                    ],
                    "filter": ["lowercase", "asciifolding", "nGram_filter"],
                },
                # Same tokenizer without n-grams; applies the synonym and
                # stop-word filters instead. No field in the mappings below
                # references this analyzer.
                "whitespace_analyzer": {
                    "type": "custom",
                    "tokenizer": "whitespace",
                    "char_filter": ["space_maker_2", "space_maker_3"],
                    "filter": [
                        "lowercase",
                        "asciifolding",
                        "synonym_apply",
                        "special_stopwards",
                    ],
                },
            },
            "char_filter": {
                # Spells out the currency sign as a word.
                "custom_char_filter": {
                    "type": "mapping",
                    "mappings": ["$ => dollar"],
                },
                # Inserts a space at lowercase/uppercase boundaries.
                # Neither analyzer above lists this char filter.
                "space_maker_1": {
                    "type": "pattern_replace",
                    "pattern": "(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[a-z])",
                    "replacement": " ",
                },
                # Inserts a space at digit/letter boundaries.
                "space_maker_2": {
                    "type": "pattern_replace",
                    "pattern": "(?<=\\p{Digit})(?=\\p{Alpha})|(?<=\\p{Alpha})(?=\\p{Digit})",
                    "replacement": " ",
                },
                # Inserts a space at alphanumeric/non-alphanumeric boundaries.
                "space_maker_3": {
                    "type": "pattern_replace",
                    "pattern": "(?<=[a-zA-Z0-9])(?=[^a-zA-Z0-9])|(?<=[^a-zA-Z0-9])(?=[a-zA-Z0-9])",
                    "replacement": " ",
                },
            },
            "filter": {
                # Edge n-grams of length 2 to 20.
                # NOTE(review): "token_chars" is documented for the n-gram
                # tokenizers; confirm the token filter honours it.
                "nGram_filter": {
                    "type": "edge_ngram",
                    "min_gram": 2,
                    "max_gram": 20,
                    "token_chars": ["letter", "digit", "punctuation", "symbol"],
                },
                "synonym_apply": {
                    "type": "synonym",
                    "lenient": "true",
                    "synonyms": [
                        "kilo, kilogram => kg",
                        "buck, dollar => usd",
                    ],
                },
                "special_stopwards": {
                    "type": "stop",
                    "stopwords": ["ass", "butt"],
                },
            },
        },
    },
    "mappings": {
        "properties": {
            "brand": {"type": "keyword"},
            "category": {"type": "keyword"},
            "tags": {"type": "keyword"},
            "domain": {"type": "keyword"},
            "image": {"type": "text"},
            "purchases": {"type": "double"},
            "views": {"type": "double"},
            "price": {"type": "double"},
            "product_id": {"type": "text"},
            "product_url": {"type": "text"},
            # The only field with a custom analyzer; the same analyzer is
            # named for index time and search time.
            "title": {
                "type": "text",
                "analyzer": "nGram_analyzer",
                "search_analyzer": "nGram_analyzer",
            },
            "description": {"type": "text"},
            "country": {"type": "integer"},
            "last_seen_date": {"type": "text"},
        },
    },
}

I'm currently using a simple match query against the product title only.

How can I change my query or index to solve this issue? Or is it even solvable?

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.