Organize search by priority with/without spaces

Hello. I am using Elasticsearch 7.15.0.
I want to rank search results by priority, matching the query both with and without spaces.
Here is what I mean:

query - "surf coff"

  1. First, I want to see records that contain or start with "surf coff" - (SURF COFFEE, SURF CAFFETERIA, SURFCOFFEE MAN)
  2. Then, records that contain or start with "surf" - (SURF, SURF LOVE, ENDLESS SURF)
  3. Then, records that contain or start with "coff" - (LOVE COFFEE, COFFEE MAN)

query - "surfcoff"

  1. I want to see only records that contain or start with "surfcoff" - (SURF COFFEE, SURF CAFFETERIA, SURFCOFFEE MAN).

I created an analyzer with the following token filters:

  • lowercase
  • word_delimiter_graph
  • shingle
  • edge_ngram
  • pattern_replace (to remove spaces)
{
   "settings":{
       "index": {
            "max_shingle_diff" : 9,
            "max_ngram_diff": 9
       },
      "analysis":{
         "analyzer":{
            "word_join_analyzer":{
               "tokenizer":"standard",
               "filter":[
                  "lowercase",
                  "word_delimiter_graph",
                  "my_shingle",
                  "my_edge_ngram",
                  "my_char_filter"
               ]
            }
         },
         "filter":{
            "my_shingle":{
               "type":"shingle",
               "min_shingle_size": 2,
                "max_shingle_size": 10
            },
            "my_edge_ngram": { 
                "type": "edge_ngram",
                "min_gram": 2,
                "max_gram": 10,
                "token_chars": ["letter", "digit"]
            },
            "my_char_filter": {
                "type": "pattern_replace",
                "pattern": " ",
                "replacement": ""
            }
         }
      }
   }
}

So when I analyzed the text "SURF COFFEE", I got this result:

{
    "tokens": [
        {
            "token": "su",
            "start_offset": 0,
            "end_offset": 4,
            "type": "<ALPHANUM>",
            "position": 0
        },
        {
            "token": "sur",
            "start_offset": 0,
            "end_offset": 4,
            "type": "<ALPHANUM>",
            "position": 0
        },
        {
            "token": "surf",
            "start_offset": 0,
            "end_offset": 4,
            "type": "<ALPHANUM>",
            "position": 0
        },
        {
            "token": "su",
            "start_offset": 0,
            "end_offset": 11,
            "type": "shingle",
            "position": 0,
            "positionLength": 2
        },
        {
            "token": "sur",
            "start_offset": 0,
            "end_offset": 11,
            "type": "shingle",
            "position": 0,
            "positionLength": 2
        },
        {
            "token": "surf",
            "start_offset": 0,
            "end_offset": 11,
            "type": "shingle",
            "position": 0,
            "positionLength": 2
        },
        {
            "token": "surf",
            "start_offset": 0,
            "end_offset": 11,
            "type": "shingle",
            "position": 0,
            "positionLength": 2
        },
        {
            "token": "surfc",
            "start_offset": 0,
            "end_offset": 11,
            "type": "shingle",
            "position": 0,
            "positionLength": 2
        },
        {
            "token": "surfco",
            "start_offset": 0,
            "end_offset": 11,
            "type": "shingle",
            "position": 0,
            "positionLength": 2
        },
        {
            "token": "surfcof",
            "start_offset": 0,
            "end_offset": 11,
            "type": "shingle",
            "position": 0,
            "positionLength": 2
        },
        {
            "token": "surfcoff",
            "start_offset": 0,
            "end_offset": 11,
            "type": "shingle",
            "position": 0,
            "positionLength": 2
        },
        {
            "token": "surfcoffe",
            "start_offset": 0,
            "end_offset": 11,
            "type": "shingle",
            "position": 0,
            "positionLength": 2
        },
        {
            "token": "co",
            "start_offset": 5,
            "end_offset": 11,
            "type": "<ALPHANUM>",
            "position": 1
        },
        {
            "token": "cof",
            "start_offset": 5,
            "end_offset": 11,
            "type": "<ALPHANUM>",
            "position": 1
        },
        {
            "token": "coff",
            "start_offset": 5,
            "end_offset": 11,
            "type": "<ALPHANUM>",
            "position": 1
        },
        {
            "token": "coffe",
            "start_offset": 5,
            "end_offset": 11,
            "type": "<ALPHANUM>",
            "position": 1
        },
        {
            "token": "coffee",
            "start_offset": 5,
            "end_offset": 11,
            "type": "<ALPHANUM>",
            "position": 1
        }
    ]
}

As you can see, there is a token "surfcoff".

How should my search be organized?

I've tried combining approaches in a bool should query, using
query_string, match_phrase_prefix, match_prefix and others.

But none of them gave the correct results.

Can you please help me?

How should my query be built?
Or maybe I should try other analyzer filters?

For example query

{
  "query": {
    "bool": {
      "should": [
        {
          "query_string": {
                "query": "surf coff",
                "default_field": "text",
                "default_operator": "AND"
            }
        },
        {
          "query_string": {
                "query": "surf",
                "default_field": "text"
            }
        },
        {
          "query_string": {
                "query": "coff",
                "default_field": "text"
            }
        }
      ]
    }
  }
}

or this query

{
  "query": {
    "bool": {
      "should": [
        {
          "query_string": {
                "query": "(surf coff) OR (surf) OR (coff)",
                "default_field": "text"
            }
        }
      ]
    }
  }
}

or this query

{
  "query": {
    "bool": {
      "should": [
        {
          "query_string": {
                "query": "((surf AND coff)^3 OR (surf)^2 OR (coff)^1)",
                "default_field": "text"
            }
        }
      ]
    }
  }
}

or

{
  "query": {
    "match_bool_prefix" : {
      "text" : "surf coff"
    }
  }
}

gives

  1. SURF COFFEE SURFING NEVER ALONE
  2. CONOSUR COLCHAGUA CONO SUR
  3. SUNRISE CONCHA TORO SUNRISE 300 DAYS
  4. SUN COFFEE
  5. SURF COFFEE PROPAGANDA
    ....

But this looks strange to me; I think I am misunderstanding something.

{
  "query": {
    "bool": {
      "should": [
        {
          "query_string": {
                "query": "(surf* AND coff*)^3 OR (surf*)^2 OR (coff*)^1",
                "default_field": "text"
            }
        }
      ]
    }
  }
}
{
   "settings":{
       "index": {
            "max_shingle_diff" : 9,
            "max_ngram_diff": 9
       },
      "analysis":{
         "analyzer":{
            "word_join_analyzer":{
               "tokenizer":"standard",
               "filter":[
                  "lowercase",
                  "word_delimiter_graph",
                   "my_shingle",
                   "my_char_filter"
               ]
            }
         },
         "filter":{
            "my_shingle":{
               "type":"shingle",
               "min_shingle_size": 2,
                "max_shingle_size": 10
            },
            "my_char_filter": {
                "type": "pattern_replace",
                "pattern": " ",
                "replacement": ""
            }
         }
      }
   }
}

Removing the edge_ngram filter and adding a wildcard query with priorities (boosts) resolved my question.
But I still don't understand why edge_ngram didn't work.

Finally resolved with

"filter":[
                  "lowercase",
                  "word_delimiter_graph",
                  "my_shingle",
                   "my_edge_ngram",
                  "my_char_filter"
               ]

The problem was with the search_analyzer. As the docs say: "Sometimes, though, it can make sense to use a different analyzer at search time, such as when using the edge_ngram tokenizer for autocomplete or when using search-time synonyms."

So I added the standard search_analyzer to my text field:

"text": { "type": "text", "analyzer": "word_join_analyzer", "search_analyzer": "standard" }

Search query:

{
  "query": {
    "bool": {
      "should": [
        {
          "query_string": {
                "query": "surf coff",
                "default_field": "text"
            }
        }
      ]
    }
  }
}

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.