Elasticsearch Multi-word synonyms and stopword issue

I have created an index as follows

PUT /wordforms_example
{
"mappings": {
      "dynamic": "strict",
      "properties": {
        "test_field": {
          "type": "text",
          "analyzer": "synonym_words_analyzer_index",
          "search_analyzer": "synonym_words_analyzer_search"
        }
        
      }
},
"settings": {
      "index": {
        "analysis": {
          "filter": {
            "english_stopwords": {
            "type":"stop",
            "language":"english",
            "stopwords":["a","an","and","are","as","at","be","but","by","for","if","in","into","is","it","no","not",
              "of","on","or","such","that","the","their","then","there","these","they",
              "this", "to", "was", "will", "with", "vs"]
            },
            "english_synonyms_index": {
              "type": "synonym",
              "expand": true,
              "synonyms": ["ICC, I.C.C, International Cricket Council",
"ICB, I.C.B, International Cricket Board"]
            },
            "english_synonyms_search": {
              "type": "synonym_graph",
              "expand": true,
              "synonyms": ["ICC, I.C.C, International Cricket Council",
"ICB, I.C.B, International Cricket Board"]
            }
          },
          "analyzer": {
            "synonym_words_analyzer_search": {
              "filter": [
                "lowercase",
                "english_synonyms_search",
                "english_stopwords"
              ],
              "tokenizer": "standard"
            },
            "synonym_words_analyzer_index": {
              "filter": [
                "lowercase",
                "english_synonyms_index",
                "english_stopwords"
              ],
              "tokenizer": "standard"
            }
          }
        },
        "number_of_replicas": "0"
      }
    }
}

Then I indexed some data, as shown below:

POST wordforms_example/_doc
{
  "test_field": "International Cricket Council vs bcci"
}

POST wordforms_example/_doc
{
  "test_field": "I.C.C vs bcci"
}

POST /wordforms_example/_doc
{
  "test_field": "ICC vs bcci"
}

POST /wordforms_example/_doc
{
  "test_field": "ICB vs bcci"
}

POST /wordforms_example/_doc
{
  "test_field": "I.C.B vs bcci"
}

POST wordforms_example/_doc
{
  "test_field": "International Cricket Board vs bcci"
}

Then I tried to retrieve the data using a match_phrase query, as follows:

GET wordforms_example/_search
{
   "track_total_hits":true,
   "highlight":{
      "require_field_match":true,
      "fields":{
         "*":{
            
         }
      },
      "pre_tags":[
         "<b>"
      ],
      "post_tags":[
         "</b>"
      ]
   },
   "query":{
      "bool":{
         "must":[
            {
               "match_phrase":{
                  "test_field":{
                     "query":"ICC vs bcci"
                  }
               }
            }
         ]
      }
   },
   "sort":{
      "_score":"desc"
   }
}

But Elasticsearch didn't return any results.

However, when I searched for "ICC vs" instead of "ICC vs bcci", using the same match_phrase query, it returned the following results:

{
  "took": 1,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 3,
      "relation": "eq"
    },
    "max_score": 3.9206777,
    "hits": [
      {
        "_index": "wordforms_example",
        "_id": "bfjjE4QBPemNUyXPwGRi",
        "_score": 3.9206777,
        "_source": {
          "test_field": "I.C.C vs bcci"
        },
        "highlight": {
          "test_field": [
            "<b>I.C.C</b> <b>vs</b> <b>bcci</b>"
          ]
        }
      },
      {
        "_index": "wordforms_example",
        "_id": "bvjjE4QBPemNUyXPyWTI",
        "_score": 3.9206777,
        "_source": {
          "test_field": "ICC vs bcci"
        },
        "highlight": {
          "test_field": [
            "<b>ICC</b> <b>vs</b> <b>bcci</b>"
          ]
        }
      },
      {
        "_index": "wordforms_example",
        "_id": "bPjjE4QBPemNUyXPtmRv",
        "_score": 3.7698822,
        "_source": {
          "test_field": "International Cricket Council vs bcci"
        },
        "highlight": {
          "test_field": [
            "<b>International Cricket</b> <b>Council</b> vs bcci"
          ]
        }
      }
    ]
  }
}

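Looking at these results, we can see that some of the highlights are also invalid: for example, "bcci" is highlighted in the first two hits even though the query was only "ICC vs", while in the third hit "vs" is not highlighted at all.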

I also get invalid highlighting for the query below, which uses match queries:

GET wordforms_example/_search
{
   "track_total_hits":true,
   "timeout":"5s",
   "highlight":{
      "require_field_match":true,
      "fields":{
         "*":{
            
         }
      },
      "pre_tags":[
         "<b>"
      ],
      "post_tags":[
         "</b>"
      ]
   },
   "query":{
      "bool":{
         "must":[
            {
               "match":{
                  "test_field":{
                     "query":"icc",
                     "zero_terms_query":"all"
                  }
               }
            },
            {
               "match":{
                  "test_field":{
                     "query":"vs",
                     "zero_terms_query":"all"
                  }
               }
            },
            {
               "match":{
                  "test_field":{
                     "query":"bcci",
                     "zero_terms_query":"all"
                  }
               }
            }
         ]
      }
   },
   "from":0,
   "size":10
}

The problems I am facing are:

  1. Highlighting is not as expected
  2. If I search with match_phrase for text that contains both a stopword and a multi-word synonym, I do not get any results, even though there is a matching document. (In the above example, I am searching for "ICC vs bcci".)

I have referred to this, but it didn't solve my use case.

I also checked the analyzers, and they generate valid tokens:

POST wordforms_example/_analyze
{
  "text": "ICC vs bcci",
  "analyzer": "synonym_words_analyzer_index"
}
POST wordforms_example/_analyze
{
  "text": "ICC vs bcci",
  "analyzer": "synonym_words_analyzer_search"
}

Tokens it generated:

{
  "tokens": [
    {
      "token": "i.c.c",
      "start_offset": 0,
      "end_offset": 3,
      "type": "SYNONYM",
      "position": 0,
      "positionLength": 3
    },
    {
      "token": "international",
      "start_offset": 0,
      "end_offset": 3,
      "type": "SYNONYM",
      "position": 0
    },
    {
      "token": "icc",
      "start_offset": 0,
      "end_offset": 3,
      "type": "<ALPHANUM>",
      "position": 0,
      "positionLength": 3
    },
    {
      "token": "cricket",
      "start_offset": 0,
      "end_offset": 3,
      "type": "SYNONYM",
      "position": 1
    },
    {
      "token": "council",
      "start_offset": 0,
      "end_offset": 3,
      "type": "SYNONYM",
      "position": 2
    },
    {
      "token": "bcci",
      "start_offset": 7,
      "end_offset": 11,
      "type": "<ALPHANUM>",
      "position": 4
    }
  ]
}

Thank you in advance.

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.