Minimum_should_match ignored in ES 2.4.6

@jimczi Posted it here too!

I am getting weird behaviour with "minimum_should_match" used inside "multi_match".
ES version: 2.4.6

Plugins installed: Kibana, Sense

Java version:
java version "1.8.0_102"
Java(TM) SE Runtime Environment (build 1.8.0_102-b14)
Java HotSpot(TM) 64-Bit Server VM (build 25.102-b14, mixed mode)

OS:
Darwin Kernel Version 17.7.0

Here's the request I am using to create the index:

PUT /docs
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 1,
    "index": {
      "analysis": {
        "filter": {
          "trigrams_filter": {
            "type": "ngram",
            "min_gram": 3,
            "max_gram": 3
          }
        },
        "analyzer": {
          "analyzer_startswith": {
            "tokenizer": "keyword",
            "filter": "lowercase"
          },
          "snowball": {
            "type": "snowball"
          },
          "trigrams": {
            "type": "custom",
            "tokenizer": "standard",
            "filter": [
              "lowercase",
              "trigrams_filter"
            ]
          }
        }
      }
    }
  },
  "mappings": {
    "modelresult": {
      "properties": {
        "allergies": {
          "type": "string",
          "analyzer": "snowball"
        },
        "diets": {
          "type": "string",
          "analyzer": "snowball"
        },
        "mealtypes": {
          "type": "string",
          "analyzer": "snowball"
        },
        "medications": {
          "type": "string",
          "analyzer": "snowball"
        },
        "django_ct": {
          "type": "string",
          "index": "not_analyzed",
          "include_in_all": false
        },
        "django_id": {
          "type": "string",
          "index": "not_analyzed",
          "include_in_all": false
        },
        "id": {
          "type": "string"
        },
        "is_published": {
          "type": "boolean"
        },
        "outdated": {
          "type": "boolean"
        },
        "source_db": {
          "type": "string"
        },
        "FdGrp_Cd": {
          "type": "string"
        },
        "original_country": {
          "type": "string"
        },
        "additional_information": {
          "type": "string"
        },
        "text": {
          "type": "string",
          "fields": {
            "en": {
              "type": "string",
              "analyzer": "english"
            },
            "fi": {
              "type": "string",
              "analyzer": "finnish"
            },
            "de": {
              "type": "string",
              "analyzer": "german"
            },
            "sv": {
              "type": "string",
              "analyzer": "swedish"
            },
            "general": {
              "type": "string",
              "analyzer": "trigrams"
            }
          }
        },
        "user_id": {
          "type": "string",
          "analyzer": "snowball"
        },
        "homecare_area": {
          "type": "string",
          "analyzer": "snowball"
        },
        "lang": {
          "type": "string",
          "analyzer": "snowball"
        }
      }
    }
  }
}
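
For reference, you can check what the trigrams analyzer produces for a given term with the _analyze API (a minimal sketch against the index above; the response lists each token together with its offsets and position):

GET /docs/_analyze
{
  "analyzer": "trigrams",
  "text": "makkara"
}

For makkara this returns the five trigrams mak, akk, kka, kar and ara.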

Here's the query I am running:

GET /docs/_search
{
  "query": {
    "filtered": {
      "filter": {
        "bool": {
          "must": [
            {
              "term": {
                "outdated": "false"
              }
            },
            {
              "term": {
                "lang": "fi"
              }
            }
          ],
          "must_not": [
            {
              "term": {
                "django_ct": "web.plantranslation"
              }
            },
            {
              "term": {
                "django_ct": "web.recipetranslation"
              }
            },
            {
              "term": {
                "django_ct": "web.fooddestranslation"
              }
            },
            {
              "term": {
                "django_ct": "web.dishtranslation"
              }
            },
            {
              "term": {
                "django_ct": "web.customerinformation"
              }
            }
          ],
          "should": []
        }
      },
      "query": {
        "bool": {
          "should": [
            {
              "span_first": {
                "boost": 10,
                "end": 1,
                "match": {
                  "span_term": {
                    "text": "makkara"
                  }
                }
              }
            }
          ],
          "must": [
            {
              "multi_match": {
                "query": "makkara",
                "type": "most_fields",
                "fields": ["text.general", "text.fi"],
                "minimum_should_match": "50%"
              }
            }
          ]
        }
      }
    }
  },
  "size": 1000
}

The minimum_should_match is being ignored in ES 2.4.6: no matter what value I set (50%, 100%), the query always returns far too many results.
I have tested the same query on ES 1.7, 2.0.1 and 2.0.2, where it worked as expected, but on ES 2.4.6 it is ignored. What is the reason? Was some change introduced that makes "minimum_should_match" obsolete?

Here are some examples of the text values in the returned results:
legit results: MAKSAMAKKARA, BALKANMAKKARA
wrong results: PERSIKKARAHKA, MUSTIKKAPIIRAKKA 2 KPL, LIHAPIIRAKKA 4 KPL
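
One way to see why these documents match is to inspect the Lucene query that Elasticsearch actually builds from the multi_match. A sketch using the validate API with explain enabled (reduced to just the multi_match part of the query above):

GET /docs/_validate/query?explain
{
  "query": {
    "multi_match": {
      "query": "makkara",
      "type": "most_fields",
      "fields": ["text.general", "text.fi"],
      "minimum_should_match": "50%"
    }
  }
}

The explanation string shows how the analyzed tokens are grouped into clauses on text.general, which is where 2.0.x and 2.4.6 differ.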

This is caused by a change in the Lucene query parser, which now treats terms at the same position as synonyms. You are using an ngram token filter, which sets every ngram at the same position: makkara is analyzed as mak, akk, kka, kar, ara, but all of these tokens are indexed at the same position. Because the parser groups same-position terms into a single synonym clause, minimum_should_match has only one clause to count against, so it effectively does nothing. If you want to index ngrams with correct positions, you should use an ngram tokenizer instead, which assigns consecutive positions to the tokens.
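
A minimal sketch of that change, assuming a new index (here called docs2, a hypothetical name), since existing documents must be reindexed for the new analyzer to take effect. The trigrams are defined at the tokenizer level instead of as a token filter; token_chars is an optional refinement so that ngrams do not span whitespace, roughly matching what the standard tokenizer did before:

PUT /docs2
{
  "settings": {
    "index": {
      "analysis": {
        "tokenizer": {
          "trigrams_tokenizer": {
            "type": "ngram",
            "min_gram": 3,
            "max_gram": 3,
            "token_chars": ["letter", "digit"]
          }
        },
        "analyzer": {
          "trigrams": {
            "type": "custom",
            "tokenizer": "trigrams_tokenizer",
            "filter": ["lowercase"]
          }
        }
      }
    }
  }
}

Running the earlier _analyze request against this analyzer should show mak, akk, kka, kar and ara at consecutive positions rather than all at the same one, so minimum_should_match again has five clauses to count: "100%" then requires all five trigrams to be present in the field.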
