Highlights not generated for certain Fuzzy & Wildcard Phrase match queries using Span

Hey everybody, I'm running into an issue with highlights not being generated for certain phrase match queries using wildcards or fuzzy matching with spans.

Below are the steps to reproduce the issue from Kibana.

Elasticsearch version 7.9.3

1. Create Index

PUT /test-index-1
{
  "mappings": {
    "properties": {
      "transcription_en": {
        "type" : "text",
        "fields" : {
          "keyword" : {
            "type" : "keyword",
            "ignore_above" : 10000
          }
        }
      }
    }
  }
}

2. Create Document

PUT /test-index-1/_doc/1
{
  "transcription_en": "this is an example of a transcription string for fuzzy and wildcard phrase matching"
}

3. Check Queries

WORKING QUERIES - these generate highlights as expected

3a. Basic phrase match query:

GET test-index-1/_search
{
  "highlight": {
    "fields": {
      "transcription_en": {}
    },
    "number_of_fragments": 0
  },
  "query": {
    "bool": {
      "must": [
        {
          "bool": {
            "should": [
              {"match_phrase": {"transcription_en": "phrase matching"}}
            ]
          }
        }
      ]
    }
  }
}

3a. Response:

{
  "took" : 5,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 0.5753642,
    "hits" : [
      {
        "_index" : "test-index-1",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 0.5753642,
        "_source" : {
          "transcription_en" : "this is an example of a transcription string for fuzzy and wildcard phrase matching"
        },
        "highlight" : {
          "transcription_en" : [
            "this is an example of a transcription string for fuzzy and wildcard <em>phrase</em> <em>matching</em>"
          ]
        }
      }
    ]
  }
}

3b. Fuzzy phrase match query:

GET test-index-1/_search
{
  "highlight": {
    "fields": {
      "transcription_en": {}
    },
    "number_of_fragments": 0
  },
  "query": {
    "bool": {
      "must": [
        {
          "bool": {
            "should": [
              {
                "span_near": {
                  "clauses": [
                    {
                      "span_multi": {
                        "match": {
                          "fuzzy": {
                            "transcription_en": {
                              "fuzziness": "auto",
                              "value": "phrasw"
                            }
                          }
                        }
                      }
                    },
                    {
                      "span_multi": {
                        "match": {
                          "fuzzy": {
                            "transcription_en": {
                              "fuzziness": "auto",
                              "value": "matching"
                            }
                          }
                        }
                      }
                    }
                  ],
                  "in_order": true,
                  "slop": 0
                }
              }
            ]
          }
        }
      ]
    }
  }
}

3b. Response:

{
  "took" : 23,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 0.5753642,
    "hits" : [
      {
        "_index" : "test-index-1",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 0.5753642,
        "_source" : {
          "transcription_en" : "this is an example of a transcription string for fuzzy and wildcard phrase matching"
        },
        "highlight" : {
          "transcription_en" : [
            "this is an example of a transcription string for fuzzy and wildcard <em>phrase</em> <em>matching</em>"
          ]
        }
      }
    ]
  }
}

========================================

FAILING QUERIES - these still match the document but do NOT generate highlights

3c. Wildcard phrase match query:

GET test-index-1/_search
{
  "highlight": {
    "fields": {
      "transcription_en": {}
    },
    "number_of_fragments": 0
  },
  "query": {
    "bool": {
      "must": [
        {
          "bool": {
            "should": [
              {
                "span_near": {
                  "clauses": [
                    {
                      "span_multi": {
                        "match": {
                          "wildcard": {"transcription_en": {"value": "phras?"}}
                        }
                      }
                    },
                    {
                      "span_multi": {
                        "match": {
                          "wildcard": {"transcription_en": {"value": "matching"}}
                        }
                      }
                    }
                  ],
                  "in_order": true,
                  "slop": 0
                }
              }
            ]
          }
        }
      ]
    }
  }
}

3c. Response:

{
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 0.5753642,
    "hits" : [
      {
        "_index" : "test-index-1",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 0.5753642,
        "_source" : {
          "transcription_en" : "this is an example of a transcription string for fuzzy and wildcard phrase matching"
        }
      }
    ]
  }
}

3d. Alternate fuzzy phrase match query:

GET test-index-1/_search
{
  "highlight": {
    "fields": {
      "transcription_en": {}
    },
    "number_of_fragments": 0
  },
  "query": {
    "bool": {
      "must": [
        {
          "bool": {
            "should": [
              {
                "span_near": {
                  "clauses": [
                    {
                      "span_multi": {
                        "match": {
                          "fuzzy": {
                            "transcription_en": {
                              "fuzziness": "auto",
                              "value": "this"
                            }
                          }
                        }
                      }
                    },
                    {
                      "span_multi": {
                        "match": {
                          "fuzzy": {
                            "transcription_en": {
                              "fuzziness": "auto",
                              "value": "is"
                            }
                          }
                        }
                      }
                    },
                    {
                      "span_multi": {
                        "match": {
                          "fuzzy": {
                            "transcription_en": {
                              "fuzziness": "auto",
                              "value": "an"
                            }
                          }
                        }
                      }
                    },
                    {
                      "span_multi": {
                        "match": {
                          "fuzzy": {
                            "transcription_en": {
                              "fuzziness": "auto",
                              "value": "examplw"
                            }
                          }
                        }
                      }
                    }
                  ],
                  "in_order": true,
                  "slop": 0
                }
              }
            ] 
          }
        }
      ]
    }
  }
}

3d. Response:

{
  "took" : 7,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 1.1507283,
    "hits" : [
      {
        "_index" : "test-index-1",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 1.1507283,
        "_source" : {
          "transcription_en" : "this is an example of a transcription string for fuzzy and wildcard phrase matching"
        }
      }
    ]
  }
}

It seems that switching from the default unified highlighter to the plain highlighter (i.e. setting "type": "plain" in the highlight section) may resolve this issue, but I am concerned about the performance impact of that change given the number of documents I need to search.

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.