Ngram highlighting differences between 5.3.0 and 6.2.3

Whilst switching my application to 6.2.3 I noticed some differences related to highlighting using ngram.

I have a document with this content:
http://subdomain.domain.tld/path/script.extension?query=param#hash
and various cases where I match into it via ngrams.

For example, when I try to match query:

  • in 5.3.0 I got back
    'http:&#x2F;&#x2F;subdomain.domain.tld&#x2F;path&#x2F;script.extension?<span class="highlight">query</span>=param#hash'
  • in 6.2.3 I get back
    '<span class="highlight">query</span>=param#hash'

As you can see, everything up to and including "…extension?" is missing.

I have other test cases where the result differs between the two versions, but also cases where it is the same. In every case I'm performing a "query": "<searchtoken>" match:

  • works the same: http, subdomain
  • works differently: param, hash, http://subdomain.domain.tld

Below is a self-contained reproducible case for 5.3.0 and 6.2.3. I have the feeling I'm missing something: I went through https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-highlighting.html but couldn't spot it.

Reproducible case for 5.3.0

Create index:

curl --silent -XPUT http://localhost:9200/test1 -d '
{
  "mappings": {
    "_default_": {
      "dynamic": "strict"
    },
    "document": {
      "_all": {
        "enabled": false
      },
      "properties": {
        "message": {
          "type": "text",
          "term_vector": "with_positions_offsets",
          "fields": {
            "ngram": {
              "type": "text",
              "term_vector": "with_positions_offsets",
              "analyzer": "standard_ngram",
              "search_analyzer": "standard"
            }
          }
        }
      }
    }
  },
  "settings": {
    "index": {
      "refresh_interval": "1s",
      "analysis": {
        "filter": {
          "custom_ngram": {
            "type": "ngram",
            "min_gram": 3,
            "max_gram": 32
          }
        },
        "tokenizer": {
          "autocomplete": {
            "type": "edge_ngram",
            "min_gram": 1,
            "max_gram": 15
          }
        },
        "analyzer": {
          "standard_ngram": {
            "type": "custom",
            "tokenizer": "standard",
            "filter": [
              "standard",
              "lowercase",
              "custom_ngram"
            ]
          },
          "index_autocomplete": {
            "type": "custom",
            "tokenizer": "autocomplete",
            "filter": "lowercase"
          }
        }
      }
    }
  }
}
' |  jq .

Index document:

curl --silent -XPOST http://localhost:9200/test1/document/1 -d '
{
  "message": "http://subdomain.domain.tld/path/script.extension?query=param#hash"
}
' | jq .

Search:

curl --silent -XPOST http://localhost:9200/test1/document/_search -d '
{
  "query": {
    "bool": {
      "should": [
        {
          "multi_match": {
            "query": "query",
            "fields": [
              "message.ngram"
            ]
          }
        }
      ]
    }
  },
  "highlight": {
    "pre_tags": [
      "<span class=\"highlight\">"
    ],
    "post_tags": [
      "</span>"
    ],
    "encoder": "html",
    "fields": {
      "message": {},
      "message.ngram": {}
    }
  }
}
' | jq .

Result

{
  "hits": {
    "hits": [
      {
        "highlight": {
          "message.ngram": [
            "http:&#x2F;&#x2F;subdomain.domain.tld&#x2F;path&#x2F;script.extension?<span class=\"highlight\">query</span>=param#hash"
          ]
        },
        "_source": {
          "message": "http://subdomain.domain.tld/path/script.extension?query=param#hash"
        },
        "_score": 0.4789082,
        "_id": "1",
        "_type": "document",
        "_index": "test1"
      }
    ],
    "max_score": 0.4789082,
    "total": 1
  },
  "_shards": {
    "failed": 0,
    "successful": 5,
    "total": 5
  },
  "timed_out": false,
  "took": 26
}

Reproducible case for 6.2.3

Create index

curl --silent -XPUT -H 'Content-Type: application/json' http://localhost:9200/test1 -d '
{
  "mappings": {
    "_doc": {
      "dynamic": "strict",
      "properties": {
        "message": {
          "type": "text",
          "term_vector": "with_positions_offsets",
          "fields": {
            "ngram": {
              "type": "text",
              "term_vector": "with_positions_offsets",
              "analyzer": "standard_ngram",
              "search_analyzer": "standard"
            }
          }
        }
      }
    }
  },
  "settings": {
    "index": {
      "refresh_interval": "1s",
      "analysis": {
        "filter": {
          "custom_ngram": {
            "type": "ngram",
            "min_gram": 3,
            "max_gram": 32
          }
        },
        "tokenizer": {
          "autocomplete": {
            "type": "edge_ngram",
            "min_gram": 1,
            "max_gram": 15
          }
        },
        "analyzer": {
          "standard_ngram": {
            "type": "custom",
            "tokenizer": "standard",
            "filter": [
              "standard",
              "lowercase",
              "custom_ngram"
            ]
          },
          "index_autocomplete": {
            "type": "custom",
            "tokenizer": "autocomplete",
            "filter": "lowercase"
          }
        }
      },
      "max_ngram_diff": 29
    }
  }
}
' |  jq .

Index document

curl --silent -XPOST -H 'Content-Type: application/json' http://localhost:9200/test1/_doc/1 -d '
{
"message": "http://subdomain.domain.tld/path/script.extension?query=param#hash"
}
' | jq .

Search

curl --silent -XPOST -H 'Content-Type: application/json' http://localhost:9200/test1/_doc/_search -d '
{
  "query": {
    "bool": {
      "should": [
        {
          "multi_match": {
            "query": "query",
            "fields": [
              "message.ngram"
            ]
          }
        }
      ]
    }
  },
  "highlight": {
    "force_source": true,
    "pre_tags": [
      "<span class=\"highlight\">"
    ],
    "post_tags": [
      "</span>"
    ],
    "encoder": "html",
    "fields": {
      "message": {},
      "message.ngram": {}
    }
  }
}
' | jq .

Result

{
  "hits": {
    "hits": [
      {
        "highlight": {
          "message.ngram": [
            "<span class=\"highlight\">query</span>=param#hash"
          ]
        },
        "_source": {
          "message": "http://subdomain.domain.tld/path/script.extension?query=param#hash"
        },
        "_score": 0.47903025,
        "_id": "1",
        "_type": "_doc",
        "_index": "test1"
      }
    ],
    "max_score": 0.47903025,
    "total": 1
  },
  "_shards": {
    "failed": 0,
    "skipped": 0,
    "successful": 5,
    "total": 5
  },
  "timed_out": false,
  "took": 6
}

thanks,

  • Markus

This is because 6.x defaults to the unified highlighter, which breaks the text into sentences to find relevant snippets. If you want to disable the sentence splitting, set number_of_fragments to 0; this returns the full content of the field with the highlighting applied. Alternatively, you can force the fvh highlighter (set "type": "fvh" in the highlight section of the request) if you want to get the 5.x behavior back.

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.