While upgrading my application from Elasticsearch 5.3.0 to 6.2.3, I noticed some differences in highlighting on an ngram-analyzed field.
I have a document with the following content:
http://subdomain.domain.tld/path/script.extension?query=param#hash
and I have various cases where I match into it via the ngram sub-field.
For example, when I try to match "query":
- in 5.3.0 I get back
'http://subdomain.domain.tld/path/script.extension?<span class="highlight">query</span>=param#hash'
- in 6.2.3 I get back
'<span class="highlight">query</span>=param#hash'
As you can see, the 6.2.3 result is missing everything up to and including "…extension?".
I have other test cases where the result differs between the two versions, but also cases where it is the same in both. In every case I'm performing a "query": "<searchtoken>" match:
- works the same:
http, subdomain
- works differently:
param, hash, http://subdomain.domain.tld
Below is a self-contained reproducible case for both 5.3.0 and 6.2.3. I have the feeling I'm missing something: I went through https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-highlighting.html but couldn't spot it.
Reproducible case for 5.3.0
Create index:
curl --silent -XPUT http://localhost:9200/test1 -d '
{
"mappings": {
"_default_": {
"dynamic": "strict"
},
"document": {
"_all": {
"enabled": false
},
"properties": {
"message": {
"type": "text",
"term_vector": "with_positions_offsets",
"fields": {
"ngram": {
"type": "text",
"term_vector": "with_positions_offsets",
"analyzer": "standard_ngram",
"search_analyzer": "standard"
}
}
}
}
}
},
"settings": {
"index": {
"refresh_interval": "1s",
"analysis": {
"filter": {
"custom_ngram": {
"type": "ngram",
"min_gram": 3,
"max_gram": 32
}
},
"tokenizer": {
"autocomplete": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 15
}
},
"analyzer": {
"standard_ngram": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"standard",
"lowercase",
"custom_ngram"
]
},
"index_autocomplete": {
"type": "custom",
"tokenizer": "autocomplete",
"filter": "lowercase"
}
}
}
}
}
}
' | jq .
Index document:
curl --silent -XPOST http://localhost:9200/test1/document/1 -d '
{
"message": "http://subdomain.domain.tld/path/script.extension?query=param#hash"
}
' | jq .
Search:
curl --silent -XPOST http://localhost:9200/test1/document/_search -d '
{
"query": {
"bool": {
"should": [
{
"multi_match": {
"query": "query",
"fields": [
"message.ngram"
]
}
}
]
}
},
"highlight": {
"pre_tags": [
"<span class=\"highlight\">"
],
"post_tags": [
"</span>"
],
"encoder": "html",
"fields": {
"message": {},
"message.ngram": {}
}
}
}
' | jq .
Result
{
"hits": {
"hits": [
{
"highlight": {
"message.ngram": [
"http://subdomain.domain.tld/path/script.extension?<span class=\"highlight\">query</span>=param#hash"
]
},
"_source": {
"message": "http://subdomain.domain.tld/path/script.extension?query=param#hash"
},
"_score": 0.4789082,
"_id": "1",
"_type": "document",
"_index": "test1"
}
],
"max_score": 0.4789082,
"total": 1
},
"_shards": {
"failed": 0,
"successful": 5,
"total": 5
},
"timed_out": false,
"took": 26
}
Reproducible case for 6.2.3
Create index
curl --silent -XPUT -H 'Content-Type: application/json' http://localhost:9200/test1 -d '
{
"mappings": {
"_doc": {
"dynamic": "strict",
"properties": {
"message": {
"type": "text",
"term_vector": "with_positions_offsets",
"fields": {
"ngram": {
"type": "text",
"term_vector": "with_positions_offsets",
"analyzer": "standard_ngram",
"search_analyzer": "standard"
}
}
}
}
}
},
"settings": {
"index": {
"refresh_interval": "1s",
"analysis": {
"filter": {
"custom_ngram": {
"type": "ngram",
"min_gram": 3,
"max_gram": 32
}
},
"tokenizer": {
"autocomplete": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 15
}
},
"analyzer": {
"standard_ngram": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"standard",
"lowercase",
"custom_ngram"
]
},
"index_autocomplete": {
"type": "custom",
"tokenizer": "autocomplete",
"filter": "lowercase"
}
}
},
"max_ngram_diff": 29
}
}
}
' | jq .
Index document
curl --silent -XPOST -H 'Content-Type: application/json' http://localhost:9200/test1/_doc/1 -d '
{
"message": "http://subdomain.domain.tld/path/script.extension?query=param#hash"
}
' | jq .
Search
curl --silent -XPOST -H 'Content-Type: application/json' http://localhost:9200/test1/_doc/_search -d '
{
"query": {
"bool": {
"should": [
{
"multi_match": {
"query": "query",
"fields": [
"message.ngram"
]
}
}
]
}
},
"highlight": {
"force_source": true,
"pre_tags": [
"<span class=\"highlight\">"
],
"post_tags": [
"</span>"
],
"encoder": "html",
"fields": {
"message": {},
"message.ngram": {}
}
}
}
' | jq .
Result
{
"hits": {
"hits": [
{
"highlight": {
"message.ngram": [
"<span class=\"highlight\">query</span>=param#hash"
]
},
"_source": {
"message": "http://subdomain.domain.tld/path/script.extension?query=param#hash"
},
"_score": 0.47903025,
"_id": "1",
"_type": "_doc",
"_index": "test1"
}
],
"max_score": 0.47903025,
"total": 1
},
"_shards": {
"failed": 0,
"skipped": 0,
"successful": 5,
"total": 5
},
"timed_out": false,
"took": 6
}
thanks,
- Markus