Hi all. Is it possible to highlight "best matches" using the Elasticsearch highlighting API?
By "best match", I mean highlighting only the exactly matching part of a word, even if the whole word is matched by the query. For example:
- document content is
Dubai
- search query is
duba
- and the desired result is
<b>Duba</b>i
The problem lies in my query, which contains several clauses that use "fuzziness".
Here is the example index configuration:
PUT /highlight_best_match
{
"settings": {
"number_of_shards": "1",
"number_of_replicas": "1",
"analysis": {
"filter": {
"language_stemmer": {
"name": "german2",
"type": "stemmer"
},
"language_stopwords": {
"type": "stop",
"stopwords": "_german_"
}
},
"char_filter": {
"ampersand_to_and": {
"type": "mapping",
"mappings": [
"&=> and "
]
}
},
"analyzer": {
"prefix_analyzer": {
"type": "custom",
"tokenizer": "edge_ngram_tokenizer",
"filter": [
"german_normalization",
"lowercase"
]
},
"match_analyzer": {
"char_filter": [
"html_strip",
"ampersand_to_and"
],
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"asciifolding",
"language_stopwords",
"language_stemmer"
]
},
"search_analyzer": {
"type": "custom",
"tokenizer": "keyword",
"filter": [
"german_normalization",
"lowercase"
]
}
},
"tokenizer": {
"edge_ngram_tokenizer": {
"type": "edge_ngram",
"min_gram": "2",
"max_gram": "20",
"token_chars": [
"letter",
"digit"
]
}
}
}
},
"mappings": {
"default": {
"dynamic": "false",
"properties": {
"id": {
"type": "integer"
},
"title": {
"type": "keyword",
"fields": {
"match": {
"type": "text",
"term_vector": "with_positions_offsets",
"index_options": "offsets",
"analyzer": "match_analyzer"
},
"prefix": {
"type": "text",
"term_vector": "with_positions_offsets",
"index_options": "offsets",
"analyzer": "prefix_analyzer",
"search_analyzer": "search_analyzer"
}
}
}
}
}
}
}
and some data illustrating the example:
POST /_bulk
{"create": {"_id": "1", "_index": "highlight_best_match", "_type": "default"} }
{"title": "Dubai"}
{"create": {"_id": "2", "_index": "highlight_best_match", "_type": "default"} }
{"title": "Dumai"}
{"create": {"_id": "3", "_index": "highlight_best_match", "_type": "default"} }
{"title": "Cuba"}
{"create": {"_id": "4", "_index": "highlight_best_match", "_type": "default"} }
{"title": "Kuba Südküste"}
{"create": {"_id": "5", "_index": "highlight_best_match", "_type": "default"} }
{"title": "Dubai Kreuzfahrt"}
The query is
GET /highlight_best_match/_search
{
"query": {
"bool": {
"must": {
"bool": {
"should": [
{
"match": {
"title.prefix": {
"query": "duba",
"fuzziness": 1,
"boost": 1
}
}
},
{
"match": {
"title.match": {
"query": "duba",
"fuzziness": 1,
"boost": 1
}
}
}
]
}
},
"should": [
{
"match_phrase_prefix": {
"title.match": {
"query": "duba",
"boost": 5
}
}
},
{
"match": {
"title.prefix": {
"query": "duba",
"fuzziness": 0,
"boost": 3
}
}
},
{
"match": {
"title.match": {
"query": "duba",
"fuzziness": 0,
"boost": 10
}
}
}
]
}
},
"highlight": {
"encoder": "plain",
"order": "score",
"pre_tags": [
"<b>"
],
"post_tags": [
"</b>"
],
"fields": {
"title.prefix": {
"type": "fvh",
"matched_fields": [
"title.match",
"title.prefix"
]
}
}
}
}
and the result is
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 5,
"max_score": 11.402948,
"hits": [
{
"_index": "highlight_best_match",
"_type": "default",
"_id": "1",
"_score": 11.402948,
"_source": {
"title": "Dubai"
},
"highlight": {
"title.prefix": [
"<b>Dubai</b>"
]
}
},
{
"_index": "highlight_best_match",
"_type": "default",
"_id": "5",
"_score": 6.812179,
"_source": {
"title": "Dubai Kreuzfahrt"
},
"highlight": {
"title.prefix": [
"<b>Dubai</b> Kreuzfahrt"
]
}
},
{
"_index": "highlight_best_match",
"_type": "default",
"_id": "3",
"_score": 1.5331156,
"_source": {
"title": "Cuba"
},
"highlight": {
"title.prefix": [
"<b>Cuba</b>"
]
}
},
{
"_index": "highlight_best_match",
"_type": "default",
"_id": "4",
"_score": 1.0343978,
"_source": {
"title": "Kuba Südküste"
},
"highlight": {
"title.prefix": [
"<b>Kuba</b> Südküste"
]
}
},
{
"_index": "highlight_best_match",
"_type": "default",
"_id": "2",
"_score": 0.7896109,
"_source": {
"title": "Dumai"
},
"highlight": {
"title.prefix": [
"<b>Duma</b>i"
]
}
}
]
}
}
Please take a look at the results with ID = 1 and ID = 5. Is it possible to highlight only the matching part, duba,
there (i.e. <b>Duba</b>i), in the same way that only part of the word is highlighted (<b>Duma</b>i)
in the result with ID = 2?
I know that I can set highlight_query
so that it contains only the exact-match query.
However, I'd like every result to be highlighted in any case, with exact matches preferred whenever they exist.
Thank you in advance!