We are dealing with HTML content stored in Elasticsearch, and our task is to search it and highlight the matched text.
The issue occurs with text like this: <span>Hello</span>World
When we search for the word HelloWorld, the response looks like this:
<span><hi>Hello</span>World</hi>
The highlight tags cross the boundary of the <span> element. I understand that this situation might be difficult to resolve.
Please see my mapping config below:
{
"settings": {
"analysis": {
"tokenizer": {
"ngram_tokenizer": {
"type": "ngram",
"min_gram": 2,
"max_gram": 30,
"token_chars": ["letter", "digit"]
}
},
"filter": {
"my_synonym_filter": {
"type": "synonym",
"synonyms": [
"english,british",
"usa,united states of america,us"
]
}
},
"char_filter": {
"my_html_filter": {
"type": "html_strip"
}
},
"analyzer": {
"my_analyzer": {
"tokenizer": "ngram_tokenizer",
"char_filter": ["my_html_filter"],
"filter": [
"lowercase",
"asciifolding",
"my_synonym_filter"
]
}
}
}
},
"mappings": {
"doc": {
"properties": {
"text": {
"type": "text",
"analyzer": "my_analyzer",
"search_analyzer": "standard"
}
}
}
}
}
Indexing the document (type "doc", index "test"):
{
"text": "<span>Hello</span>World"
}
Search:
{
"query": {
"multi_match": {
"fields": ["text"],
"query": "helloworld"
}
},
"highlight" : {
"type": "unified",
"require_field_match": false,
"number_of_fragments": 0,
"fields" : {
"*" : { "pre_tags" : ["<hi>"], "post_tags" : ["</hi>"] }
}
}
}
Result:
{
"took": 7,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.29032138,
"hits": [
{
"_index": "test",
"_type": "doc",
"_id": "1",
"_score": 0.29032138,
"_source": {
"text": "<span>Hello</span>World"
},
"highlight": {
"text": [
"<span><hi>Hello</span>World</hi>"
]
}
}
]
}
}
As you can see, the highlight tags and the HTML tags overlap in the result.
What is the best way to resolve this so that the tags do not overlap and the result is properly nested, like this?
<span><hi>Hello</hi></span><hi>World</hi>
Thanks!