Highlight "best matches" using Elastic Highlighting API


(Alexander Evtushenko) #1

Hi all. Is it possible to highlight "best matches" using Elastic Highlighting API?
By "best match", I mean highlighting only the exactly matching part of a word, even if the whole word is matched by the query. For example:

  • document content is Dubai
  • search query is duba
  • and the desired result is <b>Duba</b>i

The problem lies in my query: it contains several clauses that use "fuzziness".
Here is the example index configuration:

PUT /highlight_best_match
{
  "settings": {
    "number_of_shards": "1",
    "number_of_replicas": "1",
    "analysis": {
      "filter": {
        "language_stemmer": {
          "name": "german2",
          "type": "stemmer"
        },
        "language_stopwords": {
          "type": "stop",
          "stopwords": "_german_"
        }
      },
      "char_filter": {
        "ampersand_to_and": {
          "type": "mapping",
          "mappings": [
            "&=> and "
          ]
        }
      },
      "analyzer": {
        "prefix_analyzer": {
          "type": "custom",
          "tokenizer": "edge_ngram_tokenizer",
          "filter": [
            "german_normalization",
            "lowercase"
          ]
        },
        "match_analyzer": {
          "char_filter": [
            "html_strip",
            "ampersand_to_and"
          ],
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "asciifolding",
            "language_stopwords",
            "language_stemmer"
          ]
        },
        "search_analyzer": {
          "type": "custom",
          "tokenizer": "keyword",
          "filter": [
            "german_normalization",
            "lowercase"
          ]
        }
      },
      "tokenizer": {
        "edge_ngram_tokenizer": {
          "type": "edge_ngram",
          "min_gram": "2",
          "max_gram": "20",
          "token_chars": [
            "letter",
            "digit"
          ]
        }
      }
    }
  },
  "mappings": {
    "default": {
      "dynamic": "false",
      "properties": {
        "id": {
          "type": "integer"
        },
        "title": {
          "type": "keyword",
          "fields": {
            "match": {
              "type": "text",
              "term_vector": "with_positions_offsets",
              "index_options": "offsets",
              "analyzer": "match_analyzer"
            },
            "prefix": {
              "type": "text",
              "term_vector": "with_positions_offsets",
              "index_options": "offsets",
              "analyzer": "prefix_analyzer",
              "search_analyzer": "search_analyzer"
            }
          }
        }
      }
    }
  }
}

and some data illustrating the example:

POST /_bulk
{"create": {"_id": "1", "_index": "highlight_best_match", "_type": "default"} }
{"title": "Dubai"}
{"create": {"_id": "2", "_index": "highlight_best_match", "_type": "default"} }
{"title": "Dumai"}
{"create": {"_id": "3", "_index": "highlight_best_match", "_type": "default"} }
{"title": "Cuba"}
{"create": {"_id": "4", "_index": "highlight_best_match", "_type": "default"} }
{"title": "Kuba Südküste"}
{"create": {"_id": "5", "_index": "highlight_best_match", "_type": "default"} }
{"title": "Dubai Kreuzfahrt"}

The query is

GET /highlight_best_match/_search
{
  "query": {
    "bool": {
      "must": {
        "bool": {
          "should": [
            {
              "match": {
                "title.prefix": {
                  "query": "duba",
                  "fuzziness": 1,
                  "boost": 1
                }
              }
            },
            {
              "match": {
                "title.match": {
                  "query": "duba",
                  "fuzziness": 1,
                  "boost": 1
                }
              }
            }
          ]
        }
      },
      "should": [
        {
          "match_phrase_prefix": {
            "title.match": {
              "query": "duba",
              "boost": 5
            }
          }
        },
        {
          "match": {
            "title.prefix": {
              "query": "duba",
              "fuzziness": 0,
              "boost": 3
            }
          }
        },
        {
          "match": {
            "title.match": {
              "query": "duba",
              "fuzziness": 0,
              "boost": 10
            }
          }
        }
      ]
    }
  },
  "highlight": {
    "encoder": "plain",
    "order": "score",
    "pre_tags": [
      "<b>"
    ],
    "post_tags": [
      "</b>"
    ],
    "fields": {
      "title.prefix": {
        "type": "fvh",
        "matched_fields": [
          "title.match",
          "title.prefix"
        ]
      }
    }
  }
}

and the result is

{
  "took": 4,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 5,
    "max_score": 11.402948,
    "hits": [
      {
        "_index": "highlight_best_match",
        "_type": "default",
        "_id": "1",
        "_score": 11.402948,
        "_source": {
          "title": "Dubai"
        },
        "highlight": {
          "title.prefix": [
            "<b>Dubai</b>"
          ]
        }
      },
      {
        "_index": "highlight_best_match",
        "_type": "default",
        "_id": "5",
        "_score": 6.812179,
        "_source": {
          "title": "Dubai Kreuzfahrt"
        },
        "highlight": {
          "title.prefix": [
            "<b>Dubai</b> Kreuzfahrt"
          ]
        }
      },
      {
        "_index": "highlight_best_match",
        "_type": "default",
        "_id": "3",
        "_score": 1.5331156,
        "_source": {
          "title": "Cuba"
        },
        "highlight": {
          "title.prefix": [
            "<b>Cuba</b>"
          ]
        }
      },
      {
        "_index": "highlight_best_match",
        "_type": "default",
        "_id": "4",
        "_score": 1.0343978,
        "_source": {
          "title": "Kuba Südküste"
        },
        "highlight": {
          "title.prefix": [
            "<b>Kuba</b> Südküste"
          ]
        }
      },
      {
        "_index": "highlight_best_match",
        "_type": "default",
        "_id": "2",
        "_score": 0.7896109,
        "_source": {
          "title": "Dumai"
        },
        "highlight": {
          "title.prefix": [
            "<b>Duma</b>i"
          ]
        }
      }
    ]
  }
}

Please take a look at the results with ID = 1 and ID = 5. Is it possible to highlight only "Duba" there (i.e. <b>Duba</b>i), in the same way that only part of the word is highlighted in <b>Duma</b>i in the result with ID = 2?

I know that I can set highlight_query to use only the exact-match query.
But I'd like results to be highlighted in every case, with exact matches preferred whenever they exist.

Thank you in advance!


(system) closed #2

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.