Ignore shingle tokens when highlighting


(Ben Slinger) #1

I'm using shingles in our index so that multi-word terms can match when a search uses the joined version of the words, e.g. to allow 'giantbomb' to match a field with the term 'giant bomb' in it.

This is working fine, but because I shingle the whole field, the highlighter is matching the shingles and highlighting additional words. Is there a way to tell the highlighter to only match on tokens that aren't generated by the shingle filter?

Example:

DELETE /test

PUT /test
{
  "settings": {
    "index": {
      "number_of_replicas": 0,
      "analysis": {
          "filter": {
            "shingle_join": {
              "max_shingle_size": "2",
              "token_separator": "",
              "output_unigrams": "true",
              "type": "shingle",
              "min_shinge_size": "2"
            },
            "autocomplete_filter": {
              "type": "edge_ngram",
              "min_gram": "1",
              "max_gram": "20"
            }
          },
          "analyzer": {
            "autocomplete": {
              "filter": [
                "lowercase",
                "shingle_join",
                "autocomplete_filter"
              ],
              "type": "custom",
              "tokenizer": "standard"
            }
          }
      }
    }
  },
  "mappings": {
    "test":{
      "properties": {
        "test":{
          "type":"string",
          "analyzer": "autocomplete"
        }
      }
    }
  }
}

PUT /test/test/1
{
  "test":"giant bomb"
}

GET /test/test/_search
{
  "query":{
    "match":{
      "test":{
        "query":"giant"
      }
    }
  },
  "highlight": {
    "fields": {
      "test":{}
    }
  }
}

Output:

    {
      "took": 16334,
      "timed_out": false,
      "_shards": {
        "total": 5,
        "successful": 5,
        "failed": 0
      },
      "hits": {
        "total": 1,
        "max_score": 0.60692424,
        "hits": [
          {
            "_index": "test",
            "_type": "test",
            "_id": "1",
            "_score": 0.60692424,
            "_source": {
              "test": "giant bomb"
            },
            "highlight": {
              "test": [
                "<em>giant bomb</em>"
              ]
            }
          }
        ]
      }
    }

Desired output:

{
  "took": 16334,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "hits": {
    "total": 1,
    "max_score": 0.60692424,
    "hits": [
      {
        "_index": "test",
        "_type": "test",
        "_id": "1",
        "_score": 0.60692424,
        "_source": {
          "test": "giant bomb"
        },
        "highlight": {
          "test": [
            "<em>giant</em> bomb"
          ]
        }
      }
    ]
  }
}

(Simon Willnauer) #2

Did you try a highlight_query (see https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-highlighting.html#highlighting-settings)? That might help here. If you run into issues, please ping back. You might also need to set min_shingle_size to 1, otherwise it won't match a single value.


(Ben Slinger) #3

Hi Simon,

I get tokens for the single words because I'm using "output_unigrams":"true", right? Here are the tokens for that term using that analyzer:

{
  "tokens": [
    {
      "token": "g",
      "start_offset": 0,
      "end_offset": 5,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "gi",
      "start_offset": 0,
      "end_offset": 5,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "gia",
      "start_offset": 0,
      "end_offset": 5,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "gian",
      "start_offset": 0,
      "end_offset": 5,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "giant",
      "start_offset": 0,
      "end_offset": 5,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "g",
      "start_offset": 0,
      "end_offset": 10,
      "type": "shingle",
      "position": 0,
      "positionLength": 2
    },
    {
      "token": "gi",
      "start_offset": 0,
      "end_offset": 10,
      "type": "shingle",
      "position": 0,
      "positionLength": 2
    },
    {
      "token": "gia",
      "start_offset": 0,
      "end_offset": 10,
      "type": "shingle",
      "position": 0,
      "positionLength": 2
    },
    {
      "token": "gian",
      "start_offset": 0,
      "end_offset": 10,
      "type": "shingle",
      "position": 0,
      "positionLength": 2
    },
    {
      "token": "giant",
      "start_offset": 0,
      "end_offset": 10,
      "type": "shingle",
      "position": 0,
      "positionLength": 2
    },
    {
      "token": "giantb",
      "start_offset": 0,
      "end_offset": 10,
      "type": "shingle",
      "position": 0,
      "positionLength": 2
    },
    {
      "token": "giantbo",
      "start_offset": 0,
      "end_offset": 10,
      "type": "shingle",
      "position": 0,
      "positionLength": 2
    },
    {
      "token": "giantbom",
      "start_offset": 0,
      "end_offset": 10,
      "type": "shingle",
      "position": 0,
      "positionLength": 2
    },
    {
      "token": "giantbomb",
      "start_offset": 0,
      "end_offset": 10,
      "type": "shingle",
      "position": 0,
      "positionLength": 2
    },
    {
      "token": "b",
      "start_offset": 6,
      "end_offset": 10,
      "type": "<ALPHANUM>",
      "position": 1
    },
    {
      "token": "bo",
      "start_offset": 6,
      "end_offset": 10,
      "type": "<ALPHANUM>",
      "position": 1
    },
    {
      "token": "bom",
      "start_offset": 6,
      "end_offset": 10,
      "type": "<ALPHANUM>",
      "position": 1
    },
    {
      "token": "bomb",
      "start_offset": 6,
      "end_offset": 10,
      "type": "<ALPHANUM>",
      "position": 1
    }
  ]
}

I did try the highlight_query but I'm not sure how to get it to ignore certain tokens - trying the following gets me the same highlight results as in my original post:

GET /test/test/_search
{
  "query":{
    "match":{
      "test":{
        "query":"giant"
      }
    }
  },
  "highlight": {
    "fields": {
      "test":{}
    },
    "highlight_query":{
      "match":{
        "test":{
          "query":"giant"
        }
      }
    }
  }
}

(Ben Slinger) #4

Just to add to this, I tried creating a separate field that isn't using shingles so I could have the highlight_query use that instead - this gave the desired results in certain situations, but not in others.

Here are my new mappings:

PUT /test
{
  "settings": {
    "index": {
      "number_of_replicas": 0,
      "analysis": {
          "filter": {
            "shingle_join": {
              "max_shingle_size": "2",
              "token_separator": "",
              "output_unigrams": "true",
              "type": "shingle",
              "min_shingle_size": "2"
            },
            "autocomplete_filter": {
              "type": "edge_ngram",
              "min_gram": "1",
              "max_gram": "20"
            }
          },
          "analyzer": {
            "autocomplete": {
              "filter": [
                "lowercase",
                "autocomplete_filter"
              ],
              "type": "custom",
              "tokenizer": "standard"
            },
            "shingle": {
              "filter": [
                "lowercase",
                "shingle_join"
              ],
              "type": "custom",
              "tokenizer": "standard"
            }
          }
      }
    }
  },
  "mappings": {
    "test":{
      "properties": {
        "test":{
          "type":"string",
          "analyzer": "shingle",
          "fields":{
            "autocomplete": {
              "type":"string",
              "analyzer": "autocomplete"
            }
          }
        }
      }
    }
  }
}

And my new query:

GET /test/test/_search
{
  "query":{
    "match":{
      "test.autocomplete":{
        "query":"giantbo"
      }
    }
  },
  "highlight": {
    "fields": {
      "test":{}
    },
    "highlight_query":{
      "match":{
        "test":{
          "query":"giantbo"
        }
      }
    }
  }
}

This works correctly if I don't require it to match on the nGrams created by the "autocomplete" analyzer - the search terms "giant", "giant bomb" and "giantbomb" all work correctly, but searching for "giantbo" as above matches the main query and does not match the highlight query, so nothing is highlighted.


(Simon Willnauer) #5

I think the problem in your case is due to the fact that we have to preserve the offsets during indexing, and if you choose a different indexing strategy, stuff like the compounding / de-compounding won't really work. I think you have to make a decision with a certain tradeoff. Not all of the possible highlights that you are looking for will work.


(system) #6

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.