I'm using shingles in our index so that multi-word terms still match when the search is entered as a single joined word, e.g. so that 'giantbomb' matches a field containing the term 'giant bomb'.
This is working fine, but because I shingle the whole field, the highlighter also matches the shingle tokens and highlights words that were not part of the query (for example, searching for 'giant' highlights all of 'giant bomb'). Is there a way to tell the highlighter to match only on tokens that were not generated by the shingle filter?
Example:
DELETE /test
PUT /test
{
"settings": {
"index": {
"number_of_replicas": 0,
"analysis": {
"filter": {
"shingle_join": {
"max_shingle_size": "2",
"token_separator": "",
"output_unigrams": "true",
"type": "shingle",
"min_shingle_size": "2"
},
"autocomplete_filter": {
"type": "edge_ngram",
"min_gram": "1",
"max_gram": "20"
}
},
"analyzer": {
"autocomplete": {
"filter": [
"lowercase",
"shingle_join",
"autocomplete_filter"
],
"type": "custom",
"tokenizer": "standard"
}
}
}
}
},
"mappings": {
"test":{
"properties": {
"test":{
"type":"string",
"analyzer": "autocomplete"
}
}
}
}
}
PUT /test/test/1
{
"test":"giant bomb"
}
GET /test/test/_search
{
"query":{
"match":{
"test":{
"query":"giant"
}
}
},
"highlight": {
"fields": {
"test":{}
}
}
}
Output:
{
"took": 16334,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.60692424,
"hits": [
{
"_index": "test",
"_type": "test",
"_id": "1",
"_score": 0.60692424,
"_source": {
"test": "giant bomb"
},
"highlight": {
"test": [
"<em>giant bomb</em>"
]
}
}
]
}
}
Desired output:
{
"took": 16334,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.60692424,
"hits": [
{
"_index": "test",
"_type": "test",
"_id": "1",
"_score": 0.60692424,
"_source": {
"test": "giant bomb"
},
"highlight": {
"test": [
"<em>giant</em> bomb"
]
}
}
]
}
}