Hi,
One possible way to check this is to use the _termvectors API, which tells you about the indexed terms for a specific document. For the "Just A Dream" doc, for example, I get:
{
"_index" : "my_index",
"_type" : "_doc",
"_id" : "2",
"_version" : 1,
"found" : true,
"took" : 1,
"term_vectors" : {
"my_field._3gram" : {
"field_statistics" : {
"sum_doc_freq" : 5,
"doc_count" : 4,
"sum_ttf" : 5
},
"terms" : {
"just a dream" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 12
}
]
}
}
},
"my_field._2gram" : {
"field_statistics" : {
"sum_doc_freq" : 11,
"doc_count" : 6,
"sum_ttf" : 11
},
"terms" : {
"a dream" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 5,
"end_offset" : 12
}
]
},
"just a" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 6
}
]
}
}
},
"my_field._index_prefix" : {
"field_statistics" : {
"sum_doc_freq" : 181,
"doc_count" : 9,
"sum_ttf" : 183
},
"terms" : {
"a" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 5,
"end_offset" : 12
}
]
},
"a " : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 5,
"end_offset" : 12
}
]
},
"a d" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 5,
"end_offset" : 12
}
]
},
"a dr" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 5,
"end_offset" : 12
}
]
},
"a dre" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 5,
"end_offset" : 12
}
]
},
"a drea" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 5,
"end_offset" : 12
}
]
},
"a dream" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 5,
"end_offset" : 12
}
]
},
"a dream " : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 5,
"end_offset" : 12
}
]
},
"d" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 2,
"start_offset" : 7,
"end_offset" : 12
}
]
},
"dr" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 2,
"start_offset" : 7,
"end_offset" : 12
}
]
},
"dre" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 2,
"start_offset" : 7,
"end_offset" : 12
}
]
},
"drea" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 2,
"start_offset" : 7,
"end_offset" : 12
}
]
},
"dream" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 2,
"start_offset" : 7,
"end_offset" : 12
}
]
},
"dream " : {
"term_freq" : 1,
"tokens" : [
{
"position" : 2,
"start_offset" : 7,
"end_offset" : 12
}
]
},
"dream  " : {
"term_freq" : 1,
"tokens" : [
{
"position" : 2,
"start_offset" : 7,
"end_offset" : 12
}
]
},
"j" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 12
}
]
},
"ju" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 12
}
]
},
"jus" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 12
}
]
},
"just" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 12
}
]
},
"just " : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 12
}
]
},
"just a" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 12
}
]
},
"just a " : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 12
}
]
},
"just a d" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 12
}
]
},
"just a dr" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 12
}
]
},
"just a dre" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 12
}
]
},
"just a drea" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 12
}
]
},
"just a dream" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 12
}
]
}
}
},
"my_field" : {
"field_statistics" : {
"sum_doc_freq" : 20,
"doc_count" : 9,
"sum_ttf" : 20
},
"terms" : {
"a" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 5,
"end_offset" : 6
}
]
},
"dream" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 2,
"start_offset" : 7,
"end_offset" : 12
}
]
},
"just" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 4
}
]
}
}
}
}
}
In order to include e.g. exact matches and rank them higher, you can probably explore ways of combining several queries in the should clause of a bool query, or something along those lines. Getting exactly the ranking you proposed above might be a bit fiddly, but for example this query gives me exact "dre" matches scored higher than the prefix matches alone:
GET my_index/_search
{
"query": {
"bool": {
"should": [
{
"multi_match": {
"query": "dre",
"type": "bool_prefix",
"fuzziness": 2,
"fields": [
"my_field",
"my_field._2gram",
"my_field._3gram",
"my_field._index_prefix"
]
}
},
{
"match_phrase": {
"my_field": "dre"
}
}
]
}
}
}
As for the possibility of "infix" matches, like returning "Balladream" for "dream", I'm not entirely sure: the documentation mentions something along those lines but is short on details. I will do some digging; in the meantime, I hope the above pointers will bring you a bit further towards your desired goal.