Hello,
Since upgrading from 7.9 to 7.10, ES tokenizes "search_as_you_type" fields that use the "simple" analyzer differently, causing fewer results to be returned. Although this analyzer should split on any non-letter character, it has stopped splitting on dots:
PUT origin/
PUT origin/_mapping
{
"properties": {
"url": {
"type": "text",
"analyzer": "simple",
"fields": {
"as_you_type": {
"type": "search_as_you_type",
"analyzer": "simple"
}
}
}
}
}
POST origin/_doc/
{"url": "http://barbaz.qux"}
POST origin/_search?pretty
{"query": {"bool": {"must": [{"multi_match": {"query": "qu", "type": "bool_prefix", "operator": "and", "fields": ["url.as_you_type", "url.as_you_type._2gram", "url.as_you_type._3gram"]}}]}}}
With ES 7.10, this returns:
{
"took" : 35,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
}
}
while earlier versions would return:
{
"took" : 39,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "origin",
"_type" : "_doc",
"_id" : "hBlj5XUBA6F2j0NP1UH9",
"_score" : 1.0,
"_source" : {
"url" : "http://barbaz.qux"
}
}
]
}
}
Running:
GET origin/_termvectors/<id>
shows a difference in tokenization between the two versions:
For ES 7.10:
{
"_index" : "origin",
"_type" : "_doc",
"_id" : "ybBI5XUBJDHqP9p-BVs2",
"_version" : 1,
"found" : true,
"took" : 1,
"term_vectors" : {
"url.as_you_type._index_prefix" : {
"field_statistics" : {
"sum_doc_freq" : 28,
"doc_count" : 1,
"sum_ttf" : 28
},
"terms" : {
"b" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 17
}
]
},
"ba" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 17
}
]
},
"bar" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 17
}
]
},
"barb" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 17
}
]
},
"barba" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 17
}
]
},
"barbaz" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 17
}
]
},
"barbaz." : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 17
}
]
},
"barbaz.q" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 17
}
]
},
"barbaz.qu" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 17
}
]
},
"barbaz.qux" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 17
}
]
},
"barbaz.qux " : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 17
}
]
},
"barbaz.qux " : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 17
}
]
},
"h" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"ht" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"htt" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"http" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"http " : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"http b" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"http ba" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"http bar" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"http barb" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"http barba" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"http barbaz" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"http barbaz." : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"http barbaz.q" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"http barbaz.qu" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"http barbaz.qux" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
},
"http barbaz.qux " : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
}
}
},
"url.as_you_type._2gram" : {
"field_statistics" : {
"sum_doc_freq" : 1,
"doc_count" : 1,
"sum_ttf" : 1
},
"terms" : {
"http barbaz.qux" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 17
}
]
}
}
},
"url.as_you_type" : {
"field_statistics" : {
"sum_doc_freq" : 2,
"doc_count" : 1,
"sum_ttf" : 2
},
"terms" : {
"barbaz.qux" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 17
}
]
},
"http" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 4
}
]
}
}
},
"url" : {
"field_statistics" : {
"sum_doc_freq" : 3,
"doc_count" : 1,
"sum_ttf" : 3
},
"terms" : {
"barbaz" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 7,
"end_offset" : 13
}
]
},
"http" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 4
}
]
},
"qux" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 2,
"start_offset" : 14,
"end_offset" : 17
}
]
}
}
}
}
}
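You will notice that prefix terms such as "qu" are missing: in the "url.as_you_type" field, "barbaz.qux" is indexed as a single token instead of being split into "barbaz" and "qux" as it is in the plain "url" field.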