We have an application that stores and queries data in Elasticsearch. We have a problem when searching for partial substrings that contain a word with special chars like "notepad++.exe" or "/folder1/file1.txt" or "new text doc". For example, searching for "1 document" the search returns strange things like files with names such as "1 - copy.rtf", "1.rtf", etc.
I think this is a very common scenario for searches of files with spaces and special chars, but it looks like there is no out-of-the-box solution for this?
We have tried these analyzers, but it looks like they always split the string into tokens when there is a special char:
ana_s_lc
ana_w_lc
ana_s_lc_st
Sample query to search all documents that match or contain "1 document":
GET /fileevents-*/_search
{
"from":0,
"_source":{"includes":["fileName"]},
"query":{
"bool":{
"must":[
{
"query_string":{
"allow_leading_wildcard":true,
"default_field":"fileName",
"query":"* 1 document *"
}
}
]
}
}
}
result:
{
"took" : 18,
"timed_out" : false,
"_shards" : {
"total" : 4,
"successful" : 4,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 10000,
"relation" : "gte"
},
"max_score" : 8.271101,
"hits" : [
{
"_index" : "fileevents-2021.11",
"_type" : "_doc",
"_id" : "80229def-0c8d-4471-a352-a711f6e212e2",
"_score" : 8.271101,
"_source" : {
"fileName" : "1.rtf"
}
},
{
"_index" : "fileevents-2021.11",
"_type" : "_doc",
"_id" : "cd8edb43-47de-4213-a2f1-7a2f97b84221",
"_score" : 8.271101,
"_source" : {
"fileName" : "1 - Copy.rtf"
}
},
{
"_index" : "fileevents-2021.11",
"_type" : "_doc",
"_id" : "a366be45-440c-44ea-8c19-dc9516877ce8",
"_score" : 8.271101,
"_source" : {
"fileName" : "1.rtf"
}
},
{
"_index" : "fileevents-2021.11",
"_type" : "_doc",
"_id" : "e4a8dd40-7ce5-4981-a99c-0050d1e36b68",
"_score" : 8.271101,
"_source" : {
"fileName" : "1 - Copy.rtf"
}
},
{
"_index" : "fileevents-2021.11",
"_type" : "_doc",
"_id" : "6ab91de1-60ed-4bd5-88f1-b907c67d6edc",
"_score" : 8.271101,
"_source" : {
"fileName" : "1.txt"
}
},
{
"_index" : "fileevents-2021.11",
"_type" : "_doc",
"_id" : "df29ca8a-d26c-49b6-a7e3-b07c86a16bc5",
"_score" : 8.271101,
"_source" : {
"fileName" : "1.txt"
}
},
{
"_index" : "fileevents-2021.11",
"_type" : "_doc",
"_id" : "ae724e3e-f773-4b5d-a6e0-f4249a7f0086",
"_score" : 7.0889215,
"_source" : {
"fileName" : "Document sin etiquetar manual.docx"
}
},
{
"_index" : "fileevents-2021.11",
"_type" : "_doc",
"_id" : "e856f160-1d85-4598-be0d-3e8a7dff6574",
"_score" : 7.0889215,
"_source" : {
"fileName" : "Document sin etiquetar manual.docx"
}
},
{
"_index" : "fileevents-2021.11",
"_type" : "_doc",
"_id" : "d330fad1-490a-4d60-a6c1-39d92aeede59",
"_score" : 7.0129147,
"_source" : {
"fileName" : "msword_file@1.docx"
}
},
{
"_index" : "fileevents-2021.11",
"_type" : "_doc",
"_id" : "62ba1863-80a8-43a9-b5e9-147f92288225",
"_score" : 7.0129147,
"_source" : {
"fileName" : "msword_file@1.docx"
}
}
]
}
}
index settings
{
"fileevents-2021.12" : {
"settings" : {
"index" : {
"routing" : {
"allocation" : {
"include" : {
"_tier_preference" : "data_content"
}
}
},
"number_of_shards" : "1",
"provided_name" : "fileevents-2021.12",
"creation_date" : "1638452525018",
"analysis" : {
"filter" : {
"lowercase" : {
"type" : "lowercase"
}
},
"normalizer" : {
"nor_lc" : {
"filter" : [
"lowercase"
],
"type" : "custom"
}
},
"analyzer" : {
"ana_path_lc" : {
"filter" : [
"lowercase"
],
"type" : "custom",
"tokenizer" : "tok_path"
},
"ana_w_lc" : {
"filter" : [
"lowercase"
],
"type" : "custom",
"tokenizer" : "whitespace"
},
"ana_s_lc" : {
"filter" : [
"lowercase"
],
"type" : "custom",
"tokenizer" : "standard"
}
},
"tokenizer" : {
"tok_path" : {
"type" : "path_hierarchy",
"delimiter" : """\"""
}
}
},
"number_of_replicas" : "1",
"uuid" : "Jcq710KrR0yxxhzCf9qpzA",
"version" : {
"created" : "7130299"
}
}
}
}
}
So any search we do for a partial string that contains special chars or whitespace doesn't work.