When you create a language-specific engine, App Search applies language-specific analyzers to the index that contains the engine's documents.
As an example, I created an engine called japanese-test with data from your article. Its documents are stored in an index called .ent-search-engine-documents-japanese-test. The index has the following analysis settings:
{
  "filter" : {
    "front_ngram" : {
      "type" : "edge_ngram",
      "min_gram" : "1",
      "max_gram" : "12"
    },
    "bigram_joiner" : {
      "max_shingle_size" : "2",
      "token_separator" : "",
      "output_unigrams" : "false",
      "type" : "shingle"
    },
    "bigram_max_size" : {
      "type" : "length",
      "max" : "16",
      "min" : "0"
    },
    "bigram_joiner_unigrams" : {
      "max_shingle_size" : "2",
      "token_separator" : "",
      "output_unigrams" : "true",
      "type" : "shingle"
    },
    "delimiter" : {
      "split_on_numerics" : "true",
      "generate_word_parts" : "true",
      "preserve_original" : "false",
      "catenate_words" : "true",
      "generate_number_parts" : "true",
      "catenate_all" : "true",
      "split_on_case_change" : "true",
      "type" : "word_delimiter_graph",
      "catenate_numbers" : "true",
      "stem_english_possessive" : "true"
    },
    "ja-stop-words-filter" : {
      "type" : "stop",
      "stopwords" : "_english_"
    },
    "ja-stem-filter" : {
      "name" : "light_english",
      "type" : "stemmer"
    }
  },
  "analyzer" : {
    "i_prefix" : {
      "filter" : [
        "cjk_width",
        "lowercase",
        "asciifolding",
        "front_ngram"
      ],
      "tokenizer" : "standard"
    },
    "iq_text_delimiter" : {
      "filter" : [
        "delimiter",
        "cjk_width",
        "lowercase",
        "asciifolding",
        "ja-stop-words-filter",
        "ja-stem-filter",
        "cjk_bigram"
      ],
      "tokenizer" : "whitespace"
    },
    "q_prefix" : {
      "filter" : [
        "cjk_width",
        "lowercase",
        "asciifolding"
      ],
      "tokenizer" : "standard"
    },
    "iq_text_base" : {
      "filter" : [
        "cjk_width",
        "lowercase",
        "asciifolding",
        "ja-stop-words-filter"
      ],
      "tokenizer" : "standard"
    },
    "iq_text_stem" : {
      "filter" : [
        "cjk_width",
        "lowercase",
        "asciifolding",
        "ja-stop-words-filter",
        "ja-stem-filter",
        "cjk_bigram"
      ],
      "tokenizer" : "standard"
    },
    "i_text_bigram" : {
      "filter" : [
        "cjk_width",
        "lowercase",
        "asciifolding",
        "ja-stem-filter",
        "bigram_joiner",
        "bigram_max_size"
      ],
      "tokenizer" : "standard"
    },
    "q_text_bigram" : {
      "filter" : [
        "cjk_width",
        "lowercase",
        "asciifolding",
        "ja-stem-filter",
        "bigram_joiner_unigrams",
        "bigram_max_size"
      ],
      "tokenizer" : "standard"
    }
  }
}
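To see how one of these analyzers handles Japanese text, you can run it through the _analyze API (a quick sketch, assuming you have direct access to the Elasticsearch index behind the engine; the sample text is only an illustration):

GET .ent-search-engine-documents-japanese-test/_analyze
{
  "analyzer" : "iq_text_stem",
  "text" : "日本の記事"
}

With the standard tokenizer and the cjk_bigram filter, the text is split into overlapping CJK bigrams (roughly 日本, 本の, の記, 記事) rather than dictionary-based words, so a two-character query like 日本 is matched as a single bigram token.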
Here are the mappings for my_field:
{
  "type" : "text",
  "fields" : {
    "date" : {
      "type" : "date",
      "format" : "strict_date_time||strict_date",
      "ignore_malformed" : true
    },
    "delimiter" : {
      "type" : "text",
      "index_options" : "freqs",
      "analyzer" : "iq_text_delimiter"
    },
    "enum" : {
      "type" : "keyword",
      "ignore_above" : 2048
    },
    "float" : {
      "type" : "double",
      "ignore_malformed" : true
    },
    "joined" : {
      "type" : "text",
      "index_options" : "freqs",
      "analyzer" : "i_text_bigram",
      "search_analyzer" : "q_text_bigram"
    },
    "location" : {
      "type" : "geo_point",
      "ignore_malformed" : true,
      "ignore_z_value" : false
    },
    "prefix" : {
      "type" : "text",
      "index_options" : "docs",
      "analyzer" : "i_prefix",
      "search_analyzer" : "q_prefix"
    },
    "stem" : {
      "type" : "text",
      "analyzer" : "iq_text_stem"
    }
  },
  "index_options" : "freqs",
  "analyzer" : "iq_text_base"
}
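If you want to verify this on your own deployment, you can pull the same information directly from the underlying index, replacing the index name with the one created for your engine (this assumes you can reach the Elasticsearch cluster behind App Search, for example through the Kibana Dev Tools console):

GET .ent-search-engine-documents-japanese-test/_settings
GET .ent-search-engine-documents-japanese-test/_mapping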
You can already see that the analysis settings in App Search are not the same as in your article. Moreover, the queries used in the article are not simple search queries: they aggregate the search results and sort them by number of occurrences. That is why, when you search for 日本, it comes up on top: there are 6 documents with that exact text. In App Search, when you use the query tester to search for a substring, it does not do aggregations.
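I don't have the exact query from the article in front of me, but an aggregation of that kind would look roughly like this against the engine's index (the aggregation name, field, and size values here are only placeholders):

GET .ent-search-engine-documents-japanese-test/_search
{
  "size" : 0,
  "query" : {
    "match" : { "my_field" : "日本" }
  },
  "aggs" : {
    "top_values" : {
      "terms" : {
        "field" : "my_field.enum",
        "size" : 10
      }
    }
  }
}

A terms aggregation sorts its buckets by document count by default, which gives the sorted-by-number-of-occurrences view from the article, whereas the query tester only runs the search part and returns ranked documents.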
I hope this helps.