PUT candidate-index
{
  "settings": {
    "index": {
      "number_of_shards": 1,
      "number_of_replicas": 0,
      "max_ngram_diff": 20
    },
    "analysis": {
      "char_filter": {
        "rm_non_digits_char_filter": {
          "type": "pattern_replace",
          "pattern": "([^0-9]+)",
          "replacement": ""
        }
      },
      "normalizer": {
        "lowercase_normalize": {
          "type": "custom",
          "filter": ["lowercase"]
        }
      },
      "tokenizer": {
        "grams_tokenizer": {
          "type": "ngram",
          "min_gram": 2,
          "max_gram": 20,
          "token_chars": ["letter", "digit", "punctuation", "symbol"]
        },
        "phone_grams_tokenizer": {
          "type": "ngram",
          "min_gram": 2,
          "max_gram": 3,
          "token_chars": ["digit"]
        },
        "search_keywords_tokenizer": {
          "type": "char_group",
          "tokenize_on_chars": ["whitespace", ","]
        }
      },
      "filter": {
        "non_space_tokens_filter": {
          "type": "pattern_capture",
          "preserve_original": true,
          "patterns": ["([\\d\\w]+)", "([^\\w]+)", "([^\\W_]+)"]
        },
        "email_filter": {
          "type": "pattern_capture",
          "preserve_original": true,
          "patterns": ["([^@]+)", "(\\p{L}+)", "(\\d+)", "@(.+)", "([^-@]+)"]
        },
        "rm_empty_string_tokens": {
          "type": "length",
          "min": 3,
          "max": 30
        }
      },
      "analyzer": {
        "default": {
          "type": "custom",
          "tokenizer": "whitespace",
          "filter": ["non_space_tokens_filter", "lowercase", "unique"]
        },
        "default_search": {
          "type": "custom",
          "tokenizer": "search_keywords_tokenizer",
          "filter": ["lowercase"]
        },
        "email_analyzer": {
          "type": "custom",
          "tokenizer": "uax_url_email",
          "filter": ["email_filter", "lowercase", "unique"]
        },
        "search_analyzer": {
          "type": "custom",
          "tokenizer": "search_keywords_tokenizer",
          "filter": ["lowercase"]
        },
        "grams_analyzer": {
          "type": "custom",
          "tokenizer": "grams_tokenizer",
          "filter": ["lowercase"]
        },
        "phone_grams_analyzer": {
          "type": "custom",
          "char_filter": ["rm_non_digits_char_filter"],
          "tokenizer": "phone_grams_tokenizer"
        },
        "phone_keyword_analyzer": {
          "type": "custom",
          "char_filter": ["rm_non_digits_char_filter"],
          "tokenizer": "keyword",
          "filter": ["rm_empty_string_tokens"]
        }
      }
    }
  },
  "mappings": {
    "dynamic": "false",
    "properties": {
      "currentlocation": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          },
          "lowercase_keyword": {
            "type": "keyword",
            "normalizer": "lowercase_normalize",
            "ignore_above": 256
          }
        }
      }
    }
  }
}
There is an existing index with the settings shown above, and it contains a field named "currentlocation" with the mapping shown. I now want to use a pattern analyzer to trim the currentlocation field value before a document is stored. I need help with how to use the pattern analyzer for this without affecting any of the field's existing mappings.
Example:
If I create a document as shown below:
POST candidate-index/_doc/1
{
"currentlocation": " Hosur, Bangalore, India "
}
It should be stored in the index as mentioned below
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "candidate-index",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.0,
"_source" : {
"currentlocation" : "Hosur, Bangalore, India"
}
}
]
}
}
I also want to replace multiple consecutive spaces in the string with a single space.
Case 2:
Say a document is indexed as shown below:
POST candidate-index/_doc/2
{
"currentlocation": "Qatar , As ia "
}
It should be stored as mentioned below
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "candidate-index",
"_type" : "_doc",
"_id" : "2",
"_score" : 1.0,
"_source" : {
"currentlocation" : "Qatar , Asia"
}
}
]
}
}