Hello,
I am new to Elasticsearch and I am probably missing something.
It seems that the enrich processor is not using my custom normalizer.
// Component template defining my custom "norm_letters" normalizer:
// each run of non-letter characters is replaced by a single space, then the
// value is lowercased, ASCII-folded and trimmed.
PUT /_component_template/comptpl_norm_letters
{
  "template": {
    "settings": {
      "analysis": {
        "char_filter": {
          "chfilter_only_letters": {
            "type": "pattern_replace",
            "pattern": """(\P{L}+)""",
            "replacement": " "
          }
        },
        "normalizer": {
          "norm_letters": {
            "type": "custom",
            "char_filter": ["chfilter_only_letters"],
            "filter": ["lowercase", "asciifolding", "trim"]
          }
        }
      }
    }
  }
}
// Index template for the first-name reference index ("my-firstname").
// The "firstname" keyword field uses the "norm_letters" normalizer provided by
// the component template listed in "composed_of".
PUT /_index_template/tplidx_my-firstname_ref
{
  "index_patterns": ["my-firstname"],
  "priority": 2,
  "template": {
    "mappings": {
      "dynamic": "strict",
      "properties": {
        "firstname": {
          "type": "keyword",
          "normalizer": "norm_letters",
          "eager_global_ordinals": true
        }
      }
    }
  },
  "composed_of": ["comptpl_norm_letters"]
}
// Ingest a few reference first names
POST my-firstname/_doc
{
  "firstname": "Marie-Françoise"
}
POST my-firstname/_doc
{
  "firstname": "Marie"
}
POST my-firstname/_doc
{
  "firstname": "Françoise"
}
// Index template for the index to be enriched.
// Documents go through "pipeline-my-firstname" by default, and "firstname" is
// mapped with the same "norm_letters" normalizer as the reference index.
PUT /_index_template/tplidx_my-error
{
  "index_patterns": ["error_between_chair_and_keyboard"],
  "priority": 2,
  "template": {
    "settings": {
      "default_pipeline": "pipeline-my-firstname"
    },
    "mappings": {
      "properties": {
        "firstname": {
          "type": "keyword",
          "normalizer": "norm_letters",
          "eager_global_ordinals": true
        }
      }
    }
  },
  "composed_of": ["comptpl_norm_letters"]
}
// The related enrich policy: match on "firstname" in "my-firstname" and
// copy "firstname" into the enriched document.
PUT _enrich/policy/policy-my-firstname
{
  "match": {
    "indices": "my-firstname",
    "match_field": "firstname",
    "enrich_fields": ["firstname"]
  }
}
// Run the policy
POST /_enrich/policy/policy-my-firstname/_execute
// Ingest pipeline that looks up each incoming "firstname" through the enrich
// policy and stores the matching reference data under "enriched".
// NOTE(review): in the search results, only values identical to a reference
// value receive an "enriched" field, so the lookup appears to compare the raw
// incoming string rather than its "norm_letters" form. Ingest processors run
// before the mapping's normalizer is applied; confirm against the enrich
// processor documentation.
PUT _ingest/pipeline/pipeline-my-firstname
{
  "processors": [
    {
      "enrich": {
        "description": "try to find firstname",
        "policy_name": "policy-my-firstname",
        "field": "firstname",
        "target_field": "enriched"
      }
    }
  ]
}
// Bulk load test documents into my index: one exact copy of each reference
// name plus variants that differ only in separator, accents or letter case
POST error_between_chair_and_keyboard/_bulk
{ "index" : { } }
{ "firstname": "Marie-Françoise"}
{ "index" : { } }
{ "firstname": "Marie#Françoise"}
{ "index" : { } }
{ "firstname": "Marie’Francoise"}
{ "index" : { } }
{ "firstname": "Françoise"}
{ "index" : { } }
{ "firstname": "Marie"}
{ "index" : { } }
{ "firstname": "marie-Françoise"}
{ "index" : { } }
{ "firstname": "marie-Françoise"}
// Let's have a look at the indexed docs
GET error_between_chair_and_keyboard/_search
It returns (as we can see, the enriched field is not always generated):
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 7,
"relation": "eq"
},
"max_score": 1,
"hits": [
{
"_index": "error_between_chair_and_keyboard",
"_id": "FjG5f4YBgPb-ljrw_r6A",
"_score": 1,
"_source": {
"firstname": "Marie-Françoise",
"enriched": {
"firstname": "Marie-Françoise"
}
}
},
{
"_index": "error_between_chair_and_keyboard",
"_id": "FzG5f4YBgPb-ljrw_r6A",
"_score": 1,
"_source": {
"firstname": "Marie#Françoise"
}
},
{
"_index": "error_between_chair_and_keyboard",
"_id": "GDG5f4YBgPb-ljrw_r6A",
"_score": 1,
"_source": {
"firstname": "Marie’Francoise"
}
},
{
"_index": "error_between_chair_and_keyboard",
"_id": "GTG5f4YBgPb-ljrw_r6A",
"_score": 1,
"_source": {
"firstname": "Françoise",
"enriched": {
"firstname": "Françoise"
}
}
},
{
"_index": "error_between_chair_and_keyboard",
"_id": "GjG5f4YBgPb-ljrw_r6A",
"_score": 1,
"_source": {
"firstname": "Marie",
"enriched": {
"firstname": "Marie"
}
}
},
{
"_index": "error_between_chair_and_keyboard",
"_id": "GzG5f4YBgPb-ljrw_r6A",
"_score": 1,
"_source": {
"firstname": "marie-Françoise"
}
},
{
"_index": "error_between_chair_and_keyboard",
"_id": "HDG5f4YBgPb-ljrw_r6A",
"_score": 1,
"_source": {
"firstname": "marie-Françoise"
}
}
]
}
}
Yet, if I run:
// Term query on the reference index, using the lowercased, unaccented,
// space-separated form of "Marie-Françoise"
GET my-firstname/_search
{
  "size": 100,
  "query": {
    "bool": {
      "must": [
        { "term": { "firstname": "marie francoise" } }
      ]
    }
  }
}
It returns:
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 0.9808291,
"hits": [
{
"_index": "my-firstname",
"_id": "_zGzf4YBgPb-ljrw-702",
"_score": 0.9808291,
"_source": {
"firstname": "Marie-Françoise"
}
}
]
}
}
And if I run:
// Same term query, this time against the index that was meant to be enriched
GET error_between_chair_and_keyboard/_search
{
  "size": 100,
  "query": {
    "bool": {
      "must": [
        { "term": { "firstname": "marie francoise" } }
      ]
    }
  }
}
It returns:
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 5,
"relation": "eq"
},
"max_score": 0.37469345,
"hits": [
{
"_index": "error_between_chair_and_keyboard",
"_id": "FjG5f4YBgPb-ljrw_r6A",
"_score": 0.37469345,
"_source": {
"firstname": "Marie-Françoise",
"enriched": {
"firstname": "Marie-Françoise"
}
}
},
{
"_index": "error_between_chair_and_keyboard",
"_id": "FzG5f4YBgPb-ljrw_r6A",
"_score": 0.37469345,
"_source": {
"firstname": "Marie#Françoise"
}
},
{
"_index": "error_between_chair_and_keyboard",
"_id": "GDG5f4YBgPb-ljrw_r6A",
"_score": 0.37469345,
"_source": {
"firstname": "Marie’Francoise"
}
},
{
"_index": "error_between_chair_and_keyboard",
"_id": "GzG5f4YBgPb-ljrw_r6A",
"_score": 0.37469345,
"_source": {
"firstname": "marie-Françoise"
}
},
{
"_index": "error_between_chair_and_keyboard",
"_id": "HDG5f4YBgPb-ljrw_r6A",
"_score": 0.37469345,
"_source": {
"firstname": "marie-Françoise"
}
}
]
}
}
What am I missing?
Thanks in advance,
Best regards