Hi ES community,
I have a problem having asciifolder working with ingest-attachment pipeline.
Here I provide the whole code (to avoid hidden mistake ).
You can see that I use lowercase and my own stopwords filters. Those filters are working fine, but asciifolder not. The last command search a word without diacritic that should match.
#delete old pipeline and index
curl -X DELETE "localhost:9200/_ingest/pipeline/doctestpl"
curl -X DELETE "localhost:9200/doctest"
# create my customized index
curl -X PUT "localhost:9200/doctest" -H 'Content-Type: application/json' -d'
{
"settings": {
"index": {
"number_of_shards": 20,
"number_of_replicas": 1
},
"analysis": {
"filter": {
"general_stop_words": {
"type": "stop",
"stopwords": ["a", "an", "and", "as", "at", "be", "but", "by", "for", "had", "has", "have", "he", "her", "him", "his", "how", "i", "if",
"in", "is", "it", "me", "my", "no", "of", "on", "or", "so", "some", "such", "than", "that", "the", "then", "these", "this",
"those", "to", "we", "who", "''s",
"alors", "au", "aussi", "avec", "car", "ce", "c''", "cela", "de", "dont", "ces", "ci", "comme", "dans", "des", "du", "donc",
"elle", "elles", "en", "est", "et", "eu", "il", "ils", "je", "la", "le", "les", "leur", "ma", "mais", "mes", "mon", "meme", "ni", "nous", "ou", "on", "or",
"par", "pas", "pour", "puis", "que", "qui", "sa", "ses", "si", "son", "sur", "ta", "tes", "ton", "tous", "tout", "tres", "tu",
"votre", "vous", "vu", "ca", "ete", "etre", "y"]
}
},
"tokenizer": {
"test_tokenizer": {
"type": "pattern",
"pattern": "(?i)([a-z0-9._%+-]+@[a-z0-9.-]+\\.[a-z]{2,6}|c\\+\\+|c#|j\\+\\+|f#|x\\+\\+|c--|j#|d\\+\\+|go!|c/al|[a-z0-9]+)",
"group": 1
}
},
"analyzer": {
"test_analyzer": {
"filter": ["asciifolding", "lowercase", "general_stop_words"],
"tokenizer": "test_tokenizer"
}
}
}
},
"mappings": {
"document": {
"properties": {
"attachment": {
"properties": {
"content": {
"type": "text",
"analyzer": "test_analyzer",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"filename": {
"type": "text"
},
"docid": {
"type": "text"
},
"insertdate": {
"type": "date"
},
"islastversion": {
"type": "boolean"
},
"docversion": {
"type": "long"
},
"downloadcount": {
"type": "long"
}
}
}
}
}'
# create pipeline for test docs
curl -X PUT "localhost:9200/_ingest/pipeline/doctestpl" -H 'Content-Type: application/json' -d'
{
"description": "Extract attachment information dedicated to test",
"processors": [{
"attachment": {
"field": "payload",
"indexed_chars": "-1"
}
}
]
}'
# inject the sentence.
curl -X PUT "localhost:9200/doctest/document/doc2?pipeline=doctestpl" -H 'Content-Type: application/json' -d'
{
"payload": "TGUgcMOpbGVyaW4gcMOqY2hlIHNvbiBwb2lzc29uIGF1eCBBw6dvcmVzLg==",
"filename": "kiki.txt",
"docid": "010234",
"docversion": "1",
"insertdate": "2019-01-25T17:26:00Z",
"downloadcount": "0",
"islastversion": "true"
}'
# search word peche: Nothing returned
curl -X GET "localhost:9200/doctest/_search" -H 'Content-Type: application/json' -d'
{
"query": {
"constant_score": {
"filter":{
"bool": {
"filter":{
"term": {"islastversion": true}
}
,
"must": [
{
"match":{
"attachment.content":"peche"
}
}
]
}
}
}
}
}'