hello,
to optimize my disk space, i'm excluding my attachment content in the mapping of my index. but i can't no longer search inside it . i don't know what am i messing !! . is there any solution for this issue ??
Welcome!
Could you share exactly what you did?
i'm using ingest pipeline to decode my base64 pdf file:
PUT _ingest/pipeline/attachment
{
"description" : "Extract attachment information",
"processors" : [
{
"attachment": {
"field": "smart_content"
}
},
{
"remove": {
"field": "smart_content"
}
}
]
}
and i'm using this mapping:
{
"_source" : {
"excludes" : [
"smart_content",
"attachment.content"
]
},
"properties" : {
"attachment" : {
"properties" : {
"content" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"content_length" : {
"type" : "long"
},
"content_type" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"date" : {
"type" : "date"
},
"language" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"title" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
},
"createdBy" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"createdDate" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"dateConservation" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"document" : {
"properties" : {
"comment" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"content" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"dateConservation" : {
"type" : "date"
},
"dateNum" : {
"type" : "date"
},
"dua" : {
"type" : "long"
},
"idBundle" : {
"type" : "long"
},
"idLot" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"metadata" : {
"properties" : {
"key" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"value" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
},
"reference" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"scanProofAvailable" : {
"type" : "boolean"
},
"status" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"tags" : {
"properties" : {
"color" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"key" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
},
"unarchivable" : {
"type" : "boolean"
}
}
},
"documentType" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"filename" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"forwardedBy" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"id" : {
"type" : "date",
"store" : true
},
"idCourier" : {
"type" : "long"
},
"lastModifiedBy" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"lastModifiedDate" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"mailbox" : {
"properties" : {
"accesses" : {
"properties" : {
"type" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"user" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
},
"id" : {
"type" : "long"
},
"label" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"type" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
},
"metadata" : {
"properties" : {
"key" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"value" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
},
"smart_content" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"status" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
}
What is the search request you are running?
i'm running a query_string.
{
"query": {
"`query_string`": {
"query": "something"
}
}
}
Can you share a document that has been indexed which contains the word "something"?
For this, I'd change the mapping and remove the source exclusion on attachment.content.
I'm not sure why you wanted to remove it BTW.
Ideally, if you could reproduce the full problem with a very simple script, that'd help a lot to understand what you are doing wrong.
You could encode in BASE64 a minimal text file which contains the text you are then searching for.
to explain more my probleme:
The first time i created my index without excluding the attachement.content, but a few mounths later, i find out that my index is taking a lot of space ( hundreds of GO ), because i'm indexing big PDF files . so i desided to look for a solution to remove atachment.content feild from my index without removing "_termvectors" of this feild so a can keep the search fonctionality working.
I don't think that removing the extracted text will improve a lot. But removing the binary content is indeed the way to go IMO.
This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.