Hi Everyone,
I am using the ingest-attachment plugin to index documents. I am able to parse plain-text documents (.txt files), but when I try to parse .doc or .pdf files I get the following error.
FILE = /elastic/files/englishAnalyzer.doc
ID = 6
"error" : {
"root_cause" : [
{
"type" : "exception",
"reason" : "java.lang.IllegalArgumentException: ElasticsearchParseException[Error parsing document in field [data]]; nested: TikaException[Unexpected RuntimeException from org.apache.tika.parser.microsoft.OfficeParser@28992079]; nested: ArrayIndexOutOfBoundsException[-1];",
"header" : {
"processor_type" : "attachment"
}
}
],
"type" : "exception",
"reason" : "java.lang.IllegalArgumentException: ElasticsearchParseException[Error parsing document in field [data]]; nested: TikaException[Unexpected RuntimeException from org.apache.tika.parser.microsoft.OfficeParser@28992079]; nested: ArrayIndexOutOfBoundsException[-1];",
"caused_by" : {
"type" : "illegal_argument_exception",
"reason" : "ElasticsearchParseException[Error parsing document in field [data]]; nested: TikaException[Unexpected RuntimeException from org.apache.tika.parser.microsoft.OfficeParser@28992079]; nested: ArrayIndexOutOfBoundsException[-1];",
"caused_by" : {
"type" : "parse_exception",
"reason" : "Error parsing document in field [data]",
"caused_by" : {
"type" : "tika_exception",
"reason" : "Unexpected RuntimeException from org.apache.tika.parser.microsoft.OfficeParser@28992079",
"caused_by" : {
"type" : "array_index_out_of_bounds_exception",
"reason" : "-1"
}
}
}
},
"header" : {
"processor_type" : "attachment"
}
},
"status" : 500
}
Please help me resolve this issue.
Please find below my index template and ingest pipeline configuration.
Template
# Register index template "template_1" for indices whose names match "policies*".
# It sets one shard, no replicas and a 1s refresh interval, and maps the "policy"
# type with _all disabled, explicit "attachment" sub-fields, and a dynamic
# template that maps every dynamically detected field to text with a keyword sub-field.
curl -XPUT 'localhost:9200/_template/template_1?pretty' -H 'Content-Type: application/json' -d'
{
  "order": 0,
  "template": "policies*",
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0,
    "refresh_interval": "1s"
  },
  "mappings": {
    "policy": {
      "_all": {
        "enabled": false
      },
      "properties": {
        "@timestamp": {
          "include_in_all": false,
          "type": "date",
          "format": "strict_date_optional_time||epoch_millis"
        },
        "filename": {
          "type": "keyword",
          "ignore_above": 256
        },
        "isEnabled": {
          "type": "boolean"
        },
        "data": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "attachment": {
          "properties": {
            "content_length": {
              "type": "long"
            },
            "author": {
              "type": "text",
              "fields": {
                "keyword": {
                  "type": "keyword"
                }
              }
            },
            "date": {
              "type": "text",
              "fields": {
                "keyword": {
                  "type": "keyword",
                  "ignore_above": 256
                }
              }
            },
            "language": {
              "type": "text",
              "fields": {
                "keyword": {
                  "type": "keyword",
                  "ignore_above": 256
                }
              }
            },
            "name": {
              "type": "text",
              "fields": {
                "keyword": {
                  "type": "keyword",
                  "ignore_above": 256
                }
              }
            },
            "title": {
              "type": "text",
              "fields": {
                "keyword": {
                  "type": "keyword",
                  "ignore_above": 256
                }
              }
            },
            "keywords": {
              "type": "text",
              "fields": {
                "keyword": {
                  "type": "keyword",
                  "ignore_above": 256
                }
              }
            },
            "content_type": {
              "type": "text",
              "fields": {
                "keyword": {
                  "type": "keyword",
                  "ignore_above": 256
                }
              }
            },
            "content": {
              "type": "text",
              "fields": {
                "keyword": {
                  "type": "keyword",
                  "ignore_above": 256
                }
              },
              "analyzer": "english",
              "term_vector": "with_positions_offsets"
            }
          }
        }
      },
      "dynamic_templates": [
        {
          "strings": {
            "match_mapping_type": "*",
            "mapping": {
              "type": "text",
              "fields": {
                "keyword": {
                  "type": "keyword"
                }
              }
            }
          }
        }
      ]
    }
  }
}
'
Pipeline
# Define the ingest pipeline "attachment": it runs the attachment processor on the
# "data" field and writes the extracted result under "attachment".
# "indexed_chars": -1 removes the limit on how many characters are extracted.
# The Content-Type header is declared explicitly because the request carries a
# JSON body; Elasticsearch 6.0+ rejects body-carrying requests without it.
curl -XPUT 'localhost:9200/_ingest/pipeline/attachment' -H 'Content-Type: application/json' -d'
{
  "description": "Extract attachment information",
  "processors": [
    {
      "attachment": {
        "field": "data",
        "target_field": "attachment",
        "indexed_chars": -1,
        "ignore_missing": true
      }
    }
  ]
}'