I have a use case where I need to reindex my documents to add machine learning capabilities to my data. I have been using the E5 text embedding model, with the body_content field chunked by an ingest pipeline and an inference pipeline that converts the text fields into text-embedding fields. While reindexing the documents with the Reindex API, ingestion stops at a certain point and there is no failure message I can troubleshoot from. I have enabled an ingestion-failure message in the pipeline, but the documents are still being rejected, and I was also not able to set/reroute them to another index to see the failure message. How can I troubleshoot this issue?
Reindex API:
POST _reindex?wait_for_completion=false
{
  "conflicts": "proceed",
  "source": {
    "index": ".ent-search-engine-documents-test",
    "size": 100
  },
  "dest": {
    "index": "test",
    "pipeline": "ml-inference-test"
  }
}
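Since the call runs with wait_for_completion=false, it returns a task ID that can be polled for progress, version conflicts, and failures once the task completes (the task ID below is just a placeholder):

GET _tasks/oTUvMXIBUzmq5CnLEHYV:12345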
Chunker pipeline:
[
{
"script": {
"description": "Chunk pdfcontent into sentences by looking for . followed by a space",
"lang": "painless",
"source": "\n String[] envSplit = /((?<!M(r|s|rs)\\.)(?<=\\.) |(?<=\\!) |(?<=\\?) )/.split(ctx['pdfcontent']);\n ctx['passages'] = new ArrayList();\n int i = 0;\n boolean remaining = true;\n if (envSplit.length == 0) {\n return\n } else if (envSplit.length == 1) {\n Map passage = ['text': envSplit[0]];ctx['passages'].add(passage)\n } else {\n while (remaining) {\n Map passage = ['text': envSplit[i++]];\n while (i < envSplit.length && passage.text.length() + envSplit[i].length() < params.model_limit) {passage.text = passage.text + ' ' + envSplit[i++]}\n if (i == envSplit.length) {remaining = false}\n ctx['passages'].add(passage)\n }\n }\n ",
"params": {
"model_limit": 400
}
}
},
{
"foreach": {
"field": "passages",
"processor": {
"inference": {
"field_map": {
"_ingest._value.text": "text_field"
},
"model_id": ".multilingual-e5-small_linux-x86_64",
"target_field": "_ingest._value.pdfcontent_chunked",
"on_failure": [
{
"append": {
"field": "_source._ingest.inference_errors",
"value": [
{
"message": "Processor 'inference' in pipeline 'ml-inference-test' failed with message '{{ _ingest.on_failure_message }}'",
"pipeline": "ml-inference-test",
"timestamp": "{{{ _ingest.timestamp }}}"
}
]
}
}
]
}
}
}
}
]
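To check the chunking step in isolation, the script can be exercised with the simulate API (a sketch, assuming the pipeline above is saved under the name chunker, as referenced from ml-inference-test; the sample document is made up):

POST _ingest/pipeline/chunker/_simulate
{
  "docs": [
    {
      "_source": {
        "pdfcontent": "First sentence. Second sentence. Third sentence."
      }
    }
  ]
}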
ml-inference-test pipeline:
[
{
"pipeline": {
"name": "chunker"
}
},
{
"remove": {
"field": "ml.inference.title",
"ignore_missing": true
}
},
{
"inference": {
"field_map": {
"title": "text_field"
},
"model_id": ".multilingual-e5-small_linux-x86_64",
"target_field": "ml.inference.title",
"on_failure": [
{
"append": {
"field": "_source._ingest.inference_errors",
"allow_duplicates": false,
"value": [
{
"message": "Processor 'inference' in pipeline 'ml.inference.test' failed for field 'title' with message '{{ _ingest.on_failure_message }}'",
"pipeline": "ml.inference.test",
"timestamp": "{{{ _ingest.timestamp }}}"
}
]
}
}
]
}
},
{
"remove": {
"field": "ml.inference.description",
"ignore_missing": true
}
},
{
"inference": {
"field_map": {
"description": "text_field"
},
"model_id": ".multilingual-e5-small_linux-x86_64",
"target_field": "ml.inference.description",
"on_failure": [
{
"append": {
"field": "_source._ingest.inference_errors",
"allow_duplicates": false,
"value": [
{
"message": "Processor 'inference' in pipeline 'ml.inference.test' failed for field 'description' with message '{{ _ingest.on_failure_message }}'",
"pipeline": "ml.inference.test",
"timestamp": "{{{ _ingest.timestamp }}}"
}
]
}
}
]
}
},
{
"remove": {
"field": "ml.inference.keywords",
"ignore_missing": true
}
},
{
"inference": {
"field_map": {
"keywords": "text_field"
},
"model_id": ".multilingual-e5-small_linux-x86_64",
"target_field": "ml.inference.keywords",
"on_failure": [
{
"append": {
"field": "_source._ingest.inference_errors",
"allow_duplicates": false,
"value": [
{
"message": "Processor 'inference' in pipeline 'ml.inference.test' failed for field 'keywords' with message '{{ _ingest.on_failure_message }}'",
"pipeline": "ml.inference.test",
"timestamp": "{{{ _ingest.timestamp }}}"
}
]
}
}
]
}
},
{
"remove": {
"field": "ml.inference.name",
"ignore_missing": true
}
},
{
"inference": {
"field_map": {
"name": "text_field"
},
"model_id": ".multilingual-e5-small_linux-x86_64",
"target_field": "ml.inference.name",
"on_failure": [
{
"append": {
"field": "_source._ingest.inference_errors",
"allow_duplicates": false,
"value": [
{
"message": "Processor 'inference' in pipeline 'ml.inference.test' failed for field 'name' with message '{{ _ingest.on_failure_message }}'",
"pipeline": "ml.inference.test",
"timestamp": "{{{ _ingest.timestamp }}}"
}
]
}
}
]
}
},
{
"remove": {
"field": "ml.inference.pdfcontent",
"ignore_missing": true
}
},
{
"inference": {
"field_map": {
"pdfcontent": "text_field"
},
"model_id": ".multilingual-e5-small_linux-x86_64",
"target_field": "ml.inference.pdfcontent",
"on_failure": [
{
"append": {
"field": "_source._ingest.inference_errors",
"allow_duplicates": false,
"value": [
{
"message": "Processor 'inference' in pipeline 'ml.inference.test' failed for field 'pdfcontent' with message '{{ _ingest.on_failure_message }}'",
"pipeline": "ml.inference.test",
"timestamp": "{{{ _ingest.timestamp }}}"
}
]
}
}
]
}
},
{
"remove": {
"field": "ml.inference.productslist",
"ignore_missing": true
}
},
{
"inference": {
"field_map": {
"productslist": "text_field"
},
"model_id": ".multilingual-e5-small_linux-x86_64",
"target_field": "ml.inference.productslist",
"on_failure": [
{
"append": {
"field": "_source._ingest.inference_errors",
"allow_duplicates": false,
"value": [
{
"message": "Processor 'inference' in pipeline 'ml.inference.test' failed for field 'productslist' with message '{{ _ingest.on_failure_message }}'",
"pipeline": "ml.inference.test",
"timestamp": "{{{ _ingest.timestamp }}}"
}
]
}
}
]
}
},
{
"remove": {
"field": "ml.inference.audience",
"ignore_missing": true
}
},
{
"inference": {
"field_map": {
"audience": "text_field"
},
"model_id": ".multilingual-e5-small_linux-x86_64",
"target_field": "ml.inference.audience",
"on_failure": [
{
"append": {
"field": "_source._ingest.inference_errors",
"allow_duplicates": false,
"value": [
{
"message": "Processor 'inference' in pipeline 'ml.inference.test' failed for field 'audience' with message '{{ _ingest.on_failure_message }}'",
"pipeline": "ml.inference.test",
"timestamp": "{{{ _ingest.timestamp }}}"
}
]
}
}
]
}
},
{
"remove": {
"field": "ml.inference.filename",
"ignore_missing": true
}
},
{
"inference": {
"field_map": {
"filename": "text_field"
},
"model_id": ".multilingual-e5-small_linux-x86_64",
"target_field": "ml.inference.filename",
"on_failure": [
{
"append": {
"field": "_source._ingest.inference_errors",
"allow_duplicates": false,
"value": [
{
"message": "Processor 'inference' in pipeline 'ml.inference.test' failed for field 'filename' with message '{{ _ingest.on_failure_message }}'",
"pipeline": "ml.inference.test",
"timestamp": "{{{ _ingest.timestamp }}}"
}
]
}
}
]
}
},
{
"append": {
"field": "_source._ingest.processors",
"value": [
{
"model_version": "12.0.0",
"pipeline": "ml.inference.test",
"processed_timestamp": "{{{ _ingest.timestamp }}}",
"types": [
"pytorch",
"text_embedding"
]
}
]
}
}
]
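The full chain can be simulated the same way with verbose output, which reports the result or error of each processor individually (sketch; the field values are made up):

POST _ingest/pipeline/ml-inference-test/_simulate?verbose=true
{
  "docs": [
    {
      "_source": {
        "title": "Sample title",
        "description": "Sample description",
        "pdfcontent": "First sentence. Second sentence."
      }
    }
  ]
}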
Failure processors:
[
{
"set": {
"field": "ml.inference_failure",
"value": "{{_ingest.on_failure_message}}"
}
},
{
"reroute": {
"destination": "techpubs-failed-docs"
}
},
{
"set": {
"field": "_index",
"value": "failed-techpubs"
}
}
]
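For reference, failure processors like these can be attached either per processor (as with the inference processors above) or as the pipeline-level on_failure handler, roughly like this (abbreviated sketch):

PUT _ingest/pipeline/ml-inference-test
{
  "processors": [
    ...
  ],
  "on_failure": [
    {
      "set": {
        "field": "ml.inference_failure",
        "value": "{{_ingest.on_failure_message}}"
      }
    },
    {
      "set": {
        "field": "_index",
        "value": "failed-techpubs"
      }
    }
  ]
}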