Ingestion Failure with ML inference for E5 model

I have a use case where I need to reindex my documents to use Machine Learning capabilities for my data. Hence, I have been using the E5 Text Embedding model, with the body_content field chunked using an ingest pipeline, and also using an inference pipeline to convert the text fields into text-embedded fields. In this process, while I am reindexing the documents using the reindex API, my ingestion stops at a certain point and there is no failure message I can use to troubleshoot. I have enabled an ingestion-failure message in the pipeline, but the documents are being rejected, and I was also not able to set/reroute them to another index to see the failure message. How can I troubleshoot this issue?

ReIndex API:

POST _reindex?wait_for_completion=false
{
  "conflicts": "proceed",
  "source": {
    "index": ".ent-search-engine-documents-test",
    "size": 100
  },
  "dest": {`
    "index": "test",
    "pipeline": "ml-inference-test"
  }
}

Chunker pipeline:

[
  {
    "script": {
      "description": "Chunk pdfcontent into sentences by looking for . followed by a space",
      "lang": "painless",
      "source": "\n          String[] envSplit = /((?<!M(r|s|rs)\\.)(?<=\\.) |(?<=\\!) |(?<=\\?) )/.split(ctx['pdfcontent']);\n          ctx['passages'] = new ArrayList();\n          int i = 0;\n          boolean remaining = true;\n          if (envSplit.length == 0) {\n            return\n          } else if (envSplit.length == 1) {\n            Map passage = ['text': envSplit[0]];ctx['passages'].add(passage)\n          } else {\n            while (remaining) {\n              Map passage = ['text': envSplit[i++]];\n              while (i < envSplit.length && passage.text.length() + envSplit[i].length() < params.model_limit) {passage.text = passage.text + ' ' + envSplit[i++]}\n              if (i == envSplit.length) {remaining = false}\n              ctx['passages'].add(passage)\n            }\n          }\n          ",
      "params": {
        "model_limit": 400
      }
    }
  },
  {
    "foreach": {
      "field": "passages",
      "processor": {
        "inference": {
          "field_map": {
            "_ingest._value.text": "text_field"
          },
          "model_id": ".multilingual-e5-small_linux-x86_64",
          "target_field": "_ingest._value.pdfcontent_chunked",
          "on_failure": [
            {
              "append": {
                "field": "_source._ingest.inference_errors",
                "value": [
                  {
                    "message": "Processor 'inference' in pipeline 'ml-inference-test' failed with message '{{ _ingest.on_failure_message }}'",
                    "pipeline": "ml-inference-test",
                    "timestamp": "{{{ _ingest.timestamp }}}"
                  }
                ]
              }
            }
          ]
        }
      }
    }
  }
]

ml-inference-test pipeline:

[
  {
    "pipeline": {
      "name": "chunker"
    }
  },
  {
    "remove": {
      "field": "ml.inference.title",
      "ignore_missing": true
    }
  },
  {
    "inference": {
      "field_map": {
        "title": "text_field"
      },
      "model_id": ".multilingual-e5-small_linux-x86_64",
      "target_field": "ml.inference.title",
      "on_failure": [
        {
          "append": {
            "field": "_source._ingest.inference_errors",
            "allow_duplicates": false,
            "value": [
              {
                "message": "Processor 'inference' in pipeline 'ml.inference.test' failed for field 'title' with message '{{ _ingest.on_failure_message }}'",
                "pipeline": "ml.inference.test",
                "timestamp": "{{{ _ingest.timestamp }}}"
              }
            ]
          }
        }
      ]
    }
  },
  {
    "remove": {
      "field": "ml.inference.description",
      "ignore_missing": true
    }
  },
  {
    "inference": {
      "field_map": {
        "description": "text_field"
      },
      "model_id": ".multilingual-e5-small_linux-x86_64",
      "target_field": "ml.inference.description",
      "on_failure": [
        {
          "append": {
            "field": "_source._ingest.inference_errors",
            "allow_duplicates": false,
            "value": [
              {
                "message": "Processor 'inference' in pipeline 'ml.inference.test' failed for field 'description' with message '{{ _ingest.on_failure_message }}'",
                "pipeline": "ml.inference.test",
                "timestamp": "{{{ _ingest.timestamp }}}"
              }
            ]
          }
        }
      ]
    }
  },
  {
    "remove": {
      "field": "ml.inference.keywords",
      "ignore_missing": true
    }
  },
  {
    "inference": {
      "field_map": {
        "keywords": "text_field"
      },
      "model_id": ".multilingual-e5-small_linux-x86_64",
      "target_field": "ml.inference.keywords",
      "on_failure": [
        {
          "append": {
            "field": "_source._ingest.inference_errors",
            "allow_duplicates": false,
            "value": [
              {
                "message": "Processor 'inference' in pipeline 'ml.inference.test' failed for field 'keywords' with message '{{ _ingest.on_failure_message }}'",
                "pipeline": "ml.inference.test",
                "timestamp": "{{{ _ingest.timestamp }}}"
              }
            ]
          }
        }
      ]
    }
  },
  {
    "remove": {
      "field": "ml.inference.name",
      "ignore_missing": true
    }
  },
  {
    "inference": {
      "field_map": {
        "name": "text_field"
      },
      "model_id": ".multilingual-e5-small_linux-x86_64",
      "target_field": "ml.inference.name",
      "on_failure": [
        {
          "append": {
            "field": "_source._ingest.inference_errors",
            "allow_duplicates": false,
            "value": [
              {
                "message": "Processor 'inference' in pipeline 'ml.inference.test' failed for field 'name' with message '{{ _ingest.on_failure_message }}'",
                "pipeline": "ml.inference.test",
                "timestamp": "{{{ _ingest.timestamp }}}"
              }
            ]
          }
        }
      ]
    }
  },
  {
    "remove": {
      "field": "ml.inference.pdfcontent",
      "ignore_missing": true
    }
  },
  {
    "inference": {
      "field_map": {
        "pdfcontent": "text_field"
      },
      "model_id": ".multilingual-e5-small_linux-x86_64",
      "target_field": "ml.inference.pdfcontent",
      "on_failure": [
        {
          "append": {
            "field": "_source._ingest.inference_errors",
            "allow_duplicates": false,
            "value": [
              {
                "message": "Processor 'inference' in pipeline 'ml.inference.test' failed for field 'pdfcontent' with message '{{ _ingest.on_failure_message }}'",
                "pipeline": "ml.inference.test",
                "timestamp": "{{{ _ingest.timestamp }}}"
              }
            ]
          }
        }
      ]
    }
  },
  {
    "remove": {
      "field": "ml.inference.productslist",
      "ignore_missing": true
    }
  },
  {
    "inference": {
      "field_map": {
        "productslist": "text_field"
      },
      "model_id": ".multilingual-e5-small_linux-x86_64",
      "target_field": "ml.inference.productslist",
      "on_failure": [
        {
          "append": {
            "field": "_source._ingest.inference_errors",
            "allow_duplicates": false,
            "value": [
              {
                "message": "Processor 'inference' in pipeline 'ml.inference.test' failed for field 'productslist' with message '{{ _ingest.on_failure_message }}'",
                "pipeline": "ml.inference.test",
                "timestamp": "{{{ _ingest.timestamp }}}"
              }
            ]
          }
        }
      ]
    }
  },
  {
    "remove": {
      "field": "ml.inference.audience",
      "ignore_missing": true
    }
  },
  {
    "inference": {
      "field_map": {
        "audience": "text_field"
      },
      "model_id": ".multilingual-e5-small_linux-x86_64",
      "target_field": "ml.inference.audience",
      "on_failure": [
        {
          "append": {
            "field": "_source._ingest.inference_errors",
            "allow_duplicates": false,
            "value": [
              {
                "message": "Processor 'inference' in pipeline 'ml.inference.test' failed for field 'audience' with message '{{ _ingest.on_failure_message }}'",
                "pipeline": "ml.inference.test",
                "timestamp": "{{{ _ingest.timestamp }}}"
              }
            ]
          }
        }
      ]
    }
  },
  {
    "remove": {
      "field": "ml.inference.filename",
      "ignore_missing": true
    }
  },
  {
    "inference": {
      "field_map": {
        "filename": "text_field"
      },
      "model_id": ".multilingual-e5-small_linux-x86_64",
      "target_field": "ml.inference.filename",
      "on_failure": [
        {
          "append": {
            "field": "_source._ingest.inference_errors",
            "allow_duplicates": false,
            "value": [
              {
                "message": "Processor 'inference' in pipeline 'ml.inference.test' failed for field 'filename' with message '{{ _ingest.on_failure_message }}'",
                "pipeline": "ml.inference.test",
                "timestamp": "{{{ _ingest.timestamp }}}"
              }
            ]
          }
        }
      ]
    }
  },
  {
    "append": {
      "field": "_source._ingest.processors",
      "value": [
        {
          "model_version": "12.0.0",
          "pipeline": "ml.inference.test",
          "processed_timestamp": "{{{ _ingest.timestamp }}}",
          "types": [
            "pytorch",
            "text_embedding"
          ]
        }
      ]
    }
  }
]

Failure processors:

[
  {
    "set": {
      "field": "ml.inference_failure",
      "value": "{{_ingest.on_failure_message}}"
    }
  },
  {
    "reroute": {
      "destination": "techpubs-failed-docs"
    }
  },
  {
    "set": {
      "field": "_index",
      "value": "failed-techpubs"
    }
  }
]

Hi @Sanjay_Samanaboina sorry for the slow response

It looks like you've taken inspiration from this blog on chunking — do you get results if you run the chunker pipeline directly? Remember to set up your index mapping for nested dense vectors first.

If the chunker pipeline works I suggest you start with a very simple ml-inference-test pipeline and test that. Next add more ingest processors in order to discover which one is causing the errors.

Hi @dkyle yeah, that was a slow response, but thanks!
Actually, there were no errors reported; it was only after adding the chunker to the pipeline that I was not able to ingest my 113k documents.
Later, I added scroll=30m to my request; it took more than half a day, but then it finished ingesting all the documents. The issue is resolved now.

1 Like

Fantastic! I'm glad you managed to ingest your docs.