Dense vector field in nested object

Hi, I tried two different index mappings for my file content indexing, where each file's content is split into separate text sections with corresponding text section embeddings.

Flat file section index mapping

In this case, I index each file text section as its own Elasticsearch document. An example index mapping is as follows:

{
    "mappings": {
        "properties": {
            "file_id": {
                "type": "keyword"
            },
            "file_name": {
                "type": "keyword"
            },
            "file_number_of_sections": {
                "type": "integer"
            },
            "file_section_number": {
                "type": "integer"
            },
            "file_section_text": {
                "type": "text",
                "index": true
            },
            "file_section_embedding": {
                "type": "dense_vector",
                "dims": 1024,
                "index": true
            }
        }
    }
}

Nested file section index mapping

In this mapping, we map each file content section as a nested object. An example mapping is as follows:

{
    "mappings": {
        "properties": {
            "file_id": {
                "type": "keyword"
            },
            "file_name": {
                "type": "keyword"
            },
            "file_sections": {
                "type": "nested",
                "properties": {
                    "file_section_number": {
                        "type": "integer"
                    },
                    "file_section_text": {
                        "type": "text",
                        "index": true
                    },
                    "file_section_embedding": {
                        "type": "dense_vector",
                        "dims": 1024,
                        "index": true
                    }
                }
            }
        }
    }
}
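
In both experiments, the embedding fields are excluded from _source at the mapping level (the exact exclusions used are described in the question below). A minimal sketch of such a configuration for the nested case, with the section properties omitted for brevity:

{
    "mappings": {
        "_source": {
            "excludes": [
                "file_sections.file_section_embedding"
            ]
        }
    }
}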

Question

With the 1st mapping, if we exclude the dense vector field from _source, we still have the raw vector values stored and quantized in the file_section_embedding field, as the disk usage API shows.
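
For reference, a sketch of the call (the analyze index disk usage endpoint requires the run_expensive_tasks flag):

POST /file_flat_1024_exclude_vec_3/_disk_usage?run_expensive_tasks=true

The response: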

{
    "_shards": {
        "total": 2,
        "successful": 2,
        "failed": 0
    },
    "file_flat_1024_exclude_vec_3": {
        "store_size": "63.6gb",
        "store_size_in_bytes": 68343021736,
        "all_fields": {
            "total": "63.6gb",
            "total_in_bytes": 68327502483,
            "inverted_index": {
                "total": "2.9gb",
                "total_in_bytes": 3133941756
            },
            "stored_fields": "10.9gb",
            "stored_fields_in_bytes": 11706725438,
            "doc_values": "97.9mb",
            "doc_values_in_bytes": 102683766,
            "points": "90mb",
            "points_in_bytes": 94450574,
            "norms": "9.8mb",
            "norms_in_bytes": 10375997,
            "term_vectors": "0b",
            "term_vectors_in_bytes": 0,
            "knn_vectors": "49.6gb",
            "knn_vectors_in_bytes": 53279324952
        },
        "fields": {
            "_recovery_source": {
                "total": "5gb",
                "total_in_bytes": 5447105611,
                "inverted_index": {
                    "total": "0b",
                    "total_in_bytes": 0
                },
                "stored_fields": "5gb",
                "stored_fields_in_bytes": 5446786083,
                "doc_values": "312kb",
                "doc_values_in_bytes": 319528,
                "points": "0b",
                "points_in_bytes": 0,
                "norms": "0b",
                "norms_in_bytes": 0,
                "term_vectors": "0b",
                "term_vectors_in_bytes": 0,
                "knn_vectors": "0b",
                "knn_vectors_in_bytes": 0
            },
            "_source": {
                "total": "5.7gb",
                "total_in_bytes": 6160547735,
                "inverted_index": {
                    "total": "0b",
                    "total_in_bytes": 0
                },
                "stored_fields": "5.7gb",
                "stored_fields_in_bytes": 6160547735,
                "doc_values": "0b",
                "doc_values_in_bytes": 0,
                "points": "0b",
                "points_in_bytes": 0,
                "norms": "0b",
                "norms_in_bytes": 0,
                "term_vectors": "0b",
                "term_vectors_in_bytes": 0,
                "knn_vectors": "0b",
                "knn_vectors_in_bytes": 0
            },
            "file_number_of_sections": {
                "total": "948.1kb",
                "total_in_bytes": 970855,
                "inverted_index": {
                    "total": "0b",
                    "total_in_bytes": 0
                },
                "stored_fields": "0b",
                "stored_fields_in_bytes": 0,
                "doc_values": "507.2kb",
                "doc_values_in_bytes": 519411,
                "points": "440.8kb",
                "points_in_bytes": 451444,
                "norms": "0b",
                "norms_in_bytes": 0,
                "term_vectors": "0b",
                "term_vectors_in_bytes": 0,
                "knn_vectors": "0b",
                "knn_vectors_in_bytes": 0
            },
            "file_section_embedding": {
                "total": "49.6gb",
                "total_in_bytes": 53279324952,
                "inverted_index": {
                    "total": "0b",
                    "total_in_bytes": 0
                },
                "stored_fields": "0b",
                "stored_fields_in_bytes": 0,
                "doc_values": "0b",
                "doc_values_in_bytes": 0,
                "points": "0b",
                "points_in_bytes": 0,
                "norms": "0b",
                "norms_in_bytes": 0,
                "term_vectors": "0b",
                "term_vectors_in_bytes": 0,
                "knn_vectors": "49.6gb",
                "knn_vectors_in_bytes": 53279324952
            },
            "file_section_embedding._magnitude": {
                "total": "24.7mb",
                "total_in_bytes": 25939027,
                "inverted_index": {
                    "total": "0b",
                    "total_in_bytes": 0
                },
                "stored_fields": "0b",
                "stored_fields_in_bytes": 0,
                "doc_values": "24.7mb",
                "doc_values_in_bytes": 25939027,
                "points": "0b",
                "points_in_bytes": 0,
                "norms": "0b",
                "norms_in_bytes": 0,
                "term_vectors": "0b",
                "term_vectors_in_bytes": 0,
                "knn_vectors": "0b",
                "knn_vectors_in_bytes": 0
            },
            "file_section_text": {
                "total": "2.8gb",
                "total_in_bytes": 3035591481,
                "inverted_index": {
                    "total": "2.8gb",
                    "total_in_bytes": 3025215484
                },
                "stored_fields": "0b",
                "stored_fields_in_bytes": 0,
                "doc_values": "0b",
                "doc_values_in_bytes": 0,
                "points": "0b",
                "points_in_bytes": 0,
                "norms": "9.8mb",
                "norms_in_bytes": 10375997,
                "term_vectors": "0b",
                "term_vectors_in_bytes": 0,
                "knn_vectors": "0b",
                "knn_vectors_in_bytes": 0
            }
        }
    }
}

However, with the nested file sections in the 2nd mapping, if we exclude file_sections.file_section_embedding from _source, most of the vectors do not seem to get indexed: knn_vectors drops to 2.3gb, far below the 49.6gb of the flat index, as the following response from the disk usage API shows:

{
    "_shards": {
        "total": 2,
        "successful": 2,
        "failed": 0
    },
    "file_nested_1024_exclude_vec_4": {
        "store_size": "12.4gb",
        "store_size_in_bytes": 13318206724,
        "all_fields": {
            "total": "12.3gb",
            "total_in_bytes": 13310363394,
            "inverted_index": {
                "total": "2.8gb",
                "total_in_bytes": 3098251993
            },
            "stored_fields": "7gb",
            "stored_fields_in_bytes": 7519673957,
            "doc_values": "53.7mb",
            "doc_values_in_bytes": 56362370,
            "points": "64.2mb",
            "points_in_bytes": 67421547,
            "norms": "11.6mb",
            "norms_in_bytes": 12245676,
            "term_vectors": "0b",
            "term_vectors_in_bytes": 0,
            "knn_vectors": "2.3gb",
            "knn_vectors_in_bytes": 2556407851
        },
        "fields": {
            "_nested_path": {
                "total": "488.7kb",
                "total_in_bytes": 500506,
                "inverted_index": {
                    "total": "488.7kb",
                    "total_in_bytes": 500506
                },
                "stored_fields": "0b",
                "stored_fields_in_bytes": 0,
                "doc_values": "0b",
                "doc_values_in_bytes": 0,
                "points": "0b",
                "points_in_bytes": 0,
                "norms": "0b",
                "norms_in_bytes": 0,
                "term_vectors": "0b",
                "term_vectors_in_bytes": 0,
                "knn_vectors": "0b",
                "knn_vectors_in_bytes": 0
            },
            "_recovery_source": {
                "total": "2.2gb",
                "total_in_bytes": 2467104023,
                "inverted_index": {
                    "total": "0b",
                    "total_in_bytes": 0
                },
                "stored_fields": "2.2gb",
                "stored_fields_in_bytes": 2467103063,
                "doc_values": "960b",
                "doc_values_in_bytes": 960,
                "points": "0b",
                "points_in_bytes": 0,
                "norms": "0b",
                "norms_in_bytes": 0,
                "term_vectors": "0b",
                "term_vectors_in_bytes": 0,
                "knn_vectors": "0b",
                "knn_vectors_in_bytes": 0
            },
            "_source": {
                "total": "4.7gb",
                "total_in_bytes": 5052485652,
                "inverted_index": {
                    "total": "0b",
                    "total_in_bytes": 0
                },
                "stored_fields": "4.7gb",
                "stored_fields_in_bytes": 5052485652,
                "doc_values": "0b",
                "doc_values_in_bytes": 0,
                "points": "0b",
                "points_in_bytes": 0,
                "norms": "0b",
                "norms_in_bytes": 0,
                "term_vectors": "0b",
                "term_vectors_in_bytes": 0,
                "knn_vectors": "0b",
                "knn_vectors_in_bytes": 0
            },
            "file_sections.file_section_embedding": {
                "total": "2.3gb",
                "total_in_bytes": 2556407851,
                "inverted_index": {
                    "total": "0b",
                    "total_in_bytes": 0
                },
                "stored_fields": "0b",
                "stored_fields_in_bytes": 0,
                "doc_values": "0b",
                "doc_values_in_bytes": 0,
                "points": "0b",
                "points_in_bytes": 0,
                "norms": "0b",
                "norms_in_bytes": 0,
                "term_vectors": "0b",
                "term_vectors_in_bytes": 0,
                "knn_vectors": "2.3gb",
                "knn_vectors_in_bytes": 2556407851
            },
            "file_sections.file_section_embedding._magnitude": {
                "total": "1.4mb",
                "total_in_bytes": 1511072,
                "inverted_index": {
                    "total": "0b",
                    "total_in_bytes": 0
                },
                "stored_fields": "0b",
                "stored_fields_in_bytes": 0,
                "doc_values": "1.4mb",
                "doc_values_in_bytes": 1511072,
                "points": "0b",
                "points_in_bytes": 0,
                "norms": "0b",
                "norms_in_bytes": 0,
                "term_vectors": "0b",
                "term_vectors_in_bytes": 0,
                "knn_vectors": "0b",
                "knn_vectors_in_bytes": 0
            },
            "file_sections.file_section_text": {
                "total": "2.8gb",
                "total_in_bytes": 3095784524,
                "inverted_index": {
                    "total": "2.8gb",
                    "total_in_bytes": 3083538848
                },
                "stored_fields": "0b",
                "stored_fields_in_bytes": 0,
                "doc_values": "0b",
                "doc_values_in_bytes": 0,
                "points": "0b",
                "points_in_bytes": 0,
                "norms": "11.6mb",
                "norms_in_bytes": 12245676,
                "term_vectors": "0b",
                "term_vectors_in_bytes": 0,
                "knn_vectors": "0b",
                "knn_vectors_in_bytes": 0
            }
        }
    }
}

When indexing with the nested object mapping, we first create the Elasticsearch document with file_sections set to an empty array, and then use the bulk API to update the document, appending the actual section objects to file_sections.
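
Concretely, each bulk item is a scripted update that appends a batch of section objects to the nested array. A minimal sketch of one action pair, matching the script shared further below (the document id is a placeholder, and the embedding array is truncated for readability; real values must have 1024 dims):

POST /file_nested_1024_exclude_vec_4/_bulk
{ "update": { "_id": "<file_id>" } }
{ "script": { "source": "ctx._source.file_sections.addAll(params.new_object)", "params": { "new_object": [ { "file_section_number": 0, "file_section_text": "section text", "file_section_embedding": [0.1, 0.2] } ] } } }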

It would be great if anyone could help check what might be causing the issue with the 2nd indexing approach.

FYI, I tried to get the doc values via the following query:

{
  "_source": false,
  "query": {
    "nested": {
      "path": "file_sections",
      "query": {
        "match_all": {}
      },
      "inner_hits": {
        "size": 5,
        "script_fields": {
          "raw_vector": {
            "script": {
              "source": "if (doc['file_sections.file_section_embedding'].size() != 0) { return doc['file_sections.file_section_embedding'].vectorValue; } else { return null; }"
            }
          }
        }
      }
    }
  }
}

but I got all null values. However, I was able to retrieve the actual doc values from the 1st index mapping.

Hi, may I ask whether anyone could follow up on this issue report? Thanks a lot.

Hi, does anyone have any idea about the issue mentioned in this topic? Thanks a lot.

Hi @yli:

Can you please get the index stats for the two indices?

I'd like to double check that the same number of vectors and documents have been indexed in both cases.

Thanks!

Hi @Carlos_D, thanks for your reply.

Both indices contain the same number of documents, as you can see from the following partial results from the _stats API endpoint.
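
For reference, the results come from calls like the following (only the excluding index's name appears earlier in this thread; the call for the other index is analogous):

GET /file_nested_1024_exclude_vec_4/_stats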

  1. Exclude nested dense vector from _source
    {
    "_shards": {
        "total": 4,
        "successful": 4,
        "failed": 0
    },
    "_all": {
        "primaries": {
            "docs": {
                "count": 10400000,
                "deleted": 377214,
                "total_size_in_bytes": 12950719159
            },
            "shard_stats": {
                "total_count": 2
            },
            "store": {
                "size_in_bytes": 12950723707,
                "total_data_set_size_in_bytes": 12950723707,
                "reserved_in_bytes": 0
            },
    
  2. Include nested dense vector in _source
    {
    "_shards": {
        "total": 4,
        "successful": 4,
        "failed": 0
    },
    "_all": {
        "primaries": {
            "docs": {
                "count": 10400000,
                "deleted": 649222,
                "total_size_in_bytes": 247691181822
            },
            "shard_stats": {
                "total_count": 2
            },
            "store": {
                "size_in_bytes": 247691189206,
                "total_data_set_size_in_bytes": 247691189206,
                "reserved_in_bytes": 0
            },
    

Let me know if you need more information.

Hi @yli:

I have not been able to reproduce your issue. I've used the following code to generate some synthetic data according to your mapping:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import numpy as np
import random
import string

# Elasticsearch connection settings
ES_HOST = "http://localhost:9200"
INDEX_NAME = "documents"

# Connect to Elasticsearch
es = Elasticsearch([ES_HOST], basic_auth=("elastic-admin", "elastic-password"))

# Define the index mapping with dense_vector fields
def create_index():
    mapping = {
        "mappings": {
            "properties": {
                "file_id": {
                    "type": "keyword"
                },
                "file_name": {
                    "type": "keyword"
                },
                "file_sections": {
                    "type": "nested",
                    "properties": {
                        "file_section_number": {
                            "type": "integer"
                        },
                        "file_section_text": {
                            "type": "text",
                            "index": True
                        },
                        "file_section_embedding": {
                            "type": "dense_vector",
                            "dims": 1024,
                            "index": True
                        }
                    }
                }
            },
            "_source": {
                "excludes": [
                    "file_sections.*"
                ]
            }
        }
    }

    if es.indices.exists(index=INDEX_NAME):
        es.indices.delete(index=INDEX_NAME)
    es.indices.create(index=INDEX_NAME, body=mapping)
    print(f"Index '{INDEX_NAME}' created.")

# Generate random vector data
def random_vector(dim):
    return list(np.random.rand(dim))

# Generate random text
def random_text(length=50):
    return ''.join(random.choices(string.ascii_letters + string.digits + ' ', k=length))

def random_nested_embeddings(num=4):
    return [{"file_section_number":  np.random.randint(0, 1000), "file_section_text": random_text(10), "file_section_embedding": random_vector(1024)} for _ in range(num)]    

# Insert random documents into Elasticsearch
def insert_documents():
    docs = []
    for i in range(1, 1000):
        doc = {
            "_index": INDEX_NAME, "_id": i, "_source": {
                "file_id": random_text(20),
                "file_name": random_text(20),
                "file_sections": random_nested_embeddings(4)
            }
        }
        docs.append(doc)
    success, failed = bulk(es, docs, raise_on_error=False)
    print(f"Documents inserted: {success}, Failed: {failed}")
    

if __name__ == "__main__":
    create_index()
    insert_documents()

I can get doc vector values when executing your script:

GET documents/_search
{
  "_source": false,
  "query": {
    "nested": {
      "path": "file_sections",
      "query": {
        "match_all": {}
      },
      "inner_hits": {
        "size": 5,
        "script_fields": {
          "raw_vector": {
            "script": {
              "source": "if (doc['file_sections.file_section_embedding'].size() != 0) { return doc['file_sections.file_section_embedding'].vectorValue; } else { return null; }"
            }
          }
        }
      }
    }
  }
}

Comparing with an index that does not exclude the source, I also don't see that difference in total_in_bytes from the disk usage API; they're almost equivalent.

Can you please double check that your ingestion process is correct? Trying to ingest your data without excluding sources might give us a clue about what is missing.

Hi @Carlos_D, thanks a lot for investing your time and effort, and sorry for the delay in replying to you.

In our case, we have files that are more than 500 MB in size, so I was not able to insert a complete file in a single request and had to rely on updates.
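
For context, a single index request of that size would exceed Elasticsearch's HTTP request size limit, which defaults to 100mb via the following elasticsearch.yml setting:

http.max_content_length: 100mb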

The following contains code snippets from my indexing strategy.

from typing import Generator

from elasticsearch import helpers
from pydantic import BaseModel


class FileSection(BaseModel):
    """Represents a section inside a file."""
    file_page_num: int
    file_section_number: int
    file_section_text: str
    file_section_embedding: list[float]

class FileNestedMapping(BaseModel):
    """Represents the content of a file."""
    file_id: str
    file_name: str
    file_creation_date: str
    file_modification_date: str
    file_number_of_pages: int

    file_sections: list[FileSection]


async def _bulk_update(
            self,
            doc_gen: Generator[dict, None, None],
            doc_id: str,
            batch_size=1000,
            refresh=False,
            timeout="1800s"
    ):
        """Bulk update list of data into Elasticsearch.

        Args:
            doc_gen: Generator of document sections to update
            doc_id: Document ID to update
            batch_size: Number of sections to insert in each batch
        """

        def section_batch_gen_func():
            # Yield batches of up to batch_size sections until doc_gen is exhausted.
            exhausted = False
            while not exhausted:
                section_batch = []
                try:
                    for _ in range(batch_size):
                        next_doc = next(doc_gen)
                        section_batch.append(next_doc)
                except StopIteration:
                    exhausted = True
                if section_batch:
                    # Skip the empty trailing batch that occurs when the section
                    # count is an exact multiple of batch_size.
                    yield section_batch

        section_batch_gen = section_batch_gen_func()

        await helpers.async_bulk(
            client=self._es_client,
            actions=(
                {
                    "_op_type": "update",
                    "_index": self._index_name,
                    "_id": doc_id,
                    "script": {
                        "source": "ctx._source.file_sections.addAll(params.new_object)",
                        "params": {
                            "new_object": section_batch
                        }
                    },
                }
                for section_batch in section_batch_gen
            ),
            timeout=timeout,
            refresh=refresh,
        )


async def index(self, data: FileNestedMapping):

        def doc_section_gen_func():
            for file_section in data.file_sections:
                file_section_dict = file_section.model_dump()
                # truncate the embedding to the mapped dimension (1024)
                file_section_dict["file_section_embedding"] = file_section_dict["file_section_embedding"][:1024]
                yield file_section_dict

        doc_section_gen = doc_section_gen_func()

        # Create the main document without the nested sections
        data_without_sections = data.model_dump(exclude={"file_sections"})

        # Add a single section object first; otherwise the empty array would not be indexed when excluded from `_source`
        data_without_sections["file_sections"] = [next(doc_section_gen)]

        await self._es_client.index(
            index=self._index_name,
            document=data_without_sections,
            id=data.file_id,
            refresh=False,
        )
        
        # Index the remaining nested sections
        await self._bulk_update(doc_gen=doc_section_gen, doc_id=data.file_id)

The methods _bulk_update and index are defined in the same Python class.
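
For completeness, a rough sketch of how we invoke it; the wrapper class holding _es_client and _index_name is not shown above, so FileIndexer and its constructor are hypothetical names:

import asyncio

from elasticsearch import AsyncElasticsearch


async def main():
    es_client = AsyncElasticsearch("http://localhost:9200")
    # FileIndexer is a hypothetical name for the class defining index() and _bulk_update()
    indexer = FileIndexer(es_client=es_client, index_name="file_nested_1024_exclude_vec_4")
    data = FileNestedMapping(
        file_id="file-1",
        file_name="example.pdf",
        file_creation_date="2024-01-01",
        file_modification_date="2024-01-02",
        file_number_of_pages=1,
        file_sections=[
            FileSection(
                file_page_num=1,
                file_section_number=0,
                file_section_text="hello world",
                file_section_embedding=[0.1] * 1024,
            )
        ],
    )
    await indexer.index(data)
    await es_client.close()

asyncio.run(main())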

I tried both including and excluding the embeddings from _source, and got the differences mentioned above.

Let me know if you need more information, thanks a lot.

Can you check that vectors are included correctly in the nested strategy when you're not excluding sources in the mapping?

Perform the indexing of a few docs, and then do a GET of some docs to double check that the vectors are present in the nested fields.

That way we will ensure that the indexing process is correct and vectors are added to the nested fields.
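
For example, after creating the index without the _source excludes block and ingesting a few docs, something like the following (the id matches those generated by the script above):

GET documents/_doc/1

and then check that each entry of file_sections in the returned _source contains a populated file_section_embedding array.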