Hi, I tried two different index mappings for indexing my file content. In both cases, the file content is split into separate text sections, each with a corresponding text section embedding.
Flat file section index mapping
In this case, I index each file text section as a separate Elasticsearch document. An example index mapping is as follows:
{
"mappings": {
"properties": {
"file_id": {
"type": "keyword"
},
"file_name": {
"type": "keyword"
},
"file_num_of_sections": {
"type": "integer"
},
"file_section_number":{
"type": "integer"
},
"file_section_text": {
"type": "text",
"index": true
},
"file_section_embedding": {
"type": "dense_vector",
"dims": 1024,
"index": true
}
}
}
}
Nested file section index mapping
In this mapping, we map each file content section as a nested object. An example mapping is as follows:
{
"mappings": {
"properties": {
"file_id": {
"type": "keyword"
},
"file_name": {
"type": "keyword"
},
"file_sections": {
"type": "nested",
"properties": {
"file_section_number":{
"type": "integer"
},
"file_section_text": {
"type": "text",
"index": true
},
"file_section_embedding": {
"type": "dense_vector",
"dims": 1024,
"index": true
}
}
}
}
}
}
Question
With the 1st mapping, if we exclude the dense vector field from _source, we still have the raw vector values stored and quantized in the file_section_embedding field. The following is the response from the disk usage API:
{
"_shards": {
"total": 2,
"successful": 2,
"failed": 0
},
"file_flat_1024_exclude_vec_3": {
"store_size": "63.6gb",
"store_size_in_bytes": 68343021736,
"all_fields": {
"total": "63.6gb",
"total_in_bytes": 68327502483,
"inverted_index": {
"total": "2.9gb",
"total_in_bytes": 3133941756
},
"stored_fields": "10.9gb",
"stored_fields_in_bytes": 11706725438,
"doc_values": "97.9mb",
"doc_values_in_bytes": 102683766,
"points": "90mb",
"points_in_bytes": 94450574,
"norms": "9.8mb",
"norms_in_bytes": 10375997,
"term_vectors": "0b",
"term_vectors_in_bytes": 0,
"knn_vectors": "49.6gb",
"knn_vectors_in_bytes": 53279324952
},
"fields": {
"_recovery_source": {
"total": "5gb",
"total_in_bytes": 5447105611,
"inverted_index": {
"total": "0b",
"total_in_bytes": 0
},
"stored_fields": "5gb",
"stored_fields_in_bytes": 5446786083,
"doc_values": "312kb",
"doc_values_in_bytes": 319528,
"points": "0b",
"points_in_bytes": 0,
"norms": "0b",
"norms_in_bytes": 0,
"term_vectors": "0b",
"term_vectors_in_bytes": 0,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"_source": {
"total": "5.7gb",
"total_in_bytes": 6160547735,
"inverted_index": {
"total": "0b",
"total_in_bytes": 0
},
"stored_fields": "5.7gb",
"stored_fields_in_bytes": 6160547735,
"doc_values": "0b",
"doc_values_in_bytes": 0,
"points": "0b",
"points_in_bytes": 0,
"norms": "0b",
"norms_in_bytes": 0,
"term_vectors": "0b",
"term_vectors_in_bytes": 0,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"file_number_of_sections": {
"total": "948.1kb",
"total_in_bytes": 970855,
"inverted_index": {
"total": "0b",
"total_in_bytes": 0
},
"stored_fields": "0b",
"stored_fields_in_bytes": 0,
"doc_values": "507.2kb",
"doc_values_in_bytes": 519411,
"points": "440.8kb",
"points_in_bytes": 451444,
"norms": "0b",
"norms_in_bytes": 0,
"term_vectors": "0b",
"term_vectors_in_bytes": 0,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"file_section_embedding": {
"total": "49.6gb",
"total_in_bytes": 53279324952,
"inverted_index": {
"total": "0b",
"total_in_bytes": 0
},
"stored_fields": "0b",
"stored_fields_in_bytes": 0,
"doc_values": "0b",
"doc_values_in_bytes": 0,
"points": "0b",
"points_in_bytes": 0,
"norms": "0b",
"norms_in_bytes": 0,
"term_vectors": "0b",
"term_vectors_in_bytes": 0,
"knn_vectors": "49.6gb",
"knn_vectors_in_bytes": 53279324952
},
"file_section_embedding._magnitude": {
"total": "24.7mb",
"total_in_bytes": 25939027,
"inverted_index": {
"total": "0b",
"total_in_bytes": 0
},
"stored_fields": "0b",
"stored_fields_in_bytes": 0,
"doc_values": "24.7mb",
"doc_values_in_bytes": 25939027,
"points": "0b",
"points_in_bytes": 0,
"norms": "0b",
"norms_in_bytes": 0,
"term_vectors": "0b",
"term_vectors_in_bytes": 0,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"file_section_text": {
"total": "2.8gb",
"total_in_bytes": 3035591481,
"inverted_index": {
"total": "2.8gb",
"total_in_bytes": 3025215484
},
"stored_fields": "0b",
"stored_fields_in_bytes": 0,
"doc_values": "0b",
"doc_values_in_bytes": 0,
"points": "0b",
"points_in_bytes": 0,
"norms": "9.8mb",
"norms_in_bytes": 10375997,
"term_vectors": "0b",
"term_vectors_in_bytes": 0,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
}
}
}
}
However, with the nested file sections in the 2nd mapping, if we exclude file_sections.file_section_embedding from _source, the vector field file_section_embedding does not seem to be indexed properly: the disk usage API reports only 2.3gb of knn_vectors, compared with 49.6gb for the flat index. The response is as follows:
{
"_shards": {
"total": 2,
"successful": 2,
"failed": 0
},
"file_nested_1024_exclude_vec_4": {
"store_size": "12.4gb",
"store_size_in_bytes": 13318206724,
"all_fields": {
"total": "12.3gb",
"total_in_bytes": 13310363394,
"inverted_index": {
"total": "2.8gb",
"total_in_bytes": 3098251993
},
"stored_fields": "7gb",
"stored_fields_in_bytes": 7519673957,
"doc_values": "53.7mb",
"doc_values_in_bytes": 56362370,
"points": "64.2mb",
"points_in_bytes": 67421547,
"norms": "11.6mb",
"norms_in_bytes": 12245676,
"term_vectors": "0b",
"term_vectors_in_bytes": 0,
"knn_vectors": "2.3gb",
"knn_vectors_in_bytes": 2556407851
},
"fields": {
"_nested_path": {
"total": "488.7kb",
"total_in_bytes": 500506,
"inverted_index": {
"total": "488.7kb",
"total_in_bytes": 500506
},
"stored_fields": "0b",
"stored_fields_in_bytes": 0,
"doc_values": "0b",
"doc_values_in_bytes": 0,
"points": "0b",
"points_in_bytes": 0,
"norms": "0b",
"norms_in_bytes": 0,
"term_vectors": "0b",
"term_vectors_in_bytes": 0,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"_recovery_source": {
"total": "2.2gb",
"total_in_bytes": 2467104023,
"inverted_index": {
"total": "0b",
"total_in_bytes": 0
},
"stored_fields": "2.2gb",
"stored_fields_in_bytes": 2467103063,
"doc_values": "960b",
"doc_values_in_bytes": 960,
"points": "0b",
"points_in_bytes": 0,
"norms": "0b",
"norms_in_bytes": 0,
"term_vectors": "0b",
"term_vectors_in_bytes": 0,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"_source": {
"total": "4.7gb",
"total_in_bytes": 5052485652,
"inverted_index": {
"total": "0b",
"total_in_bytes": 0
},
"stored_fields": "4.7gb",
"stored_fields_in_bytes": 5052485652,
"doc_values": "0b",
"doc_values_in_bytes": 0,
"points": "0b",
"points_in_bytes": 0,
"norms": "0b",
"norms_in_bytes": 0,
"term_vectors": "0b",
"term_vectors_in_bytes": 0,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"file_sections.file_section_embedding": {
"total": "2.3gb",
"total_in_bytes": 2556407851,
"inverted_index": {
"total": "0b",
"total_in_bytes": 0
},
"stored_fields": "0b",
"stored_fields_in_bytes": 0,
"doc_values": "0b",
"doc_values_in_bytes": 0,
"points": "0b",
"points_in_bytes": 0,
"norms": "0b",
"norms_in_bytes": 0,
"term_vectors": "0b",
"term_vectors_in_bytes": 0,
"knn_vectors": "2.3gb",
"knn_vectors_in_bytes": 2556407851
},
"file_sections.file_section_embedding._magnitude": {
"total": "1.4mb",
"total_in_bytes": 1511072,
"inverted_index": {
"total": "0b",
"total_in_bytes": 0
},
"stored_fields": "0b",
"stored_fields_in_bytes": 0,
"doc_values": "1.4mb",
"doc_values_in_bytes": 1511072,
"points": "0b",
"points_in_bytes": 0,
"norms": "0b",
"norms_in_bytes": 0,
"term_vectors": "0b",
"term_vectors_in_bytes": 0,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"file_sections.file_section_text": {
"total": "2.8gb",
"total_in_bytes": 3095784524,
"inverted_index": {
"total": "2.8gb",
"total_in_bytes": 3083538848
},
"stored_fields": "0b",
"stored_fields_in_bytes": 0,
"doc_values": "0b",
"doc_values_in_bytes": 0,
"points": "0b",
"points_in_bytes": 0,
"norms": "11.6mb",
"norms_in_bytes": 12245676,
"term_vectors": "0b",
"term_vectors_in_bytes": 0,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
}
}
}
}
When indexing with the nested object mapping, we first create the Elasticsearch document with file_sections set to an empty array. We then use the bulk API to update the document, adding the actual content objects to file_sections.
It would be great if anyone could help check what might be causing the issue with the 2nd index.
FYI, I tried to get the doc value via the following query
{
"_source": false,
"query": {
"nested": {
"path": "file_sections",
"query": {
"match_all": {}
},
"inner_hits": {
"size": 5,
"script_fields": {
"raw_vector": {
"script": {
"source": "if (doc['file_sections.file_section_embedding'].size() != 0) { return doc['file_sections.file_section_embedding'].vectorValue; } else { return null; }"
}
}
}
}
}
}
}
but I got null for all values. However, I was able to retrieve the actual doc values from the index that uses the 1st mapping.