Hi there! I'm new to Elastic and have been trying to build an information retrieval system over 500k text documents using Python and Docker (Elasticsearch 8.7).
I'm not really sure how to go about doing a hybrid search (BM25 + HNSW) with Mean Reciprocal Rank. At the moment I'm assigning what I think are 50/50 weights via the boost parameters, and I also get a warning because the body param of search() is deprecated.
How should I build a hybrid BM25 + HNSW search with the Python client that uses Mean Reciprocal Rank and can handle 500k docs?
The code I have at the moment is the following:
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import pandas as pd
if __name__ == '__main__':
    # Elasticsearch index name
    idx_name = "hnsw_big"
    # Load the embedding model (outputs 512-dim vectors, matching "dims" in the mapping below)
    model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
    # Connect to Elasticsearch over TLS with the cluster's CA certificate
    es = Elasticsearch('https://user:password@localhost:9200', ca_certs='http_ca.crt')
    if es.indices.exists(index=idx_name):
        # Get a list of all indices
        indices = es.cat.indices()
        print("Existing indices:")
        print(indices)
        # es.indices.delete(index=idx_name)
        print("Index exists!")
    else:
        # Read the CSV file ---------------------------------
        # columns: id, content, date
        # example row: 12345678 (int), "Some text to be used" (str), "2023-04-18 14:21:18.000" (str)
        df = pd.read_csv('path/test-file.csv', header=0)
        # Encode the content column with the model
        embeddings = model.encode(df['content'].tolist(), convert_to_tensor=True, normalize_embeddings=False, show_progress_bar=True)
        # Build one document dict per DataFrame row, pairing each row with its embedding
        data = [
            {'id': row['id'], 'content': row['content'], 'date': row['date'], 'embeddings': embedding.tolist()}
            for (_, row), embedding in zip(df.iterrows(), embeddings)
        ]
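        # Note: this holds all 500k embeddings in memory at once (roughly 1 GB
        # at 512 float32 dims), so encoding and indexing in chunks may be
        # needed if memory is tight.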
        stgs = {
            "number_of_shards": 1,
            "analysis": {
                "filter": {
                    "portuguese_stop": {
                        "type": "stop",
                        "stopwords": "_portuguese_"
                    },
                    "portuguese_keywords": {
                        "type": "keyword_marker",
                        "keywords": ["exemplo"]
                    },
                    "portuguese_stemmer": {
                        "type": "stemmer",
                        "language": "light_portuguese"
                    }
                },
                "analyzer": {
                    "rebuilt_portuguese": {
                        "tokenizer": "standard",
                        "filter": [
                            "lowercase",
                            "portuguese_stop",
                            "portuguese_keywords",
                            "portuguese_stemmer"
                        ]
                    }
                }
            }
        }
        mpgs = {
            "properties": {
                "content": {
                    "type": "text",
                    "analyzer": "rebuilt_portuguese"  # use the custom analyzer defined above
                },
                "embeddings": {
                    "type": "dense_vector",
                    "dims": 512,  # matches the model's output dimension
                    "index": True,
                    "similarity": "cosine",
                    "index_options": {
                        "type": "hnsw",
                        "m": 32,
                        "ef_construction": 100
                    }
                },
                "date": {
                    "type": "date",
                    "format": "yyyy-MM-dd HH:mm:ss.SSS"
                }
            }
        }
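        # (My understanding: higher "m" and "ef_construction" trade memory and
        # indexing time for better HNSW recall; not sure 32/100 is the right
        # trade-off for 500k docs.)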
        es.indices.create(index=idx_name, settings=stgs, mappings=mpgs)
        # Get a list of all indices
        indices = es.cat.indices()
        print("Existing indices:")
        print(indices)
        # Index the documents one by one (see the bulk note below)
        for doc in tqdm(data, total=len(data)):
            try:
                es.index(index=idx_name, id=doc['id'], document=doc, refresh=True)
            except Exception:
                # Swallowing failures silently loses documents; these should be logged
                pass
        # Print the total number of documents in the index
        print(es.count(index=idx_name)['count'])
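        # Indexing one document at a time with refresh=True is the slow path
        # for 500k docs; my understanding is that the client's bulk helper is
        # the usual approach at this scale. A rough, untested sketch:
        #
        #     from elasticsearch.helpers import bulk
        #     actions = ({"_index": idx_name, "_id": d["id"], "_source": d} for d in data)
        #     bulk(es, actions)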
    # Sanity check: fetch a known document by id
    resp = es.get(index=idx_name, id=<some_id_to_be_tested>)
    print("Document:", resp['_source'])
    # Define the query text and encode it with the same model
    query = "<Some_query_to_be_tested>"
    query_vector = model.encode(query, normalize_embeddings=False, convert_to_tensor=True).tolist()
    k = 5
    # Define the hybrid Elasticsearch query: BM25 match plus HNSW ANN, boost 0.5 each
    es_query = {
        "query": {
            "match": {
                "content": {
                    "query": query,
                    "boost": 0.5
                }
            }
        },
        "knn": {
            "field": "embeddings",
            "query_vector": query_vector,
            "k": k,
            "num_candidates": 60,
            "boost": 0.5
        },
        "size": k * 2
    }
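    # As far as I understand, with a top-level "knn" next to "query" the final
    # score is the sum of the boosted scores (roughly 0.5 * BM25 + 0.5 * vector
    # similarity), i.e. score-based fusion rather than rank-based fusion.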
    # Execute the Elasticsearch query
    results = es.search(
        index=idx_name,
        request_timeout=30,
        body=es_query,  # this is what triggers the DeprecationWarning
        explain=True
    )
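    # The 8.x client also accepts the request components as keyword arguments,
    # which avoids the deprecated body param; if I read the docs correctly this
    # is equivalent:
    #
    #     results = es.search(
    #         index=idx_name,
    #         query=es_query["query"],
    #         knn=es_query["knn"],
    #         size=es_query["size"],
    #         explain=True,
    #         request_timeout=30,
    #     )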
    # Print the search results
    for hit in results["hits"]["hits"]:
        print(f"Document ID: {hit['_id']}")
        print(f"Document Content: {hit['_source']['content']}")
        print(f"Document Date: {hit['_source']['date']}")
        print(f"Document Score: {hit['_score']}")
        print("----------")