Hi everyone,
I am trying to read documents from an index using the scroll API. The problem that I am facing is that, Kibana Discover shows the hits to be 195,867, but my code misses or skips 10,000 documents.
I am not able to figure out the part I am doing wrong. Please find my code below.
DOMAIN = 'x.x.x.x'
PORT = 'y'
import json
from elasticsearch import Elasticsearch, exceptions
values_list = []
query_body = {
"size":10000,
"query":
{
"exists": {"field": "candidateSkills.skill"}
},
"_source": "candidateSkills.skill"
}
def fetch_data_from_ES(index_name):
results_dict = {}
global values_list
INDEX_NAMES = [index_name]
host = str(DOMAIN) + ":" + str(PORT)
client = Elasticsearch(host)
#client = client.params(scroll='25m')
try:
info = json.dumps(client.info(), indent=4)
print("Elasticsearch client info():", info)
except exceptions.ConnectionError as err:
print ("\nElasticsearch info() ERROR:", err)
print ("\nThe client host:", host, "is invalid or cluster is not running")
client = None
if client != None:
for index in INDEX_NAMES:
print("Fetching from index = {}".format(index))
doc_count = 0
query_body = {"size":10000,"query":{"exists": {"field": "candidateSkills.skill"}}}
resp = client.search(index = index,body = query_body,scroll = '25m')
old_scroll_id = resp['_scroll_id']
while len(resp['hits']['hits']):
try:
resp = client.scroll(scroll_id = old_scroll_id,scroll = '25m')
except:
resp = client.search(index = index,body = query_body)
if old_scroll_id != resp['_scroll_id']:
print ("Switched scroll ID")
try:
old_scroll_id = resp['_scroll_id']
except:
pass
for doc in resp['hits']['hits']:
doc_id = doc['_id']
values = doc['_source']['candidateSkills']
temp = [i['skill'] for i in values]
results_dict[doc_id] = temp
print("Creating dictionary, added {} values.".format(len(results_dict)))
return results_dict