Issues with Elasticsearch Scroll API results

Hi everyone,

I am trying to read documents from an index using the scroll API. The problem that I am facing is that, Kibana Discover shows the hits to be 195,867, but my code misses or skips 10,000 documents.

I am not able to figure out the part I am doing wrong. Please find my code below.

DOMAIN = 'x.x.x.x'
PORT = 'y'
import json
from elasticsearch import Elasticsearch, exceptions
values_list = []
query_body = {
    "size":10000,
    "query":
    {
        "exists": {"field": "candidateSkills.skill"}
    },
    "_source": "candidateSkills.skill"
}
def fetch_data_from_ES(index_name):
    results_dict = {}
    global values_list
    INDEX_NAMES = [index_name]
    host = str(DOMAIN) + ":" + str(PORT)
    client = Elasticsearch(host)
    #client = client.params(scroll='25m')
    try:
        info = json.dumps(client.info(), indent=4)
        print("Elasticsearch client info():", info)

    except exceptions.ConnectionError as err:
        print ("\nElasticsearch info() ERROR:", err)
        print ("\nThe client host:", host, "is invalid or cluster is not running")

        client = None
    if client != None:
        for index in INDEX_NAMES:
            print("Fetching from index = {}".format(index))
            doc_count = 0
            query_body = {"size":10000,"query":{"exists": {"field": "candidateSkills.skill"}}}
            resp = client.search(index = index,body = query_body,scroll = '25m')
            old_scroll_id = resp['_scroll_id']
            while len(resp['hits']['hits']):
                try:
                    resp = client.scroll(scroll_id = old_scroll_id,scroll = '25m')
                except:
                    resp = client.search(index = index,body = query_body)
                if old_scroll_id != resp['_scroll_id']:
                    print ("Switched scroll ID")
                try:
                    old_scroll_id = resp['_scroll_id']
                except:
                    pass
                for doc in resp['hits']['hits']:
                    doc_id = doc['_id']
                    values = doc['_source']['candidateSkills']
                    temp = [i['skill'] for i in values]
                    results_dict[doc_id] = temp
                print("Creating dictionary, added {} values.".format(len(results_dict)))
    return results_dict

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.