Fastest way to retrieve all ids in an index?

I have an index which has around 300 million documents. What is the fastest recommended way to retrieve all the document IDs from the index?

Currently I am using the Python script below, which does a scan-and-scroll to retrieve all the IDs. However, this takes around 20-24 hours to fetch all of them.

import csv
import json
import sys
import requests

indexName = sys.argv[1]
url = "http://<<es_host>>/" + indexName + "/_search"

querystring = {"scroll":"1m"}

payload = {
    "query": {"match_all": {}},
    "size": 1000,
    "stored_fields": []   # return no stored fields, only hit metadata (_id)
}

headers = {
    "Content-Type": "application/json"
}
response = requests.request("POST", url, data=json.dumps(payload), headers=headers, params=querystring)
response = json.loads(response.text)

f = open('results.txt', 'w')
scroll_url = "http://<<es>>/_search/scroll"

while True:
    hits = response['hits']['hits']
    if len(hits) == 0:
        break
    for hit in hits:
        f.write(hit['_id'] + '\n')

    payload = {
        "scroll_id": response['_scroll_id'],
        "scroll": "1m"
    }

    response = requests.request("POST", scroll_url, data=json.dumps(payload), headers=headers)
    response = json.loads(response.text)

f.close()

I don't know what's in your data, but I think there are a few options you can consider:

  • do a scan and scroll on the document's timestamp, so you can have multiple processes running on different date/time ranges

  • do a scan and scroll on the document's "type/category/topic/etc."
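The first option could be sketched as follows: split the overall time window into equal slices and build one range query per worker process, so each scroll covers a disjoint slice of the index. This is a sketch only; the @timestamp field name and the dates are placeholder assumptions, and each worker would POST its payload to /<index>/_search?scroll=1m as in the script above.

```python
from datetime import datetime

def build_range_queries(start, end, workers, field="@timestamp"):
    """Return one scroll-seed payload per worker, covering [start, end)."""
    step = (end - start) / workers
    payloads = []
    for i in range(workers):
        lo = start + step * i
        hi = start + step * (i + 1)
        payloads.append({
            "size": 1000,
            "stored_fields": [],  # only hit metadata (the _id) comes back
            "query": {"range": {field: {"gte": lo.isoformat(),
                                        "lt": hi.isoformat()}}},
        })
    return payloads

# Four workers, each scrolling one day of a four-day window.
queries = build_range_queries(datetime(2023, 1, 1), datetime(2023, 1, 5), 4)
```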

I think you could be a little faster if you sort on the internal ID _doc, as the documentation says.
Also have a look at
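A minimal sketch of what that initial request body might look like, assuming you keep the rest of the script unchanged: sorting on _doc skips scoring and returns hits in index order, which is the cheapest sort for a full scan.

```python
# Seed payload for the scroll, sorted on the internal _doc ID.
payload = {
    "size": 10000,            # larger pages mean fewer round trips
    "sort": ["_doc"],         # index order, no scoring
    "stored_fields": [],      # fetch only hit metadata, i.e. the _id
    "query": {"match_all": {}},
}
```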
