Cross-posted on SO: http://stackoverflow.com/questions/32560339/sending-too-many-ids-in-elasticsearch-py-mtermvectors
I'm trying to use mtermvectors() in the Python elasticsearch-py client to retrieve term vectors for a large number of documents, but I run into an error when I request too many documents at once. For example, requesting term vectors for 200 documents:
>>> es.mtermvectors(index=index_name, doc_type=index_type, ids=docs.keys()[0:200])
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/elasticsearch/connection/http_urllib3.py", line 74, in perform_request
response = self.pool.urlopen(method, url, body, retries=False, headers=self.headers, **kw)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/urllib3/connectionpool.py", line 607, in urlopen
_stacktrace=sys.exc_info()[2])
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/urllib3/util/retry.py", line 222, in increment
raise six.reraise(type(error), error, _stacktrace)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/urllib3/connectionpool.py", line 557, in urlopen
body=body, headers=headers)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/urllib3/connectionpool.py", line 374, in _make_request
httplib_response = conn.getresponse(buffering=True)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 1132, in getresponse
response.begin()
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 453, in begin
version, status, reason = self._read_status()
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 417, in _read_status
raise BadStatusLine(line)
ProtocolError: ('Connection aborted.', BadStatusLine("''",))
---------------------------------------------------------------------------
ConnectionError Traceback (most recent call last)
<ipython-input-172-74371d038a71> in <module>()
1 st = time.time()
----> 2 tv = es.mtermvectors(index = index_name, doc_type = index_type, ids = docs.keys()[0:200])
3 time.time()-st
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/elasticsearch/client/utils.pyc in _wrapped(*args, **kwargs)
67 if p in kwargs:
68 params[p] = kwargs.pop(p)
---> 69 return func(*args, params=params, **kwargs)
70 return _wrapped
71 return _wrapper
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/elasticsearch/client/__init__.pyc in mtermvectors(self, index, doc_type, body, params)
1100 """
1101 _, data = self.transport.perform_request('GET', _make_path(index,
-> 1102 doc_type, '_mtermvectors'), params=params, body=body)
1103 return data
1104
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/elasticsearch/transport.pyc in perform_request(self, method, url, params, body)
305
306 try:
--> 307 status, headers, data = connection.perform_request(method, url, params, body, ignore=ignore, timeout=timeout)
308
309 except TransportError as e:
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/elasticsearch/connection/http_urllib3.pyc in perform_request(self, method, url, params, body, timeout, ignore)
83 except Exception as e:
84 self.log_request_fail(method, full_url, body, time.time() - start, exception=e)
---> 85 raise ConnectionError('N/A', str(e), e)
86
87 if not (200 <= response.status < 300) and response.status not in ignore:
ConnectionError: ConnectionError(('Connection aborted.', BadStatusLine("''",))) caused by: ProtocolError(('Connection aborted.', BadStatusLine("''",)))
I did some testing, and the problem seems to be that I'm sending too many document IDs in a single request and hitting some limit. In the example above, the call works when I request 193 documents but fails as soon as I request 194 or more. The IDs here are 18 characters long; in another test with shorter IDs I was able to request more documents before it broke down, which makes me suspect the limit is on the total size of the request (perhaps the URL, since ids seems to be passed as a query parameter) rather than on the document count.
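If the limit is on request size, one workaround I'm considering is batching the request into smaller chunks. A minimal sketch (the chunk size of 100 is arbitrary, and mtermvectors_batched is just a helper name I made up; es, index_name, index_type and docs are the same objects as above):

def mtermvectors_batched(es, index_name, index_type, ids, chunk_size=100):
    # Request term vectors in chunks small enough to stay under the apparent limit.
    results = []
    for start in range(0, len(ids), chunk_size):
        chunk = ids[start:start + chunk_size]
        resp = es.mtermvectors(index=index_name, doc_type=index_type, ids=chunk)
        results.extend(resp.get('docs', []))
    return results

tv = mtermvectors_batched(es, index_name, index_type, docs.keys())

Each chunk stays well below the 193 documents that worked above, but this costs one round trip per chunk, so I'd prefer a single request if possible.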
Is there a way around this problem or am I limited in the number of document termvectors I can request at once?
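One alternative I've been meaning to try is passing the IDs in the request body instead of as the ids parameter, in case the limit really is on URL length; mtermvectors does take a body argument (it shows up in the traceback above), and the _mtermvectors endpoint accepts an ids list in the body. A sketch of what I have in mind (I haven't confirmed this avoids the error):

>>> es.mtermvectors(index=index_name, doc_type=index_type,
...                 body={'ids': docs.keys()[0:200]})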