Hi,
I currently have ES installed on a Xen cluster of 14 instances:
400 GB Disk space
2 Intel cores
4 GB RAM
All instances are on the same network segment.
I am storing documents with mapping identical to twitter with a
collator and analyzer operating on the text field.
I am running 14 python threads each writing to one of the instances on
port 9500 using the thrift transport.
For each thrift transport open/close transaction I am passing 350
documents.
The index has 56 shards and no replication.
When I begin building the index I am writing ~4500 documents/sec,
but performance then degrades: writing 5M documents averages
1400 documents/sec, and writing 10M documents averages < 1000
documents/sec.
When I double the number of shards I get errors in writing.
Any suggestions?
Regards,
Wolf
index setup:
# Create the "twitter" index with 0 replicas and two custom analyzers:
# "collation" (keyword tokenizer + the "myCollator" icu_collation filter for
# language "ja") and "my_analyzer" (type "igo").
# NOTE(review): the description above says 56 shards, but this request asks
# for 42 - confirm which value was actually used.
# The request body must follow -d on the same command line; a bare newline
# after -d would end the command before the body is supplied.
curl -XPUT 'http://192.168.0.129:9200/twitter' -d '{ "index" : {
"numberOfShards" : 42,
"numberOfReplicas" : 0,
"analysis" : {
"analyzer" : {
"collation" : {
"tokenizer" : "keyword",
"filter" : ["myCollator"]
},
"my_analyzer" : {
"type" : "igo"
}
},
"filter" : {
"myCollator" : {
"type" : "icu_collation",
"language" : "ja"
}
}
}
}
}'
# Install the mapping; "@tweet_mapping" makes curl read the request body from
# the local file named tweet_mapping.
curl -XPUT 'http://192.168.0.129:9200/twitter/_mapping' -d @tweet_mapping
Python class for writing tweet docs:
class DocWriter:
    """Write tweets from a JSON-lines file to ``/twitter/tweet/<id>`` over Thrift.

    Each line of ``data_file`` is parsed as one JSON document and sent as a
    separate REST request through a single Thrift transport that is opened
    at the start of ``write_file_to_es`` and closed when it finishes.
    Document ids are consecutive integers starting at ``initial_index + 1``.
    """

    def __init__(self, host=None, data_file=None, initial_index=0):
        """Optionally configure the writer at construction time.

        With no arguments the instance is left unconfigured, exactly as
        before, and ``init`` must be called explicitly. When ``host`` is
        given, the arguments are forwarded to ``init``.
        """
        if host is not None:
            self.init(host, data_file, initial_index)

    def init(self, host, data_file, initial_index):
        """Configure the target node, the input file and the starting id.

        host          -- last octet of the node address; the writer connects
                         to ``192.168.0.<host>`` on port 9500.
        data_file     -- path of the file holding one JSON tweet per line.
        initial_index -- id counter value; the first document written gets
                         ``initial_index + 1``.
        """
        self.index = initial_index
        self.uri = "/twitter/tweet/"
        self.host = "192.168.0." + str(host)
        self.data_file = data_file
        # Connect to the node selected by ``host``. The address used to be
        # hard-coded to 192.168.0.190, which sent every writer to one node.
        self.socket = TSocket.TSocket(self.host, 9500)
        self.transport = TTransport.TBufferedTransport(self.socket)
        self.protocol = TBinaryProtocolAccelerated(self.transport)
        self.client = Rest.Client(self.protocol)

    def write_file_to_es(self):
        """Send every line of ``data_file`` as one document.

        Stops at the first response that has no ``'ok'`` key, printing that
        response. Returns the id of the last document attempted. The
        transport is closed on every exit path, including exceptions.
        """
        with open(self.data_file, 'r') as f:
            self.transport.open()
            try:
                for line in f:
                    self.index = self.index + 1
                    doc = json.loads(line.rstrip("\n"),
                                     object_hook=self.fix_obj)
                    # NOTE(review): method=1 is presumably PUT in the Thrift
                    # REST interface - confirm against the generated enum.
                    request = RestRequest(method=1,
                                          uri=self.uri + str(self.index),
                                          headers={}, body=json.dumps(doc))
                    response = json.loads(self.client.execute(request).body)
                    # A missing key raises KeyError, not NameError, so test
                    # for the key directly instead of catching an exception.
                    if 'ok' not in response:
                        print(response)
                        break
            finally:
                self.transport.close()
        return self.index

    def fix_obj(self, obj):
        """Convert ``retweet_count`` to a string; used as a JSON object hook.

        If ``obj`` has a ``retweeted_status`` entry, only the count nested
        inside that entry is converted (and printed). Otherwise a top-level
        ``retweet_count`` is converted. ``obj`` is modified in place and
        returned.
        """
        if 'retweeted_status' in obj:
            retweeted = obj['retweeted_status']
            if 'retweet_count' in retweeted:
                # NOTE(review): looks like leftover debug output - confirm
                # whether it is still wanted on the indexing path.
                print(retweeted['retweet_count'])
                retweeted['retweet_count'] = str(retweeted['retweet_count'])
        elif 'retweet_count' in obj:
            obj['retweet_count'] = str(obj['retweet_count'])
        return obj