Hello
I'm having indexing performance problems. I'm using python for bulk operations
1000 documents per bulk takes about 30 seconds.
Documents are quite small, with about 15 fields, most of which are integers or short strings.
An indexing daemon runs on almost every ES node (with 5 to 15 threads), and each daemon connects to its local ES node. Besides indexing, the daemons delete old records using bulk delete (1000 records per bulk, too).
Each index has from 300 million to 1.5 billion records, divided into 10 shards (the largest index, with 1.5 billion records, has 20 shards) and 1 replica.
My cluster has 24 nodes, 27 indexes, 432 shards, 9 billion documents, and 6.5TB of data.
Elasticsearch version 2.2.0
Java: OpenJDK (versions from 1.7.0_65 to 1.7.0_91)
Client library: elasticsearch 2.2.0 (latest)
Node details:
Ubuntu 12.04 or 14.04
CPU: Intel Xeon 8 cores
RAM: 32GB ( ES_HEAP_SIZE=16g on several nodes 20g)
SSD disks (2 disks per node, some of them in RAID1)
------------ index settings ---------------
{
"index": {
"creation_date": "1450432002298",
"number_of_replicas": "1",
"codec": "best_compression",
"uuid": "riSNQJY-R5K8McxgkbXbCg",
"ttl": {
"disable_purge": "true"
},
"analysis": {
"filter": {
"english_stemmer": {
"type": "stemmer",
"language": "english"
}
},
"analyzer": {
"english": {
"type": "custom",
"filter": [
"lowercase",
"english_morphology"
],
"tokenizer": "standard"
}
}
},
"number_of_shards": "10",
"refresh_interval": "30s",
"version": {
"created": "2010099"
}
}
}
---------- mapping ---------------
"positions": {
"_routing": {
"required": true
},
"_ttl": {
"enabled": true,
"default": 7776000000
},
"properties": {
"dynamic": {
"type": "short"
},
"position": {
"type": "short"
},
"region_queries_count_wide": {
"type": "integer"
},
"right_spell": {
"index": "no",
"doc_values": true,
"type": "string"
},
"keyword": {
"analyzer": "english",
"type": "string"
},
"keyword_id": {
"type": "integer"
},
"date": {
"format": "strict_date_optional_time||epoch_millis",
"type": "date"
},
"geo_names": {
"index": "not_analyzed",
"type": "string"
},
"cost": {
"type": "float"
},
"url": {
"index": "not_analyzed",
"type": "string"
},
"region_queries_count": {
"type": "integer"
},
"url_crc": {
"type": "long"
},
"subdomain": {
"index": "not_analyzed",
"type": "string"
},
"concurrency": {
"type": "short"
},
"domain": {
"index": "not_analyzed",
"type": "string"
},
"found_results": {
"type": "long"
},
"types": {
"index": "not_analyzed",
"type": "string"
}
},
"_all": {
"enabled": false
}
}
------------ elasticsearch.yml ------------
cluster.name: name
node.name: "es18"
node.master: false
node.data: true
path.data: /var/lib/elasticsearch,/home/elasticsearch
path.repo: ["/home/backupfs"]
http.port: 9200
http.host: "127.0.0.1"
network.bind_host: 0.0.0.0
network.publish_host: non_loopback:ipv4
transport.tcp.port: 9300
transport.tcp.compress: true
index.max_result_window: 60000
gateway.recover_after_nodes: 15
gateway.expected_nodes: 17
gateway.recover_after_time: 15m
bootstrap.mlockall: true
indices.recovery.max_bytes_per_sec: 150mb
indices.store.throttle.max_bytes_per_sec: 150mb
index.translog.flush_threshold_size: 500mb
discovery.zen.ping.multicast.enabled: false
discovery.zen.ping.unicast.hosts:
- es-gw1
- es-gw2
- es1
- es2
- es3
script.inline: on
script.indexed: on
threadpool.search.type: fixed
threadpool.search.size: 20
threadpool.search.queue_size: 100
threadpool.bulk.type: fixed
threadpool.bulk.size: 20
threadpool.bulk.queue_size: 300
threadpool.index.type: fixed
threadpool.index.size: 20
threadpool.index.queue_size: 100
indices.memory.index_buffer_size: 10%
indices.memory.min_shard_index_buffer_size: 12mb
indices.memory.min_index_buffer_size: 96mb
index.refresh_interval: 30s
index.translog.flush_threshold_ops: 50000
index.search.slowlog.threshold.query.warn: 10s
index.search.slowlog.threshold.query.info: 5s
index.search.slowlog.threshold.fetch.warn: 2s
index.search.slowlog.threshold.fetch.info: 1s
What should I do to increase indexing speed?