Performance issue with concurrent call ElasticSearch API

css.carlyle · April 8, 2022, 2:13am

Hello all,

Recently, my program failed a stress test (JMeter). My requirement is to get a response from Elasticsearch within 5 seconds at 50, 100, 500, 1000, and 1300 concurrent calls. During the stress test, I observed the system information through Kibana Stack Monitoring and New Relic, and found that CPU usage reached 100% at 500 concurrent calls. The response time reaches 58s for 500 concurrent calls, whereas for a single call the average response time is about 300ms. I also tested the Elasticsearch search API via Postman; sometimes the response time is more than 2s even with a single call. So I would like to know how to improve the performance; please give me some hints. Many thanks.

Information about Elasticsearch:
version: 7.10.1
deploy environment: Docker Swarm with 3 nodes (all 8 cores, 32 GB RAM, JVM Xms, Xmx16g, OpenJDK 11)

Index information:

Setting:

{
  "settings": {
    "index": {
      "routing": {
        "allocation": {
          "include": {
            "_tier_preference": "data_content"
          }
        }
      },
      "mapping": {
        "nested_fields": {
          "limit": "100"
        },
        "total_fields": {
          "limit": "5000"
        }
      },
      "number_of_shards": "2",
      "provided_name": "pcelk_c1_pc_refinement",
      "creation_date": "1639199848687",
      "number_of_replicas": "1",
      "uuid": "NU6Ev9fvTYyJUAIdzQNp8Q",
      "version": {
        "created": "7100199"
      }
    }
  },
  "defaults": {
    "index": {
      "flush_after_merge": "512mb",
      "final_pipeline": "_none",
      "max_inner_result_window": "100",
      "unassigned": {
        "node_left": {
          "delayed_timeout": "1m"
        }
      },
      "max_terms_count": "65536",
      "lifecycle": {
        "name": "",
        "parse_origination_date": "false",
        "indexing_complete": "false",
        "rollover_alias": "",
        "origination_date": "-1"
      },
      "routing_partition_size": "1",
      "force_memory_term_dictionary": "false",
      "max_docvalue_fields_search": "100",
      "merge": {
        "scheduler": {
          "max_thread_count": "4",
          "auto_throttle": "true",
          "max_merge_count": "9"
        },
        "policy": {
          "reclaim_deletes_weight": "2.0",
          "floor_segment": "2mb",
          "max_merge_at_once_explicit": "30",
          "max_merge_at_once": "10",
          "max_merged_segment": "5gb",
          "expunge_deletes_allowed": "10.0",
          "segments_per_tier": "10.0",
          "deletes_pct_allowed": "33.0"
        }
      },
      "max_refresh_listeners": "1000",
      "max_regex_length": "1000",
      "load_fixed_bitset_filters_eagerly": "true",
      "number_of_routing_shards": "1",
      "write": {
        "wait_for_active_shards": "1"
      },
      "verified_before_close": "false",
      "mapping": {
        "coerce": "false",
        "nested_objects": {
          "limit": "10000"
        },
        "depth": {
          "limit": "20"
        },
        "ignore_malformed": "false",
        "field_name_length": {
          "limit": "9223372036854775807"
        }
      },
      "source_only": "false",
      "soft_deletes": {
        "enabled": "false",
        "retention": {
          "operations": "0"
        },
        "retention_lease": {
          "period": "12h"
        }
      },
      "max_script_fields": "32",
      "query": {
        "default_field": [
          "*"
        ],
        "parse": {
          "allow_unmapped_fields": "true"
        }
      },
      "format": "0",
      "frozen": "false",
      "sort": {
        "missing": [],
        "mode": [],
        "field": [],
        "order": []
      },
      "priority": "1",
      "codec": "default",
      "max_rescore_window": "10000",
      "max_adjacency_matrix_filters": "100",
      "analyze": {
        "max_token_count": "10000"
      },
      "gc_deletes": "60s",
      "top_metrics_max_size": "10",
      "optimize_auto_generated_id": "true",
      "max_ngram_diff": "1",
      "hidden": "false",
      "translog": {
        "generation_threshold_size": "64mb",
        "flush_threshold_size": "512mb",
        "sync_interval": "5s",
        "retention": {
          "size": "512MB",
          "age": "12h"
        },
        "durability": "REQUEST"
      },
      "auto_expand_replicas": "false",
      "mapper": {
        "dynamic": "true"
      },
      "recovery": {
        "type": ""
      },
      "requests": {
        "cache": {
          "enable": "true"
        }
      },
      "data_path": "",
      "highlight": {
        "max_analyzed_offset": "1000000"
      },
      "routing": {
        "rebalance": {
          "enable": "all"
        },
        "allocation": {
          "include": {
            "_tier": ""
          },
          "exclude": {
            "_tier": ""
          },
          "require": {
            "_tier": ""
          },
          "enable": "all",
          "total_shards_per_node": "-1"
        }
      },
      "search": {
        "slowlog": {
          "level": "TRACE",
          "threshold": {
            "fetch": {
              "warn": "-1",
              "trace": "-1",
              "debug": "-1",
              "info": "-1"
            },
            "query": {
              "warn": "-1",
              "trace": "-1",
              "debug": "-1",
              "info": "-1"
            }
          }
        },
        "idle": {
          "after": "30s"
        },
        "throttled": "false"
      },
      "fielddata": {
        "cache": "node"
      },
      "default_pipeline": "_none",
      "max_slices_per_scroll": "1024",
      "shard": {
        "check_on_startup": "false"
      },
      "xpack": {
        "watcher": {
          "template": {
            "version": ""
          }
        },
        "version": "",
        "ccr": {
          "following_index": "false"
        }
      },
      "percolator": {
        "map_unmapped_fields_as_text": "false"
      },
      "allocation": {
        "max_retries": "5",
        "existing_shards_allocator": "gateway_allocator"
      },
      "refresh_interval": "1s",
      "indexing": {
        "slowlog": {
          "reformat": "true",
          "threshold": {
            "index": {
              "warn": "-1",
              "trace": "-1",
              "debug": "-1",
              "info": "-1"
            }
          },
          "source": "1000",
          "level": "TRACE"
        }
      },
      "compound_format": "0.1",
      "blocks": {
        "metadata": "false",
        "read": "false",
        "read_only_allow_delete": "false",
        "read_only": "false",
        "write": "false"
      },
      "max_result_window": "10000",
      "store": {
        "stats_refresh_interval": "10s",
        "type": "",
        "fs": {
          "fs_lock": "native"
        },
        "preload": [],
        "snapshot": {
          "snapshot_name": "",
          "index_uuid": "",
          "cache": {
            "prewarm": {
              "enabled": "true"
            },
            "enabled": "true",
            "excluded_file_types": []
          },
          "uncached_chunk_size": "-1b",
          "index_name": "",
          "repository_name": "",
          "snapshot_uuid": ""
        }
      },
      "queries": {
        "cache": {
          "enabled": "true"
        }
      },
      "warmer": {
        "enabled": "true"
      },
      "max_shingle_diff": "3",
      "query_string": {
        "lenient": "false"
      }
    }
  }
}

Stats:

{
  "_shards": {
    "total": 4,
    "successful": 4,
    "failed": 0
  },
  "stats": {
    "uuid": "NU6Ev9fvTYyJUAIdzQNp8Q",
    "primaries": {
      "docs": {
        "count": 95641618,
        "deleted": 33061794
      },
      "store": {
        "size_in_bytes": 16741487081,
        "reserved_in_bytes": 0
      },
      "indexing": {
        "index_total": 984,
        "index_time_in_millis": 3580,
        "index_current": 0,
        "index_failed": 0,
        "delete_total": 126,
        "delete_time_in_millis": 136,
        "delete_current": 0,
        "noop_update_total": 0,
        "is_throttled": false,
        "throttle_time_in_millis": 0
      },
      "get": {
        "total": 882,
        "time_in_millis": 1160,
        "exists_total": 882,
        "exists_time_in_millis": 1160,
        "missing_total": 0,
        "missing_time_in_millis": 0,
        "current": 0
      },
      "search": {
        "open_contexts": 0,
        "query_total": 26505,
        "query_time_in_millis": 3521784,
        "query_current": 0,
        "fetch_total": 1637,
        "fetch_time_in_millis": 445,
        "fetch_current": 0,
        "scroll_total": 0,
        "scroll_time_in_millis": 0,
        "scroll_current": 0,
        "suggest_total": 0,
        "suggest_time_in_millis": 0,
        "suggest_current": 0
      },
      "merges": {
        "current": 0,
        "current_docs": 0,
        "current_size_in_bytes": 0,
        "total": 28,
        "total_time_in_millis": 13093,
        "total_docs": 263027,
        "total_size_in_bytes": 92467365,
        "total_stopped_time_in_millis": 0,
        "total_throttled_time_in_millis": 0,
        "total_auto_throttle_in_bytes": 41943040
      },
      "refresh": {
        "total": 578,
        "total_time_in_millis": 53564,
        "external_total": 470,
        "external_total_time_in_millis": 52687,
        "listeners": 0
      },
      "flush": {
        "total": 100,
        "periodic": 0,
        "total_time_in_millis": 10488
      },
      "warmer": {
        "current": 0,
        "total": 468,
        "total_time_in_millis": 465
      },
      "query_cache": {
        "memory_size_in_bytes": 108231648,
        "total_count": 263660,
        "hit_count": 122128,
        "miss_count": 141532,
        "cache_size": 160,
        "cache_count": 1234,
        "evictions": 1074
      },
      "fielddata": {
        "memory_size_in_bytes": 0,
        "evictions": 0
      },
      "completion": {
        "size_in_bytes": 0
      },
      "segments": {
        "count": 59,
        "memory_in_bytes": 10847856,
        "terms_memory_in_bytes": 8661720,
        "stored_fields_memory_in_bytes": 38344,
        "term_vectors_memory_in_bytes": 0,
        "norms_memory_in_bytes": 1233856,
        "points_memory_in_bytes": 0,
        "doc_values_memory_in_bytes": 913936,
        "index_writer_memory_in_bytes": 0,
        "version_map_memory_in_bytes": 0,
        "fixed_bit_set_memory_in_bytes": 28944096,
        "max_unsafe_auto_id_timestamp": -1,
        "file_sizes": {}
      },
      "translog": {
        "operations": 7,
        "size_in_bytes": 51644,
        "uncommitted_operations": 7,
        "uncommitted_size_in_bytes": 51644,
        "earliest_last_modified_age": 194855
      },
      "request_cache": {
        "memory_size_in_bytes": 0,
        "evictions": 0,
        "hit_count": 1,
        "miss_count": 1
      },
      "recovery": {
        "current_as_source": 0,
        "current_as_target": 0,
        "throttle_time_in_millis": 0
      }
    },
    "total": {
      "docs": {
        "count": 191283211,
        "deleted": 62280152
      },
      "store": {
        "size_in_bytes": 33026717523,
        "reserved_in_bytes": 0
      },
      "indexing": {
        "index_total": 1962,
        "index_time_in_millis": 6938,
        "index_current": 0,
        "index_failed": 0,
        "delete_total": 252,
        "delete_time_in_millis": 241,
        "delete_current": 0,
        "noop_update_total": 0,
        "is_throttled": false,
        "throttle_time_in_millis": 0
      },
      "get": {
        "total": 882,
        "time_in_millis": 1160,
        "exists_total": 882,
        "exists_time_in_millis": 1160,
        "missing_total": 0,
        "missing_time_in_millis": 0,
        "current": 0
      },
      "search": {
        "open_contexts": 0,
        "query_total": 47330,
        "query_time_in_millis": 6149899,
        "query_current": 0,
        "fetch_total": 2604,
        "fetch_time_in_millis": 778,
        "fetch_current": 0,
        "scroll_total": 0,
        "scroll_time_in_millis": 0,
        "scroll_current": 0,
        "suggest_total": 0,
        "suggest_time_in_millis": 0,
        "suggest_current": 0
      },
      "merges": {
        "current": 0,
        "current_docs": 0,
        "current_size_in_bytes": 0,
        "total": 37,
        "total_time_in_millis": 16728,
        "total_docs": 345945,
        "total_size_in_bytes": 125338158,
        "total_stopped_time_in_millis": 0,
        "total_throttled_time_in_millis": 0,
        "total_auto_throttle_in_bytes": 83886080
      },
      "refresh": {
        "total": 864,
        "total_time_in_millis": 75974,
        "external_total": 653,
        "external_total_time_in_millis": 73888,
        "listeners": 0
      },
      "flush": {
        "total": 201,
        "periodic": 0,
        "total_time_in_millis": 28528
      },
      "warmer": {
        "current": 0,
        "total": 649,
        "total_time_in_millis": 686
      },
      "query_cache": {
        "memory_size_in_bytes": 360633872,
        "total_count": 468236,
        "hit_count": 229520,
        "miss_count": 238716,
        "cache_size": 955,
        "cache_count": 2029,
        "evictions": 1074
      },
      "fielddata": {
        "memory_size_in_bytes": 0,
        "evictions": 0
      },
      "completion": {
        "size_in_bytes": 0
      },
      "segments": {
        "count": 116,
        "memory_in_bytes": 22791378,
        "terms_memory_in_bytes": 18001464,
        "stored_fields_memory_in_bytes": 75280,
        "term_vectors_memory_in_bytes": 0,
        "norms_memory_in_bytes": 2568384,
        "points_memory_in_bytes": 0,
        "doc_values_memory_in_bytes": 2146250,
        "index_writer_memory_in_bytes": 10651492,
        "version_map_memory_in_bytes": 564,
        "fixed_bit_set_memory_in_bytes": 57355992,
        "max_unsafe_auto_id_timestamp": -1,
        "file_sizes": {}
      },
      "translog": {
        "operations": 14,
        "size_in_bytes": 103288,
        "uncommitted_operations": 14,
        "uncommitted_size_in_bytes": 103288,
        "earliest_last_modified_age": 194855
      },
      "request_cache": {
        "memory_size_in_bytes": 0,
        "evictions": 0,
        "hit_count": 17,
        "miss_count": 3
      },
      "recovery": {
        "current_as_source": 0,
        "current_as_target": 0,
        "throttle_time_in_millis": 0
      }
    }
  }
}

Christian_Dahlqvist · April 8, 2022, 4:43am

In Elasticsearch the limiting factor is not necessarily CPU, but rather disk performance. Are you monitoring disk I/O and iowait on the nodes?

In order to support the maximum number of concurrent queries you want to make sure that 1) your queries are as efficient as possible, 2) you use the file system cache and are not limited by disk performance, and 3) you use CPU as efficiently as possible. Make sure your data is stored as efficiently as possible on disk to optimize file system cache usage. Follow the instructions available here.

It seems that you are updating and/or deleting documents from the index. This will change the data on disk and affect the file system page cache and data structures in memory, so make sure this is done as efficiently as possible. The best query throughput can often be achieved when data is rarely changed.

When benchmarking I would recommend the following:

Initially set the number of primary shards to 1 and the number of replicas to 2. This means a single shard can serve each query and all nodes hold all the data.
Try to run with as small a heap as you can get away with. Start small and increase as necessary. This gives the operating system page cache as many resources as possible. Be aware that Elasticsearch also uses off-heap memory, so not all of the memory left outside the heap is available to the page cache.
Start benchmarking at a low concurrency level and slowly increase it as long as the query latencies stay below the required latency, while monitoring the cluster. This will give you the maximum query concurrency the cluster can support. There is no point taking huge steps and overwhelming the cluster. Make sure that you are benchmarking with a realistic workload. If you are going to update/delete documents, make sure you do this during the benchmark and do not just run queries.

I also see that you are running an old version. I would recommend upgrading to the latest version as a lot of improvements have been made. If you are using any third-party plugins, be aware that these can affect performance and may make it hard for us to help you in this forum.

The following resources might also be useful:

system · May 6, 2022, 4:44am

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.

Topic		Replies	Views
Concurrent queries (EsRejectedExecutionException and low performance) Elasticsearch	3	410	July 6, 2017
Understanding Threadpools Elasticsearch	7	435	July 6, 2017
Elasticsearch Transport Client bottle neck with concurrent calls Elasticsearch	4	1369	July 5, 2017
ES slowness and calls taking a long time to return Elasticsearch	5	588	July 6, 2017
EC2 Perfomance problems, advice needed Elasticsearch	19	489	July 6, 2017

Performance issue with concurrent call ElasticSearch API

Related topics