Help with slow query/indexing time on big index

System Specifications
Elasticsearch: 3 data nodes, 200 primary shards, 406 total shards
CPU model: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
CPU total logical cores: 8
Memory : 32GB (16gb heap)

The client has a 90gb index (178 million docs) on 6 primary shards, each with 1 replica. This index is constantly updated. Query time on this index averages 400-500ms, with occasional spikes to 1s-2s. The spikes in query time coincide with the spikes in indexing time shown in the attached images.

image
image
image

Elasticsearch settings we tried to change (the Elasticsearch version is 6.8.17 OpenDistro)

index.refresh_interval: 30s
indices.memory.index_buffer_size: 25%

Can we somehow decrease both query time and indexing time?
Thanks!

Welcome to our community! :smiley:

Are you able to upgrade? 7.15 is latest and has tonnes of performance improvements.
Otherwise a few things;

  • What is the output from the _cluster/stats?pretty&human API?
  • What's in your hot threads and slowlog?
  • opendistro has a number of 3rd party plugins that we aren't able to provide assistance on. Can you replicate this on a standard Elasticsearch cluster?

Cluster stats output

{
  "_nodes" : {
    "total" : 3,
    "successful" : 3,
    "failed" : 0
  },
  "cluster_name" : "prulmdbemb",
  "cluster_uuid" : "-Hw5fmDnTjGzqGlro7NsEw",
  "timestamp" : 1634110320287,
  "status" : "green",
  "indices" : {
    "count" : 57,
    "shards" : {
      "total" : 406,
      "primaries" : 202,
      "replication" : 1.00990099009901,
      "index" : {
        "shards" : {
          "min" : 2,
          "max" : 12,
          "avg" : 7.12280701754386
        },
        "primaries" : {
          "min" : 1,
          "max" : 6,
          "avg" : 3.543859649122807
        },
        "replication" : {
          "min" : 1.0,
          "max" : 2.0,
          "avg" : 1.0350877192982457
        }
      }
    },
    "docs" : {
      "count" : 736774395,
      "deleted" : 25745895
    },
    "store" : {
      "size" : "388.8gb",
      "size_in_bytes" : 417491691602
    },
    "fielddata" : {
      "memory_size" : "125.1mb",
      "memory_size_in_bytes" : 131280320,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size" : "3.2gb",
      "memory_size_in_bytes" : 3505660078,
      "total_count" : 1550259429,
      "hit_count" : 416039567,
      "miss_count" : 1134219862,
      "cache_size" : 346337,
      "cache_count" : 1296147,
      "evictions" : 949810
    },
    "completion" : {
      "size" : "0b",
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 2387,
      "memory" : "1.3gb",
      "memory_in_bytes" : 1482340925,
      "terms_memory" : "1.2gb",
      "terms_memory_in_bytes" : 1336752597,
      "stored_fields_memory" : "107.7mb",
      "stored_fields_memory_in_bytes" : 112963560,
      "term_vectors_memory" : "0b",
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory" : "1.8mb",
      "norms_memory_in_bytes" : 1971392,
      "points_memory" : "20.9mb",
      "points_memory_in_bytes" : 22007372,
      "doc_values_memory" : "8.2mb",
      "doc_values_memory_in_bytes" : 8646004,
      "index_writer_memory" : "37.3mb",
      "index_writer_memory_in_bytes" : 39202125,
      "version_map_memory" : "12.9kb",
      "version_map_memory_in_bytes" : 13302,
      "fixed_bit_set" : "274.3mb",
      "fixed_bit_set_memory_in_bytes" : 287713864,
      "max_unsafe_auto_id_timestamp" : 1634083202833,
      "file_sizes" : { }
    }
  },
  "nodes" : {
    "count" : {
      "total" : 3,
      "data" : 3,
      "coordinating_only" : 0,
      "master" : 3,
      "ingest" : 3
    },
    "versions" : [
      "6.8.17"
    ],
    "os" : {
      "available_processors" : 24,
      "allocated_processors" : 24,
      "names" : [
        {
          "name" : "Linux",
          "count" : 3
        }
      ],
      "pretty_names" : [
        {
          "pretty_name" : "CentOS Linux 8 (Core)",
          "count" : 3
        }
      ],
      "mem" : {
        "total" : "93.7gb",
        "total_in_bytes" : 100664389632,
        "free" : "755.7mb",
        "free_in_bytes" : 792465408,
        "used" : "93gb",
        "used_in_bytes" : 99871924224,
        "free_percent" : 1,
        "used_percent" : 99
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 70
      },
      "open_file_descriptors" : {
        "min" : 2397,
        "max" : 2472,
        "avg" : 2424
      }
    },
    "jvm" : {
      "max_uptime" : "5.4d",
      "max_uptime_in_millis" : 473004997,
      "versions" : [
        {
          "version" : "1.8.0_265",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "25.265-b01",
          "vm_vendor" : "Oracle Corporation",
          "count" : 3
        }
      ],
      "mem" : {
        "heap_used" : "18gb",
        "heap_used_in_bytes" : 19368668416,
        "heap_max" : "47.8gb",
        "heap_max_in_bytes" : 51330416640
      },
      "threads" : 431
    },
    "fs" : {
      "total" : "899.5gb",
      "total_in_bytes" : 965883211776,
      "free" : "484gb",
      "free_in_bytes" : 519717601280,
      "available" : "484gb",
      "available_in_bytes" : 519717601280
    },
    "plugins" : [
      {
        "name" : "search-guard-6",
        "version" : "6.8.17-25.6",
        "elasticsearch_version" : "6.8.17",
        "java_version" : "1.8",
        "description" : "Provide access control related features for Elasticsearch 6",
        "classname" : "com.floragunn.searchguard.SearchGuardPlugin",
        "extended_plugins" : [ ],
        "has_native_controller" : false
      },
      {
        "name" : "repository-s3",
        "version" : "6.8.17",
        "elasticsearch_version" : "6.8.17",
        "java_version" : "1.8",
        "description" : "The S3 repository plugin adds S3 repositories",
        "classname" : "org.elasticsearch.repositories.s3.S3RepositoryPlugin",
        "extended_plugins" : [ ],
        "has_native_controller" : false
      }
    ],
    "network_types" : {
      "transport_types" : {
        "com.floragunn.searchguard.ssl.http.netty.SearchGuardSSLNettyTransport" : 3
      },
      "http_types" : {
        "com.floragunn.searchguard.http.SearchGuardHttpServerTransport" : 3
      }
    }
  }
}

Hot threads

::: {p-solelst-db-03}{EBYwu7JlR2y30e1tsCMurg}{lf4ViEYeT3aHnR4TWJTOvQ}{p-solelst-db-03.hq.ru.corp.leroymerlin.com}{10.220.44.23:9300}{ml.machine_memory=33554796544, ml.max_open_jobs=20, xpack.installed=true, ml.enabled=true}
   Hot threads at 2021-10-13T07:35:27.797Z, interval=500ms, busiestThreads=3, ignoreIdleThreads=true:

   21.2% (105.8ms out of 500ms) cpu usage by thread 'elasticsearch[p-solelst-db-03][management][T#5]'
     8/10 snapshots sharing following 2 elements
       java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
       java.lang.Thread.run(Thread.java:748)

   14.4% (72ms out of 500ms) cpu usage by thread 'elasticsearch[p-solelst-db-03][management][T#3]'
     10/10 snapshots sharing following 2 elements
       java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
       java.lang.Thread.run(Thread.java:748)

::: {p-solelst-db-01}{u0kXxPAUQweToAtlxNk8Hg}{qf6RTvb_RheI8jjdYa9m8g}{p-solelst-db-01.hq.ru.corp.leroymerlin.com}{10.220.44.21:9300}{ml.machine_memory=33554796544, xpack.installed=true, ml.max_open_jobs=20, ml.enabled=true}
   Hot threads at 2021-10-13T07:35:27.794Z, interval=500ms, busiestThreads=3, ignoreIdleThreads=true:

   19.1% (95.3ms out of 500ms) cpu usage by thread 'elasticsearch[p-solelst-db-01][management][T#4]'
     4/10 snapshots sharing following 2 elements
       java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
       java.lang.Thread.run(Thread.java:748)

::: {p-solelst-db-02}{Kx7cqNp9QRSPDV0dxqUq2g}{DVlslQQNQMK5i7G8jgbpTg}{p-solelst-db-02.hq.ru.corp.leroymerlin.com}{10.220.44.22:9300}{ml.machine_memory=33554796544, ml.max_open_jobs=20, xpack.installed=true, ml.enabled=true}
   Hot threads at 2021-10-13T07:35:27.796Z, interval=500ms, busiestThreads=3, ignoreIdleThreads=true:

   20.6% (102.7ms out of 500ms) cpu usage by thread 'elasticsearch[p-solelst-db-02][management][T#5]'
     4/10 snapshots sharing following 2 elements
       java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
       java.lang.Thread.run(Thread.java:748)

CPU usage around 20%.

I didn't use the slow log, because the client said that they're using simple queries without aggregations.

The client wants to stay on this version, because the next version introduces some breaking changes for them.

What is the specification of your cluster? What type of storage are you using? Locally attached SSDs? How are you updating the data? Are you using bulk requests? Are you using nested mappings?

Yes, they're using locally attached SSDs (300gb), and yes, they're using nested mappings: the root document has 5 nested fields. So, can nested mappings cause performance issues?
Thanks!

Yes, each nested document is stored as a separate document behind the scenes and if you change anything all these are reindexed every time. Making changes to large nested structures can therefore get expensive.