Timeout collecting data X-Pack

Hello,

We have a problem with our Elasticsearch cluster.

Architecture:

11 physical machines (128 GB RAM, 32 CPUs, spinning HDDs)

  • 3 data instances per machine
  • 3 of the physical machines also run a master instance in addition to their 3 data instances

Heap: 19 GB for each data instance and 2 GB for each master instance
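(The heap sizes are set in each instance's jvm.options; a minimal sketch for a data instance, assuming the usual Xms/Xmx pair:)

    # config/jvm.options of a data instance (master instances use 2g)
    -Xms19g
    -Xmx19g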

One application generates an index of roughly 800 GB per day, with these settings:

    "refresh_interval": "30s",
    "number_of_shards": "36",
    "provided_name": "BIGINDEX",
    "merge": {
      "scheduler": {
        "max_thread_count": "1"
    "number_of_replicas": "1",

Cluster overview:

JVM Heap: 47.86%  (300GB / 627GB)
Documents: 5,321,404,424
Primary Shards: 3,707
Replica Shards: 3,736
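(If it helps to reproduce the numbers above, roughly the same figures can be pulled straight from the cluster with the cat APIs; the endpoint here is a placeholder, and with Search Guard the calls need HTTPS and credentials:)

    # document count, plus shard and disk usage per node
    curl -s 'http://localhost:9200/_cat/count?v'
    curl -s 'http://localhost:9200/_cat/allocation?v'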

Here is the output of the cluster stats API:

{
  "_nodes" : {
    "total" : 36,
    "successful" : 36,
    "failed" : 0
  },
  "cluster_name" : "ES_5",
  "timestamp" : 1538586960513,
  "status" : "green",
  "indices" : {
    "count" : 396,
    "shards" : {
      "total" : 7443,
      "primaries" : 3707,
      "replication" : 1.007823037496628,
      "index" : {
        "shards" : {
          "min" : 2,
          "max" : 72,
          "avg" : 18.795454545454547
        },
        "primaries" : {
          "min" : 1,
          "max" : 36,
          "avg" : 9.36111111111111
        },
        "replication" : {
          "min" : 1.0,
          "max" : 14.0,
          "avg" : 1.0732323232323233
        }
      }
    },
    "docs" : {
      "count" : 5320385349,
      "deleted" : 4995852
    },
    "store" : {
      "size" : "38.9tb",
      "size_in_bytes" : 42836390979259,
      "throttle_time" : "0s",
      "throttle_time_in_millis" : 0
    },
    "fielddata" : {
      "memory_size" : "1.2mb",
      "memory_size_in_bytes" : 1333920,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size" : "196.8mb",
      "memory_size_in_bytes" : 206415239,
      "total_count" : 67884881,
      "hit_count" : 24300810,
      "miss_count" : 43584071,
      "cache_size" : 6992,
      "cache_count" : 4056221,
      "evictions" : 4049229
    },
    "completion" : {
      "size" : "0b",
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 120238,
      "memory" : "97.4gb",
      "memory_in_bytes" : 104610428781,
      "terms_memory" : "84.6gb",
      "terms_memory_in_bytes" : 90848983399,
      "stored_fields_memory" : "7.9gb",
      "stored_fields_memory_in_bytes" : 8569212864,
      "term_vectors_memory" : "0b",
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory" : "910.3mb",
      "norms_memory_in_bytes" : 954539776,
      "points_memory" : "2.1gb",
      "points_memory_in_bytes" : 2277826818,
      "doc_values_memory" : "1.8gb",
      "doc_values_memory_in_bytes" : 1959865924,
      "index_writer_memory" : "2.5gb",
      "index_writer_memory_in_bytes" : 2707509968,
      "version_map_memory" : "7.1mb",
      "version_map_memory_in_bytes" : 7473310,
      "fixed_bit_set" : "0b",
      "fixed_bit_set_memory_in_bytes" : 0,
      "max_unsafe_auto_id_timestamp" : 1538545665133,
      "file_sizes" : { }
    }
  },
  "nodes" : {
    "count" : {
      "total" : 36,
      "data" : 33,
      "coordinating_only" : 0,
      "master" : 3,
      "ingest" : 0
    },
    "versions" : [
      "5.5.1"
    ],
    "os" : {
      "available_processors" : 1152,
      "allocated_processors" : 1152,
      "names" : [
        {
          "name" : "Linux",
          "count" : 36
        }
      ],
      "mem" : {
        "total" : "4.4tb",
        "total_in_bytes" : 4856688279552,
        "free" : "351.1gb",
        "free_in_bytes" : 377046077440,
        "used" : "4tb",
        "used_in_bytes" : 4479642202112,
        "free_percent" : 8,
        "used_percent" : 92
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 11
      },
      "open_file_descriptors" : {
        "min" : 1526,
        "max" : 2138,
        "avg" : 1997
      }
    },
    "jvm" : {
      "max_uptime" : "7.1d",
      "max_uptime_in_millis" : 617508014,
      "versions" : [
        {
          "version" : "1.8.0_181",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "25.181-b13",
          "vm_vendor" : "Oracle Corporation",
          "count" : 6
        },
        {
          "version" : "1.8.0_161",
        }
      ],
      "mem" : {
        "heap_used" : "293.5gb",
        "heap_used_in_bytes" : 315195193072,
        "heap_max" : "626.6gb",
        "heap_max_in_bytes" : 672843890688
      },
      "threads" : 11234
    },
    "fs" : {
      "total" : "315.4tb",
      "total_in_bytes" : 346790843449344,
      "free" : "266.7tb",
      "free_in_bytes" : 293311977111552,
      "available" : "261tb",
      "available_in_bytes" : 287033225674752,
      "spins" : "true"
    },
    "plugins" : [
      {
        "name" : "search-guard-5",
        "version" : "5.5.1-15",
        "description" : "Provide access control related features for Elasticsearch 5",
        "classname" : "com.floragunn.searchguard.SearchGuardPlugin",
        "has_native_controller" : false
      },
      {
        "name" : "x-pack",
        "version" : "5.5.1",
        "description" : "Elasticsearch Expanded Pack Plugin",
        "classname" : "org.elasticsearch.xpack.XPackPlugin",
        "has_native_controller" : true
      }
    ],
    "network_types" : {
      "transport_types" : {
        "com.floragunn.searchguard.ssl.http.netty.SearchGuardSSLNettyTransport" : 36
      },
      "http_types" : {
        "com.floragunn.searchguard.http.SearchGuardHttpServerTransport" : 36
      }
    }
  }
}

What's wrong? :sweat_smile::sweat_smile:

Thank you for any help.

To complete the picture:

We have several performance problems (different error messages, shown below):

[data1_01][10.79.18.198:9301][cluster:monitor/nodes/stats[n]] request_id [6630083] timed out after [15000ms]
    at org.elasticsearch.transport.TransportService$TimeoutHandler.run(TransportService.java:951) [elasticsearch-5.5.1.jar:5.5.1]
    at org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingRunnable.run(ThreadContext.java:569) [elasticsearch-5.5.1.jar:5.5.1]
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_161]
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_161]
    at java.lang.Thread.run(Thread.java:748) [?:1.8.0_161]

Sep 27 18:20:33 data1 [2018-09-27T18:20:33,763][WARN ][o.e.t.TransportService   ] [data1_90] Received response for a request that has timed out, sent [22280ms] ago, timed out [7280ms] ago, action [cluster:monitor/nodes/stats[n]], node [{data2_01}{-AzB6cvqScmCtTEEkVhOkQ}{6h1ZhpsJRWKEyTYF6Xo7AQ}{10.79.18.198}{10.79.18.198:9301}{rack_id=BB_7}], id [6630083]

Sep 27 19:25:42 data1 [2018-09-27T19:25:42,082][ERROR][o.e.x.m.c.i.IndicesStatsCollector] [data1_90] collector [indices-stats] timed out when collecting data

[DEBUG][o.e.a.s.TransportSearchAction] [data3_00] [BIGINDEX-20180922][0], node[ONO-a2YPRB6L07M7w8LtqA], [R], s[STARTED], a[id=raHB9sPMQsK1-f7WQ0Ks0A]: Failed to execute [SearchRequest
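(The first two messages are the internal transport action behind node and indices stats collection. As a rough way to see how long those calls take from the outside, something like the following can be run; localhost:9200 is a placeholder, and with Search Guard the requests need HTTPS and credentials:)

    # wall-clock time of the stats APIs that the monitoring collectors rely on
    curl -s -o /dev/null -w 'nodes stats:   %{time_total}s\n' 'http://localhost:9200/_nodes/stats'
    curl -s -o /dev/null -w 'indices stats: %{time_total}s\n' 'http://localhost:9200/_stats'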

In addition, the cluster sometimes freezes, and response times are often very long when we send requests.

I think we have a lot of GC activity (but maybe that is normal?). Here is an excerpt, with a quick per-node heap check sketched right after it:

|Month|Day|Time|Host|Node|GC cycle|GC time|Window|
|---|---|---|---|---|---|---|---|
|Oct|3|15:05:11|data3|[data3_02]|[gc][30027]|[283ms]|[1s]|
|Oct|3|15:06:02|data5|[data5_00]|[gc][31561]|[267ms]|[1s]|
|Oct|3|15:06:18|data3|[data3_02]|[gc][30094]|[332ms]|[1s]|
|Oct|3|15:09:45|data5|[data5_02]|[gc][32119]|[304ms]|[1s]|
|Oct|3|15:17:33|data5|[data5_02]|[gc][32541]|[265ms]|[1s]|
|Oct|3|15:17:51|data7|[data7_01]|[gc][595786]|[388ms]|[1s]|
|Oct|3|15:30:09|data7|[data7_01]|[gc][596502]|[345ms]|[1s]|
|Oct|3|15:36:05|data7|[data7_01]|[gc][596855]|[303ms]|[1s]|
|Oct|3|15:39:13|data7|[data7_01]|[gc][597043]|[304ms]|[1s]|
|Oct|3|15:41:33|data7|[data7_01]|[gc][597182]|[298ms]|[1s]|
|Oct|3|15:50:04|data8|[data8_03]|[gc][31296]|[250ms]|[1s]|
|Oct|3|15:51:53|data7|[data7_01]|[gc][597801]|[309ms]|[1s]|
|Oct|3|15:52:14|data5|[data5_00]|[gc][34233]|[263ms]|[1s]|
|Oct|3|15:52:24|data5|[data5_01]|[gc][46071]|[250ms]|[1s]|
|Oct|3|16:04:46|data9|[data9_01]|[gc][31817]|[258ms]|[1s]|
|Oct|3|16:06:39|data7|[data7_01]|[gc][598686]|[282ms]|[1s]|
|Oct|3|16:13:16|data8|[data8_03]|[gc][32687]|[304ms]|[1s]|
|Oct|3|16:26:23|data7|[data7_01]|[gc][599869]|[281ms]|[1s]|
|Oct|3|16:31:54|data5|[data5_00]|[gc][36608]|[270ms]|[1s]|
|Oct|3|16:37:26|data2|[data2_01]|[gc][49549]|[343ms]|[1s]|
|Oct|3|16:38:12|data2|[data2_01]|[gc][49595]|[293ms]|[1s]|
|Oct|3|16:56:24|data7|[data7_01]|[gc][601662]|[311ms]|[1s]|
|Oct|3|16:58:31|data7|[data7_01]|[gc][601789]|[266ms]|[1s]|
|Oct|3|17:04:13|data8|[data8_02]|[gc][36091]|[252ms]|[1s]|
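(These look like the standard [gc] overhead log lines, i.e. time spent collecting within the last one-second window. To relate them to heap pressure, the current heap usage can be listed per node; the endpoint and credentials are again placeholders for our Search Guard setup:)

    # current heap usage per data/master instance
    curl -s 'http://localhost:9200/_cat/nodes?v&h=name,heap.percent,heap.current,heap.max'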
