Upgrade from 5.5 to 5.6

Hello,

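I recently upgraded my cluster from 5.5 to 5.6 (I have the same issues on a 6.1 cluster too). Soon afterwards, I ran into two issues: X-Pack monitoring was lost, and DateHistogramAggregations became very slow (timing out at 30s).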

I suspected that I had messed something up with X-Pack during the upgrade, so yesterday I did a full cluster restart. Not only did I get the monitoring back, but DateHistogramAggregations were also much faster (from timing out at 30s down to ~6s).

This morning I decided to check again: DateHistogramAggregations were very slow once more (timing out even on small amounts of data) and monitoring was lost, with the same logs. It happened around 7 A.M. (Europe/Paris), not long after some index mapping updates.

I searched a little and, indeed, the cluster stats API was slow: 1m44s. I read that this could be caused by oversharding and, indeed, I was oversharding: 8000 shards across a few hundred time-based indices, on 6 servers with 16GB of heap each. I was way over the "good rule of thumb" of fewer than 25 shards per GB of heap. Because of a mix-up with index templates, even the .monitoring-es-6* indices had several shards each.

Now I am at fewer than 2000 shards, which is still a lot but way below the limit of 25 shards per GB of heap (25*96=2400). I removed the old monitoring indices to start clean.

As a result, the cluster stats API now takes 12s. That is less than 1m44s, but it is still far more than on my 5.1.6 cluster with 320 shards (0.01s).
And the slowness of DateHistogramAggregations came back.

It's going to be difficult for me to reduce the number of shards any further (yes, I could get down to ~1500, but not much lower), so the question is: what can I do? Is there a hard limit on the number of shards that I must not exceed (I have not read about one)? Is there some tuning I can do? Something I should monitor?

Thank you very much,
Regards,
Grégoire

In case it helps, here are my cluster stats. Don't hesitate to ask if there is any additional info I should provide.

[user@host ~]$ time  curl -XGET 'http://localhost:9201/_cluster/stats?human&pretty'
{
  "_nodes" : {
    "total" : 10,
    "successful" : 10,
    "failed" : 0
  },
  "cluster_name" : "elastic-cluster",
  "timestamp" : 1522311750032,
  "status" : "green",
  "indices" : {
    "count" : 205,
    "shards" : {
      "total" : 1924,
      "primaries" : 962,
      "replication" : 1.0,
      "index" : {
        "shards" : {
          "min" : 2,
          "max" : 12,
          "avg" : 9.385365853658536
        },
        "primaries" : {
          "min" : 1,
          "max" : 6,
          "avg" : 4.692682926829268
        },
        "replication" : {
          "min" : 1.0,
          "max" : 1.0,
          "avg" : 1.0
        }
      }
    },
    "docs" : {
      "count" : 864714419,
      "deleted" : 298499
    },
    "store" : {
      "size" : "2.2tb",
      "size_in_bytes" : 2475974787676,
      "throttle_time" : "0s",
      "throttle_time_in_millis" : 0
    },
    "fielddata" : {
      "memory_size" : "136.6mb",
      "memory_size_in_bytes" : 143281976,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size" : "420.5mb",
      "memory_size_in_bytes" : 440997821,
      "total_count" : 8212578,
      "hit_count" : 4887236,
      "miss_count" : 3325342,
      "cache_size" : 35639,
      "cache_count" : 45485,
      "evictions" : 9846
    },
    "completion" : {
      "size" : "0b",
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 26699,
      "memory" : "3.7gb",
      "memory_in_bytes" : 3975465393,
      "terms_memory" : "3gb",
      "terms_memory_in_bytes" : 3284508341,
      "stored_fields_memory" : "459mb",
      "stored_fields_memory_in_bytes" : 481343808,
      "term_vectors_memory" : "0b",
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory" : "5.1mb",
      "norms_memory_in_bytes" : 5387072,
      "points_memory" : "67mb",
      "points_memory_in_bytes" : 70310740,
      "doc_values_memory" : "127.7mb",
      "doc_values_memory_in_bytes" : 133915432,
      "index_writer_memory" : "56.6mb",
      "index_writer_memory_in_bytes" : 59352388,
      "version_map_memory" : "390.8kb",
      "version_map_memory_in_bytes" : 400213,
      "fixed_bit_set" : "0b",
      "fixed_bit_set_memory_in_bytes" : 0,
      "max_unsafe_auto_id_timestamp" : 9223372036854775807,
      "file_sizes" : { }
    }
  },
  "nodes" : {
    "count" : {
      "total" : 10,
      "data" : 6,
      "coordinating_only" : 2,
      "master" : 7,
      "ingest" : 8
    },
    "versions" : [
      "5.6.8"
    ],
    "os" : {
      "available_processors" : 248,
      "allocated_processors" : 200,
      "names" : [
        {
          "name" : "Linux",
          "count" : 10
        }
      ],
      "mem" : {
        "total" : "201.5gb",
        "total_in_bytes" : 216401367040,
        "free" : "3gb",
        "free_in_bytes" : 3276292096,
        "used" : "198.4gb",
        "used_in_bytes" : 213125074944,
        "free_percent" : 2,
        "used_percent" : 98
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 54
      },
      "open_file_descriptors" : {
        "min" : 375,
        "max" : 1811,
        "avg" : 1197
      }
    },
    "jvm" : {
      "max_uptime" : "1.9d",
      "max_uptime_in_millis" : 171829969,
      "versions" : [
        {
          "version" : "1.8.0_141",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "25.141-b16",
          "vm_vendor" : "Oracle Corporation",
          "count" : 8
        },
        {
          "version" : "1.8.0_161",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "25.161-b14",
          "vm_vendor" : "Oracle Corporation",
          "count" : 2
        }
      ],
      "mem" : {
        "heap_used" : "49.9gb",
        "heap_used_in_bytes" : 53627010496,
        "heap_max" : "102.5gb",
        "heap_max_in_bytes" : 110133903360
      },
      "threads" : 2243
    },
    "fs" : {
      "total" : "12.8tb",
      "total_in_bytes" : 14162062884864,
      "free" : "10.5tb",
      "free_in_bytes" : 11643901661184,
      "available" : "10.3tb",
      "available_in_bytes" : 11345782673408
    },
    "plugins" : [
      {
        "name" : "x-pack",
        "version" : "5.6.8",
        "description" : "Elasticsearch Expanded Pack Plugin",
        "classname" : "org.elasticsearch.xpack.XPackPlugin",
        "has_native_controller" : true
      }
    ],
    "network_types" : {
      "transport_types" : {
        "netty4" : 10
      },
      "http_types" : {
        "netty4" : 10
      }
    }
  }
}

real    0m6.229s
user    0m0.004s
sys     0m0.004s

Regards,
Grégoire

Hello,

Does anyone have an idea what the issue could be? Is there anything I can provide to get feedback? I went through all the changes between 5.5 and 5.6 without finding what could be causing the issue. Do I have any option other than downgrading?

Regards,
Grégoire

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.