How to manage shards and indexes on Production cluster?

Greetings,

We use a 12-node ELK dockerised stack to index our webapps and access logs. We create one index per app per day. webapps logs are stored 90 days, access_logs shall be available "forever".

Currently, we are facing the maximum-open-shards issue (12,000 shards at present) and I am failing to understand what must be done to overcome this problem.

I'm watching https://www.elastic.co/fr/elasticon/conf/2016/sf/quantitative-cluster-sizing and from what I understand, we should have only 40 to 60 shards in our current deployment, and should have bigger shards.
I am failing to see how I could make these configuration changes and apply them to the current Production cluster.
Since we've reached the maximum shard count, Logstash can no longer write any new logs into Elasticsearch, which is an issue.

I'm attaching our cluster stats to the post.

Best regards,

    {
      "_nodes" : {
        "total" : 12,
        "successful" : 12,
        "failed" : 0
      },
      "cluster_name" : "docker-cluster",
      "cluster_uuid" : "1mf2s9I0T06Gh3JG3HlLLA",
      "timestamp" : 1600079162017,
      "status" : "green",
      "indices" : {
        "count" : 5833,
        "shards" : {
          "total" : 11666,
          "primaries" : 5833,
          "replication" : 1.0,
          "index" : {
            "shards" : {
              "min" : 2,
              "max" : 2,
              "avg" : 2.0
            },
            "primaries" : {
              "min" : 1,
              "max" : 1,
              "avg" : 1.0
            },
            "replication" : {
              "min" : 1.0,
              "max" : 1.0,
              "avg" : 1.0
            }
          }
        },
        "docs" : {
          "count" : 934977833,
          "deleted" : 6666495
        },
        "store" : {
          "size_in_bytes" : 1556691445657
        },
        "fielddata" : {
          "memory_size_in_bytes" : 11182528,
          "evictions" : 0
        },
        "query_cache" : {
          "memory_size_in_bytes" : 1847097498,
          "total_count" : 2512950387,
          "hit_count" : 298055135,
          "miss_count" : 2214895252,
          "cache_size" : 77112,
          "cache_count" : 228827,
          "evictions" : 151715
        },
        "completion" : {
          "size_in_bytes" : 0
        },
        "segments" : {
          "count" : 69264,
          "memory_in_bytes" : 4034102015,
          "terms_memory_in_bytes" : 3076207598,
          "stored_fields_memory_in_bytes" : 770449528,
          "term_vectors_memory_in_bytes" : 0,
          "norms_memory_in_bytes" : 13440,
          "points_memory_in_bytes" : 119741945,
          "doc_values_memory_in_bytes" : 67689504,
          "index_writer_memory_in_bytes" : 149050876,
          "version_map_memory_in_bytes" : 0,
          "fixed_bit_set_memory_in_bytes" : 35090488,
          "max_unsafe_auto_id_timestamp" : 1600077901575,
          "file_sizes" : { }
        }
      },
      "nodes" : {
        "count" : {
          "total" : 12,
          "coordinating_only" : 0,
          "data" : 12,
          "ingest" : 12,
          "master" : 12,
          "ml" : 12,
          "voting_only" : 0
        },
        "versions" : [
          "7.4.0"
        ],
        "os" : {
          "available_processors" : 192,
          "allocated_processors" : 192,
          "names" : [
            {
              "name" : "Linux",
              "count" : 12
            }
          ],
          "pretty_names" : [
            {
              "pretty_name" : "CentOS Linux 7 (Core)",
              "count" : 12
            }
          ],
          "mem" : {
            "total_in_bytes" : 1622289432576,
            "free_in_bytes" : 32867880960,
            "used_in_bytes" : 1589421551616,
            "free_percent" : 2,
            "used_percent" : 98
          }
        },
        "process" : {
          "cpu" : {
            "percent" : 23
          },
          "open_file_descriptors" : {
            "min" : 4107,
            "max" : 14601,
            "avg" : 10397
          }
        },
        "jvm" : {
          "max_uptime_in_millis" : 346634822,
          "versions" : [
            {
              "version" : "13",
              "vm_name" : "OpenJDK 64-Bit Server VM",
              "vm_version" : "13+33",
              "vm_vendor" : "AdoptOpenJDK",
              "bundled_jdk" : true,
              "using_bundled_jdk" : true,
              "count" : 12
            }
          ],
          "mem" : {
            "heap_used_in_bytes" : 55705524856,
            "heap_max_in_bytes" : 101769019392
          },
          "threads" : 2708
        },
        "fs" : {
          "total_in_bytes" : 47241260089344,
          "free_in_bytes" : 28102805385216,
          "available_in_bytes" : 25702791778304
        },
        "plugins" : [ ],
        "network_types" : {
          "transport_types" : {
            "security4" : 12
          },
          "http_types" : {
            "security4" : 12
          }
        },
        "discovery_types" : {
          "zen" : 12
        },
        "packaging_types" : [
          {
            "flavor" : "default",
            "type" : "docker",
            "count" : 12
          }
        ]
      }
    }

There are a couple of common ways to reduce shard count. You could switch to weekly indices instead of daily, or even use ILM to have each index cover a flexible period and roll over to new indices based on size. Another way is to consolidate multiple apps into a single index. If all your indices have just a single primary shard, you will not be able to use the shrink index API, so you will most likely need to reindex your data.

1 Like

I have cleared some indexes that weren't relevant and we managed to recover a third of the shards.

Now I will apply a lifecycle policy based on our expected growth over the coming years.

Thank you for your reply. We will definitely review our index creation, moving from daily to weekly, or even monthly, indices.

Best Regards

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.