Large Clusters - 35 nodes - ILM Issues

Hi, I'm struggling a little bit with Elasticsearch ILM configuration. Version: 7.4.2.

I have a total of 35 nodes:

3 - master nodes
2 - coordinator nodes
8 - hot nodes
20 - warm nodes
2 - cold nodes

Warm Nodes have:

  • 8 x 1.8TB of storage each,
  • Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz (2 chips, 80 cores)
  • 376.35 GB physical memory, 4.00 GB swap

Hot Nodes have:

  • 10 x SSD disks with 1.8TB of storage each,
  • Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz (2 chips, 80 cores)
  • 376.35 GB physical memory, 4.00 GB swap

Master and Coordinator Nodes have:

  • 1 data disk with 1.8TB in raid 1
  • Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz (2 chips, 80 cores)
  • 376.35 GB physical memory, 4.00 GB swap

Cold Nodes have:

  • 12 HDD drives of 3.7 TB space
  • Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz (2 chips, 80 cores)
  • 376.35 GB physical memory, 4.00 GB swap

Currently I have two main pipelines through Logstash, one for NetFlow and the other for server/app logs. Our flow goes: Filebeat -> Kafka -> Logstash -> Elasticsearch.

The server/app logs, depending on the location, generate about 800 GB of logs daily.
The logstash logs I'm sending (/var/log/messages, /var/log/secure) produce about 200 GB of logs daily.

So I have 8 main indices:

  • app_ilm_sjc2

  • app_ilm_ams1

  • app_ilm_atl1

  • app_ilm_hkg1

  • logstash_ilm_sjc2

  • logstash_ilm_ams1

  • logstash_ilm_atl1

  • logstash_hkg1

Each of them is created with a date suffix, like logstash_ilm_hkg1-2020.04.05.
For the app logs I have created the indices with 24 shards each and one replica. I'm using dynamic templates which I specify from Logstash; here is a sample of one of the templates:

{
  "logstash_ilm_sjc2" : {
    "order" : 0,
    "index_patterns" : [
      "logstash_ilm_sjc2-*"
    ],
    "settings" : {
      "index" : {
        "lifecycle" : {
          "name" : "logstash_ilm_policy_sjc2",
          "rollover_alias" : "logstash_ilm_sjc2"
        },
        "refresh_interval" : "10s",
        "analysis" : {
          "analyzer" : {
            "default" : {
              "type" : "standard",
              "stopwords" : "_none_"
            }
          }
        },
        "number_of_shards" : "16",
        "number_of_replicas" : "1"
      }
    },
    "mappings" : {
      "dynamic_templates" : [
        {
          "command" : {
            "mapping" : {
              "norms" : false,
              "index" : true,
              "type" : "text",
              "index_options" : "offsets"
            },
            "match_mapping_type" : "string",
            "match" : "command"
          }
        },
        {
          "arguments" : {
            "mapping" : {
              "norms" : false,
              "index" : true,
              "type" : "text",
              "index_options" : "offsets"
            },
            "match_mapping_type" : "string",
            "match" : "arguments"
          }
        },
        {
          "changes" : {
            "mapping" : {
              "norms" : false,
              "index" : true,
              "type" : "text",
              "index_options" : "offsets"
            },
            "match_mapping_type" : "string",
            "match" : "changes"
          }
        },
        {
          "message_field" : {
            "mapping" : {
              "norms" : false,
              "index" : true,
              "type" : "text",
              "index_options" : "offsets"
            },
            "match_mapping_type" : "string",
            "match" : "message"
          }
        },
        {
          "string_fields" : {
            "mapping" : {
              "fields" : {
                "{name}" : {
                  "norms" : false,
                  "index" : true,
                  "type" : "text",
                  "index_options" : "offsets"
                }
              }
            },
            "match_mapping_type" : "string",
            "match" : "*"
          }
        }
      ],
      "properties" : {
        "severity" : {
          "index" : true,
          "type" : "keyword"
        },
        "geoip" : {
          "dynamic" : true,
          "type" : "object",
          "properties" : {
            "ip" : {
              "type" : "ip"
            },
            "location" : {
              "type" : "geo_point"
            }
          }
        },
        "syslog_severity_code" : {
          "index" : true,
          "type" : "keyword"
        },
        "@version" : {
          "index" : true,
          "type" : "keyword"
        },
        "syslog_facility_code" : {
          "index" : true,
          "type" : "keyword"
        },
        "pid" : {
          "index" : true,
          "type" : "keyword"
        },
        "priority" : {
          "index" : true,
          "type" : "keyword"
        },
        "facility" : {
          "index" : true,
          "type" : "keyword"
        },
        "tags" : {
          "index" : true,
          "type" : "keyword"
        }
      }
    },
    "aliases" : { }
  }
}

and I have set my ILM policy to:

  "logstash_ilm_policy_sjc2" : {
    "version" : 1,
    "modified_date" : "2020-04-23T20:04:20.968Z",
    "policy" : {
      "phases" : {
        "warm" : {
          "min_age" : "30d",
          "actions" : {
            "allocate" : {
              "number_of_replicas" : 1,
              "include" : { },
              "exclude" : { },
              "require" : {
                "box_type" : "warm"
              }
            },
            "forcemerge" : {
              "max_num_segments" : 1
            },
            "set_priority" : {
              "priority" : 50
            },
            "shrink" : {
              "number_of_shards" : 8
            }
          }
        },
        "cold" : {
          "min_age" : "90d",
          "actions" : {
            "allocate" : {
              "number_of_replicas" : 1,
              "include" : { },
              "exclude" : { },
              "require" : {
                "box_type" : "cold"
              }
            },
            "set_priority" : {
              "priority" : 0
            }
          }
        },
        "hot" : {
          "min_age" : "0ms",
          "actions" : {
            "rollover" : {
              "max_size" : "100gb",
              "max_age" : "7d"
            },
            "set_priority" : {
              "priority" : 100
            }
          }
        },
        "delete" : {
          "min_age" : "91d",
          "actions" : {
            "delete" : { }
          }
        }
      }
    }
  }

Each index, when created, gets 24 shards for app and 16 for logstash, with 1 replica.
I'm not sure if this is the right shard configuration.

My first question is:
Does the shard configuration look excessive? Would it be better to have just one shard and let it grow all the way to a max size of about 900 GB? How would that affect the cluster?
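
Before changing anything, I'd like to sanity-check the actual on-disk shard sizes. Is something like this _cat query a reasonable way to do that? (The sjc2 logstash pattern is just an example.)

```
# List shards for one index pattern, largest store first,
# to see how big each primary/replica actually is on disk.
GET _cat/shards/logstash_ilm_sjc2-*?v&h=index,shard,prirep,store&s=store:desc
```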

Some of the issues I have experienced lately are: an index gets created but has no documents, some of the indices are created without the date, or the policy doesn't get applied:

if [type] == "xxx" and [index_name] {
  elasticsearch {
    index               => "logstash_ilm_atl1-%{+YYYY.MM.dd}"
    hosts               => ["app1417.atl1.com:9200","app1431.atl1.com:9200"]
    manage_template     => true
    template_name       => "logstash_ilm_atl1"
    template            => "/etc/logstash/templates/ilm_logstash.json"
    template_overwrite  => true
    ilm_rollover_alias  => "logstash_ilm_atl1"
    ilm_policy          => "logstash_ilm_policy_atl1"
  }
}
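
One thing I'm not sure about: with ilm_rollover_alias set, does the write alias and its first backing index need to be bootstrapped up front? Here is a rough sketch of what I mean (the index name and date are just placeholders based on the atl1 names above):

```
# Create the first backing index and point the rollover alias at it,
# so ILM rollover has a write index to work against.
# The date here is only a placeholder.
PUT logstash_ilm_atl1-2020.07.08-000001
{
  "aliases": {
    "logstash_ilm_atl1": {
      "is_write_index": true
    }
  }
}
```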

OK, so far things were working, but yesterday we had some issues when the cluster entered a blocked state. ILM reported:

  • reason : Values less than -1 bytes are not supported: -235741566572b

and Logstash reported:
- [2020-07-08T08:54:51,862][ERROR][logstash.outputs.elasticsearch] Encountered a retryable error. Will Retry with exponential backoff {:code=>400, :url=>"http://app1431.atl1.turn.com:9200/_bulk"}

Elasticsearch reported:

  • Elasticsearch::Transport::Transport::Errors::Forbidden: [403] {"error":{"root_cause":[{"type":"cluster_block_exception","reason":"blocked by: [FORBIDDEN/12/index read-only / allow delete (api)];"}],"type":"cluster_block_exception","reason":"blocked by: [FORBIDDEN/12/index read-only / allow delete (api)];"},"status":403}

So I checked some of the suggestions here that refer to storage, and I applied these changes to fix the situation quickly:

```
{
  "transient" : {
    "cluster" : {
      "routing" : {
        "allocation" : {
          "disk" : {
            "threshold_enabled" : "false"
          },
          "enable" : "all"
        }
      }
    },
    "indices" : {
      "recovery" : {
        "max_bytes_per_sec" : "200mb",
        "max_concurrent_file_chunks" : "5"
      }
    }
  }
}
```
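
In case the read-only block doesn't clear on its own, I assume it also needs to be removed on the indices themselves. Is something like this the right way to do it?

```
# Clear the flood-stage read-only block on all indices
# so writes can resume once disk usage is back under the watermark.
PUT _all/_settings
{
  "index.blocks.read_only_allow_delete": null
}
```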

That fixed it, but then the indices stopped getting data, and ILM for some of them is not working, even when I delete the index and let it be recreated.
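
Is the ILM explain API the right way to dig into why the policy isn't progressing? Something along these lines (the second index name is just a placeholder for whichever index the explain output flags):

```
# Show which ILM phase/step each matching index is in, and any error.
GET logstash_ilm_atl1-*/_ilm/explain

# For an index stuck in the ERROR step, retry the failed step.
# Replace the index name with the one reported by the explain call.
POST logstash_ilm_atl1-2020.07.08-000001/_ilm/retry
```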

Should I stop everything, delete all data, and start over? Also, here is my config:

cluster:
  name: production_app
  info:
    update:
      interval: 1m
  routing:
    allocation:
      cluster_concurrent_rebalance: 25
      disk:
        threshold_enabled: true
        watermark:
          high: 90%
          low:  80%
  initial_master_nodes:
    - app1416.atl1.com
    - app1430.atl1.com
    - app1450.atl1.com
node:
  name: app1430.atl1.turn.com
  master: true
  data: false
  ingest: false
  ml: false
xpack.ml.enabled: true
cluster.remote.connect: false
path:
  repo: ["/elastic_amobee_snaps/production_snap"]
  logs: /var/log/elasticsearch/production_amobee
  data: /data1/es_data
bootstrap:
  memory_lock: true
network:
  bind_host: 0.0.0.0
  publish_host: 10.16.106.148
http:
  cors:
    allow-origin: /.*/
    enabled: true
  port: 9200
discovery:
  seed_hosts:
    - app1416.atl1.com
    - app1417.atl1.com
    - app1418.atl1.com
    - app1419.atl1.com
    - app1420.atl1.com
    - app1421.atl1.com
    - app1422.atl1.com
    - app1423.atl1.com
    - app1424.atl1.com
    - app1425.atl1.com
    - app1426.atl1.com
    - app1427.atl1.com
    - app1428.atl1.com
    - app1429.atl1.com
    - app1430.atl1.com
    - app1431.atl1.com
    - app1432.atl1.com
    - app1433.atl1.com
    - app1434.atl1.com
    - app1435.atl1.com
    - app1436.atl1.com
    - app1437.atl1.com
    - app1438.atl1.com
    - app1439.atl1.com
    - app1440.atl1.com
    - app1441.atl1.com
    - app1442.atl1.com
    - app1443.atl1.com
    - app1444.atl1.com
    - app1445.atl1.com
    - app1446.atl1.com
    - app1447.atl1.com
    - app1448.atl1.com
    - app1449.atl1.com
    - app1450.atl1.com
gateway:
  recover_after_nodes: 2
thread_pool:
  write:
    queue_size: 200
    size: 80
  search:
    queue_size: 1000
    size: 121
transport:
  port: 9300
indices:
  query:
    bool:
      max_clause_count: 8192
  memory:
    index_buffer_size: 50%
  fielddata:
    cache:
      size:  40%
#index:
#  number_of_replicas: 1
#  number_of_shards: 20
#  translog:
#    flush_threshold_ops: 50000
action:
  destructive_requires_name: true
search:
  max_buckets: 250000

If someone can help me with this, that would be great, because I'm going crazy.

That "less than negative one" bit is a bug that I think was fixed. For that, at least, you should upgrade.

I don't know ILM super well, so I can't really advise on that bit.

Thank you. I will look into that.
