Elasticsearch on Windows Server: high split I/O and service restarts

Hello,

We are using an app that has an Elasticsearch database. The version is old and we have been seeing a lot of issues lately. The split I/O increases to 50%+ and the service becomes unresponsive. Below are the cluster details. Are there any settings that are obviously incorrect or need changing in order to improve performance?

{
  "_nodes" : {
    "total" : 1,
    "successful" : 1,
    "failed" : 0
  },
  "cluster_name" : "Name",
  "timestamp" : 1673623125030,
  "status" : "green",
  "indices" : {
    "count" : 8,
    "shards" : {
      "total" : 16,
      "primaries" : 16,
      "replication" : 0.0,
      "index" : {
        "shards" : {
          "min" : 2,
          "max" : 2,
          "avg" : 2.0
        },
        "primaries" : {
          "min" : 2,
          "max" : 2,
          "avg" : 2.0
        },
        "replication" : {
          "min" : 0.0,
          "max" : 0.0,
          "avg" : 0.0
        }
      }
    },
    "docs" : {
      "count" : 4337795846,
      "deleted" : 24298960
    },
    "store" : {
      "size" : "873.7gb",
      "size_in_bytes" : 938194540063,
      "throttle_time" : "0s",
      "throttle_time_in_millis" : 0
    },
    "fielddata" : {
      "memory_size" : "0b",
      "memory_size_in_bytes" : 0,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size" : "295.3mb",
      "memory_size_in_bytes" : 309741592,
      "total_count" : 180086,
      "hit_count" : 177991,
      "miss_count" : 2095,
      "cache_size" : 163,
      "cache_count" : 163,
      "evictions" : 0
    },
    "completion" : {
      "size" : "0b",
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 426,
      "memory" : "1.7gb",
      "memory_in_bytes" : 1862916846,
      "terms_memory" : "1gb",
      "terms_memory_in_bytes" : 1090071670,
      "stored_fields_memory" : "538.8mb",
      "stored_fields_memory_in_bytes" : 565008680,
      "term_vectors_memory" : "0b",
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory" : "53kb",
      "norms_memory_in_bytes" : 54272,
      "points_memory" : "195.1mb",
      "points_memory_in_bytes" : 204577440,
      "doc_values_memory" : "3mb",
      "doc_values_memory_in_bytes" : 3204784,
      "index_writer_memory" : "5.2mb",
      "index_writer_memory_in_bytes" : 5490992,
      "version_map_memory" : "1.5mb",
      "version_map_memory_in_bytes" : 1644944,
      "fixed_bit_set" : "520mb",
      "fixed_bit_set_memory_in_bytes" : 545340016,
      "max_unsafe_auto_id_timestamp" : -1,
      "file_sizes" : { }
    }
  },
  "nodes" : {
    "count" : {
      "total" : 1,
      "data" : 1,
      "coordinating_only" : 0,
      "master" : 1,
      "ingest" : 1
    },
    "versions" : [
      "5.1.1"
    ],
    "os" : {
      "available_processors" : 8,
      "allocated_processors" : 8,
      "names" : [
        {
          "name" : "Windows Server 2016",
          "count" : 1
        }
      ],
      "mem" : {
        "total" : "31.9gb",
        "total_in_bytes" : 34359132160,
        "free" : "14.8gb",
        "free_in_bytes" : 15916871680,
        "used" : "17.1gb",
        "used_in_bytes" : 18442260480,
        "free_percent" : 46,
        "used_percent" : 54
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 26
      },
      "open_file_descriptors" : {
        "min" : -1,
        "max" : -1,
        "avg" : 0
      }
    },
    "jvm" : {
      "max_uptime" : "8h",
      "max_uptime_in_millis" : 28806467,
      "versions" : [
        {
          "version" : "1.8.0_271",
          "vm_name" : "Java HotSpot(TM) 64-Bit Server VM",
          "vm_version" : "25.271-b09",
          "vm_vendor" : "Oracle Corporation",
          "count" : 1
        }
      ],
      "mem" : {
        "heap_used" : "5.8gb",
        "heap_used_in_bytes" : 6327084720,
        "heap_max" : "11.9gb",
        "heap_max_in_bytes" : 12815171584
      },
      "threads" : 71
    },
    "fs" : {
      "total" : "1.3tb",
      "total_in_bytes" : 1539191074816,
      "free" : "223.4gb",
      "free_in_bytes" : 239884152832,
      "available" : "223.4gb",
      "available_in_bytes" : 239884152832
    },
    "plugins" : [
      {
        "name" : "readonlyrest",
        "version" : "1.16.16-pre2",
        "description" : "Safely expose Elasticsearch REST API",
        "classname" : "tech.beshu.ror.es.ReadonlyRestPlugin"
      }
    ],
    "network_types" : {
      "transport_types" : {
        "local" : 1
      },
      "http_types" : {
        "ssl_netty4" : 1
      }
    }
  }
}

Welcome to our community! :smiley:

5.X is very old and EOL, you should be looking at upgrading as a matter of urgency. You will also struggle to get support here given its age.

What sort of data is this? What sort of use case?

Thank you for your reply.
Yes indeed it is very old. Vendor has not upgraded it even in their applications 2023 release.
The application is File Access Manager from SailPoint, which collects user access activity on network shares and other locations. In our case, the events collected are file access logs including file path, user name, timestamp, etc., and the rate of receiving such events is more or less steady — 600 per second, with very few spikes going up to 2500 events per second. We have had various issues with the application and its handling of ES, so we wanted to check if there are any obvious issues in the basic Elasticsearch configuration (which is fully controlled by the application). Also, there is a daily deletion task that deletes events older than X days from Elasticsearch. Further, the application creates a new index per month. In our case, typically each index has 250 GB of data, and at a given time the entire cluster has a max. of 900 GB of total data across 3-4 indices (the others have data in KBs). Also, data is queried via the application (to create reports or filter events) by only 2 users. We have another, larger (50x the size of this single-node app) multi-node Elasticsearch cluster that handles data very efficiently, which makes me think there is possibly something terribly misconfigured in this application.

That's seriously inefficient.

But still deletes daily? Very weird. They would be better off doing weekly indices and then deleting those entirely.

You would be best off increasing your heap size if you can. You have a lot of data for a heap that size (and a version that old).

Thank you!
I will try increasing the heap size.
Also, a clarification: the app creates a new index for each new month. The deletion task runs daily and deletes only events older than X days (90 in our case), as we want to keep the past 90 days / 3 months of data. Do you think changing the schedule, number of days, etc. would possibly make any impact — e.g. keeping 4 full indices (4 months of data) at a given time and then deleting the entire index for the 1st month at the end of the 4th month?