Divide by Zero Error upgrading a deployment

Hi,

I'm getting the following error when upgrading a deployment (version 6.5.4 to 6.7.2 on ECE 2.2.2):

Unexpected error during step: [rolling-upgrade]: [java.lang.ArithmeticException: / by zero]

Any idea why this error may be occurring? When I look in the logs (logging and metrics logs), the deployment has no errors.

Here's the plan I'm applying:

{
  "tiebreaker_topology": {
    "memory_per_node": 1024
  },
  "elasticsearch": {
    "version": "6.7.2",
    "system_settings": {
      "use_disk_threshold": true
    },
    "user_settings_yaml": "redacted"
  },
  "transient": {
    "strategy": {
      "grow_and_shrink": {}
    },
    "plan_configuration": {
      "preferred_allocators": [],
      "max_snapshot_attempts": 3,
      "move_allocators": [],
      "skip_snapshot": false,
      "move_instances": [],
      "skip_post_upgrade_steps": false,
      "extended_maintenance": false,
      "skip_upgrade_checker": false,
      "override_failsafe": false,
      "skip_data_migration": false,
      "calm_wait_time": 5,
      "reallocate_instances": false,
      "timeout": 4096,
      "max_snapshot_age": 300,
      "move_only": false
    }
  },
  "cluster_topology": [
    {
      "memory_per_node": 1024,
      "node_type": {
        "master": true,
        "data": true,
        "ingest": false,
        "ml": false
      },
      "instance_configuration_id": "9c6147389f7d45d6b19c2e93f852cd49",
      "elasticsearch": {
        "system_settings": {
          "enable_close_index": false,
          "use_disk_threshold": true,
          "monitoring_collection_interval": -1,
          "monitoring_history_duration": "7d",
          "destructive_requires_name": false,
          "reindex_whitelist": [],
          "auto_create_index": true,
          "scripting": {
            "stored": {
              "enabled": true
            },
            "inline": {
              "enabled": true
            }
          },
          "http": {
            "compression": true,
            "cors_enabled": false,
            "cors_max_age": 1728000,
            "cors_allow_credentials": false
          }
        },
        "user_settings_yaml": " redacted "
      },
      "zone_count": 1,
      "node_count_per_zone": 1
    },
    {
      "size": {
        "value": 0,
        "resource": "memory"
      },
      "node_type": {
        "master": true,
        "data": true,
        "ingest": true,
        "ml": false
      },
      "instance_configuration_id": "data.default",
      "elasticsearch": {
        "system_settings": {
          "enable_close_index": false,
          "use_disk_threshold": true,
          "monitoring_collection_interval": -1,
          "monitoring_history_duration": "7d",
          "destructive_requires_name": false,
          "reindex_whitelist": [],
          "auto_create_index": true,
          "scripting": {
            "stored": {
              "enabled": true
            },
            "inline": {
              "enabled": true
            }
          },
          "http": {
            "compression": true,
            "cors_enabled": false,
            "cors_max_age": 1728000,
            "cors_allow_credentials": false
          }
        },
        "user_settings_yaml": " redacted "
      },
      "zone_count": 1
    },
    {
      "size": {
        "value": 0,
        "resource": "memory"
      },
      "node_type": {
        "master": true,
        "data": false,
        "ingest": false,
        "ml": false
      },
      "instance_configuration_id": "master",
      "elasticsearch": {
        "system_settings": {
          "enable_close_index": false,
          "use_disk_threshold": true,
          "monitoring_collection_interval": -1,
          "monitoring_history_duration": "7d",
          "destructive_requires_name": false,
          "reindex_whitelist": [],
          "auto_create_index": true,
          "scripting": {
            "stored": {
              "enabled": true
            },
            "inline": {
              "enabled": true
            }
          },
          "http": {
            "compression": true,
            "cors_enabled": false,
            "cors_max_age": 1728000,
            "cors_allow_credentials": false
          }
        },
        "user_settings_yaml": " redacted "
      },
      "zone_count": 1
    },
    {
      "size": {
        "value": 0,
        "resource": "memory"
      },
      "node_type": {
        "master": false,
        "data": false,
        "ingest": false,
        "ml": true
      },
      "instance_configuration_id": "ml",
      "elasticsearch": {
        "system_settings": {
          "enable_close_index": false,
          "use_disk_threshold": true,
          "monitoring_collection_interval": -1,
          "monitoring_history_duration": "7d",
          "destructive_requires_name": false,
          "reindex_whitelist": [],
          "auto_create_index": true,
          "scripting": {
            "stored": {
              "enabled": true
            },
            "inline": {
              "enabled": true
            }
          },
          "http": {
            "compression": true,
            "cors_enabled": false,
            "cors_max_age": 1728000,
            "cors_allow_credentials": false
          }
        },
        "user_settings_yaml": " redacted "
      },
      "zone_count": 1
    }
  ],
  "deployment_template": {
    "id": "default"
  }
}

Many thanks

That's an interesting one!

I had a look around and couldn't see where that could be happening. The error you'd see would be in the service-constructor index in the logging and metrics (L+M) cluster - can you have a look around the timestamp of the failure to see if anything is in there?

The only oddity with the plan is that the data topology element is using `node_count_per_zone`/`memory_per_node`, whereas the others (the empty ones) are using `size`.

Maybe try replacing those 2 fields with:

"size": {
   "value": 1024, "resource": "memory"
},

but that's a big guess. There should be an exception in the constructor logs (or possibly even on the activity page for that cluster, under details), which will tell us what's actually happening.

Alex

1 Like

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.