Unresponsive Cluster


(Fernando) #1

I'm having issues with my cluster. The cluster has 3 master nodes and 15 other nodes: 13 are data nodes and 2 are edge nodes. The cluster was working before, and I made a change in one of the templates that we use.
The original config looks like:

{
"template" : "servers-*",
"settings" : {
"index.refresh_interval" : "5s",
"index.routing.allocation.require.node_type": "hot",
"analysis" : {
"analyzer" : {
"default" : {
"type" : "standard",
"stopwords" : "none"
}
}
}
},
"mappings" : {
"default" : {
"_all" : {"enabled" : true},
"dynamic_templates" : [
{
"string_fields" : {
"match" : "*",
"match_mapping_type" : "string",
"mapping" : {
"type" : "multi_field",
"fields" : {
"{name}" : {"type": "string", "index" : "analyzed", "omit_norms" : true, "index_options" : "docs"}
}
}
}
} ],
"properties" : {
"@version": { "type": "string", "index": "not_analyzed" },
"geoip" : {
"type" : "object",
"dynamic": true,
"properties" : {
"ip" : { "type": "ip" },
"location" : { "type" : "geo_point" }
}
},
"tags": { "type": "string", "index": "not_analyzed" },
"pid": { "type": "long", "index": "not_analyzed" },
"priority": { "type": "integer", "index": "not_analyzed" },
"severity": { "type": "integer", "index": "not_analyzed" },
"facility": { "type": "integer", "index": "not_analyzed" },
"syslog_severity_code": { "type": "integer", "index": "not_analyzed" },
"syslog_facility_code": { "type": "integer", "index": "not_analyzed" }
}
}
}
}

My changes:

{
"template" : "servers-*",
"settings" : {
"index.refresh_interval" : "5s",
"index.routing.allocation.require.node_type": "hot",
"analysis" : {
"analyzer" : {
"default" : {
"type" : "standard",
"stopwords" : "none"
}
}
}
},
"mappings" : {
"default" : {
"_all" : {"enabled" : true},
"dynamic_templates" : [
{
"beat" : {
"path_match" : "beat.*",
"match_mapping_type" : "string",
"mapping" : {
"type" : "string",
"index" : "not_analyzed",
"omit_norms" : true,
"index_options" : "offsets"
}
}
},
{
"string_fields" : {
"match" : "*",
"match_mapping_type" : "string",
"mapping" : {
"type" : "multi_field",
"fields" : {
"{name}" : {"type": "string", "index" : "analyzed", "omit_norms" : true, "index_options" : "docs"}
}
}
}
} ],
"properties" : {
"@version": { "type": "string", "index": "not_analyzed" },
"geoip" : {
"type" : "object",
"dynamic": true,
"properties" : {
"ip" : { "type": "ip" },
"location" : { "type" : "geo_point" }
}
},
"tags": { "type": "string", "index": "not_analyzed" },
"pid": { "type": "long", "index": "not_analyzed" },
"priority": { "type": "integer", "index": "not_analyzed" },
"severity": { "type": "integer", "index": "not_analyzed" },
"facility": { "type": "integer", "index": "not_analyzed" },
"syslog_severity_code": { "type": "integer", "index": "not_analyzed" },
"syslog_facility_code": { "type": "integer", "index": "not_analyzed" }
}
}
}
}

Since then the cluster has been degraded to the point that I had to restart it a few times, but it never recovers completely. Is there a way I can restore my cluster? I can't delete the new indices or perform any kind of action, because I always get the following error:

"error": {
"root_cause": [
{
"type": "master_not_discovered_exception",
"reason": null
}
],
"type": "master_not_discovered_exception",
"reason": null
},
"status": 503

The cluster seems to be crazy busy trying to recover indices or something else. Any suggestion of what I can do? We are talking about 1000+ indices, some of them as big as 160GB.


(Fernando) #2

Here is a sample of the logs:

[2016-11-04 23:09:28,984][DEBUG][action.admin.indices.mapping.put] [dwh-head003.sing1.com] failed to put mappings on indices [[servers-forecaster_aggregator-2016.11.04]], type [servers]
ProcessClusterEventTimeoutException[failed to process cluster event (put-mapping [servers]) within 30s]
at org.elasticsearch.cluster.service.InternalClusterService$2$1.run(InternalClusterService.java:349)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:744)


(Mark Walkom) #3

What version are you on?
Do you have Monitoring in place?


(Fernando) #4

{
"name": "edge001.sing1.com",
"cluster_name": "prodexpansion",
"version": {
"number": "2.3.1",
"build_hash": "bd980929010aef404e7cb0843e61d0665269fc39",
"build_timestamp": "2016-04-04T12:25:05Z",
"build_snapshot": false,
"lucene_version": "5.5.0"
},
"tagline": "You Know, for Search"
}


(Fernando) #5

We are using version 2.3.1


(Fernando) #6

I have also another question:

Why is that when I curl -XGET http://head001.sing1.com:9200/_cluster/settings, I get

{
"persistent": {},
"transient": {}
}

If the config file show:

--
action:
  disable_delete_all_indices: 1
bootstrap:
  mlockall: true
cluster:
  info:
    update:
      interval: 1m
  name: prod_sing
  routing:
    allocation:
      cluster_concurrent_rebalance: 25
      disk:
        threshold_enabled: true
        watermark:
          high: 90%
          low: 80%
discovery:
  zen:
    minimum_master_nodes: 2
    ping:
      multicast:
        enabled: false
      unicast:
        hosts:
             - kdb012.sing1.com
             - kdb013.sing1.com
             - kdb014.sing1.com
             - kdb015.sing1.com
             - kdb016.sing1.com
             - kdb017.sing1.com
             - kdb018.sing1.com
             - kdb019.sing1.com
             - edge002.sing1.com
gateway:
  recover_after_nodes: 2
http:
  cors:
    allow-origin: /.*/
    enabled: true
  enabled: true
  port: 9200
index:
  fielddata:
    cache:
      size: 40%
  number_of_replicas: 1
  number_of_shards: 15
  translog:
    flush_threshold_ops: 50000
indices:
  memory:
    index_buffer_size: 50%
network:
  bind_host: 0.0.0.0
  publish_host: 10.0.0.1
node:
  data: true
  master: false
  name: kdb006.sing1.com-hot
  node_type: hot
path:
  data: /data/hot
threadpool:
  index:
    queue_size: 200
    size: 24
    type: fixed
  management:
    size: 20
  search:
    queue_size: 100
    size: 8
    type: fixed
transport:
  port: 9300
watcher:
  actions:
    email:
      service:
        account:
          email_account:
            smtp:
              host: prodmail.sing1.com

Shouldn't it be showing those settings? I do see them when I query at the node level:

curl -XGET http://kdb006.sing1.com:9200/_nodes/head001.sing1.com/settings

{
"cluster_name": "prod_sing1",
"nodes": {
"mxBhAVAaQ1S2HlsE9DHzHg": {
"name": "head001.sing1com",
"transport_address": "10.0.0.22:9300",
"host": "10.0.0.2",
"ip": "10.0.0.2",
"version": "2.3.3",
"build": "218bdf1",
"http_address": "10.0.0.2:9200",
"attributes": {
"data": "false",
"master": "true"
},
"settings": {
"index": {
"fielddata": {
"cache": {
"size": "40%"
}
},
"number_of_replicas": "1",
"translog": {
"flush_threshold_ops": "50000"
},
"number_of_shards": "15"
},
"bootstrap": {
"mlockall": "true"
},
"client": {
"type": "node"
},
"gateway": {
"recover_after_nodes": "2"
},
"pidfile": "/var/run/elasticsearch/elasticsearch-prod_sing1.pid",
"network": {
"bind_host": "0.0.0.0",
"publish_host": "10.0.0.2"
},
"threadpool": {
"search": {
"type": "fixed",
"queue_size": "100"
},
"management": {
"size": "20"
},
"index": {
"type": "fixed",
"queue_size": "200"
}
},
"node": {
"data": "false",
"master": "true",
"name": "head001.sing1.com"
},
"http": {
"enabled": "true",
"cors": {
"enabled": "true",
"allow-origin": "/.*/"
}
},
"name": "head001.sing1.com",
"path": {
"data": "/data/warm",
"home": "/usr/share/elasticsearch",
"conf": "/etc/elasticsearch/prod_dwh_expansion",
"logs": "/var/log/elasticsearch/prod_dwh_expansion"
},
"action": {
"disable_delete_all_indices": "1"
},
"config": {
"ignore_system_properties": "true"
},
"cluster": {
"routing": {
"allocation": {
"cluster_concurrent_rebalance": "25",
"disk": {
"watermark": {
"low": "80%",
"high": "90%"
},
"threshold_enabled": "true"
}
}
},
"name": "prod_sing1",
"info": {
"update": {
"interval": "1m"
}
}
},
"indices": {
"memory": {
"index_buffer_size": "50%"
}
},
"watcher": {
"actions": {
"email": {
"service": {
"account": {
"email_account": {
"smtp": {
"host": "prodmail.com"
}
}
}
}
}
}
},
"discovery": {
"zen": {
"minimum_master_nodes": "2",
"ping": {
"unicast": {
"hosts": [
"kdb001.sing1.com",
"kdb002.sing1.com",
"kdb003.sing1.com",
"kdb004.sing1.com",
"kdb005.sing1.com",
"head001.sing1.com",
"head002.sing1.com",
"edge001.sing1.com",
 "edge002.sing1.com"
]
},
"multicast": {
"enabled": "false"
}
}
}
},
"foreground": "false"
}
}
}
}

(Mark Walkom) #7

How many shards? How many nodes?

It does that in 5.0.


(Fernando) #8
"number_of_nodes": 29,
"number_of_data_nodes": 24,
"active_primary_shards": 21076,
"active_shards": 40180,
"relocating_shards": 0,
"initializing_shards": 2,
"active_shards_percent_as_number": 95.32169292085784

(Mark Walkom) #9

The problem is you have too many shards. You need to decrease that, pretty dramatically.


(Fernando) #10

Thanks, yes, we decreased the number.


(system) #11