Hello,
I have a cluster with 6 ES nodes, 3 Logstash nodes and a number of servers shipping logs with Filebeat. Versions are 6.8.1 for Filebeat, 6.6.1 for Logstash and 6.7.1 for Elasticsearch. Cluster state is green.
The load on our platform is increasing considerably and we have huge issues at the moment.
I'll try to describe the stack.
All servers are VMware VMs.
ES servers:
8 cores, 32 GB RAM, of which 15 GB is set as heap.
Indices are rotated daily (we can only keep data for 14 days due to GDPR).
Currently there are 191 indices containing 16 billion documents, with a total size of 12 TB.
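For context, shard counts and sizes per index can be listed with the _cat APIs (the IP below is just the data node from the elasticsearch.yml further down; any node works):

curl -s 'http://10.222.249.42:9200/_cat/indices?v&h=index,pri,rep,docs.count,store.size&s=store.size:desc'
curl -s 'http://10.222.249.42:9200/_cat/shards?v&h=index,shard,prirep,store,node&s=store:desc' | head -20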
ELASTICSEARCH:
Process:
elastic+ 6548 1 97 10:32 ? 03:55:37 /bin/java -Xms15g -Xmx15g -XX:+UseG1GC -XX:MaxGCPauseMillis=400 -XX:+AlwaysPreTouch -Xss1m -Djava.awt.headless=true -Dfile.encoding=UTF-8 -Djna.nosys=true -XX:-OmitStackTraceInFastThrow -Dio.netty.noUnsafe=true -Dio.netty.noKeySetOptimization=true -Dio.netty.recycler.maxCapacityPerThread=0 -Dlog4j.shutdownHookEnabled=false -Dlog4j2.disable.jmx=true -Djava.io.tmpdir=/var/elk/tmp/ -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/var/elk/es -Xlog:gc*,gc+age=trace,safepoint:file=/var/log/elasticsearch/gc.log:utctime,pid,tags:filecount=32,filesize=64m -Djava.locale.providers=COMPAT -Des.path.home=/usr/share/elasticsearch -Des.path.conf=/etc/elasticsearch -Des.distribution.flavor=default -Des.distribution.type=rpm -cp /usr/share/elasticsearch/lib/* org.elasticsearch.bootstrap.Elasticsearch -p /var/run/elasticsearch/elasticsearch.pid --quiet
elastic+ 6640 6548 0 10:32 ? 00:00:00 /usr/share/elasticsearch/modules/x-pack-ml/platform/linux-x86_64/bin/controller
settings from elasticsearch.yml:
cluster.name: adc-elk
node.name: pdbs250.grn.prd.itv.local
node.master: false
node.data: true
path.data: /var/elk/es/data
path.logs: /var/elk/es/logs
bootstrap.memory_lock: true
network.host: 10.222.249.42
transport.host: 10.222.249.42
transport.tcp.port: 9300
http.port: 9200
discovery.zen.ping.unicast.hosts: ["server1", "server2", "server3", "server4", "server5", "server6"]
discovery.zen.minimum_master_nodes: 1
thread_pool.search.queue_size: 10000
thread_pool.index.queue_size: 1000
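Given the raised queue sizes above, one thing worth checking on the data nodes is whether the search and write thread pools are actually queueing or rejecting work, e.g.:

curl -s 'http://10.222.249.42:9200/_cat/thread_pool/search,write?v&h=node_name,name,active,queue,rejected,completed'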
There are more indices in trouble (though not all of them), but this one is the most troublesome (and one of the two most heavily loaded). This is its index template:
{
"index_patterns": [
"agl-api-"
],
"settings": {
"index": {
"refresh_interval": "30s",
"number_of_shards": "5",
"number_of_replicas": "1"
}
},
"mappings": {
"doc": {
"dynamic_templates": [
{
"message_field": {
"path_match": "message",
"match_mapping_type": "string",
"mapping": {
"norms": false,
"type": "text"
}
}
},
{
"string_fields": {
"match": "",
"match_mapping_type": "string",
"mapping": {
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
},
"norms": false,
"type": "text"
}
}
}
],
"properties": {
"@timestamp": {
"type": "date"
},
"@version": {
"type": "keyword",
"norms": false
},
"additional-data": {
"type": "text",
"norms": false
},
"api-brand": {
"type": "keyword",
"norms": false
},
"api-query-string": {
"type": "text",
"norms": false
},
"api-url": {
"type": "text",
"norms": false
},
"api-tenant": {
"type": "keyword",
"norms": false
},
"api-version": {
"type": "keyword",
"norms": false
},
"api-platform": {
"type": "keyword",
"norms": false
},
"api-nanoservice": {
"type": "keyword",
"norms": false
},
"app-id": {
"type": "keyword",
"norms": false
},
"beat": {
"properties": {
"hostname": {
"type": "text",
"norms": false
},
"name": {
"type": "text",
"norms": false
},
"version": {
"type": "text",
"norms": false
}
}
},
"client-ip": {
"type": "ip"
},
"event-type": {
"type": "keyword",
"norms": false
},
"execution-time": {
"type": "integer"
},
"fields": {
"properties": {
"environment": {
"type": "keyword",
"norms": false
}
}
},
"host": {
"type": "keyword",
"norms": false
},
"http-verb": {
"type": "keyword",
"norms": false
},
"log-level": {
"type": "keyword",
"norms": false
},
"log-message": {
"type": "text",
"norms": false
},
"ms-error-message": {
"type": "text",
"norms": false
},
"ms-request-body": {
"type": "text",
"norms": false
},
"ms-result": {
"type": "keyword",
"norms": false
},
"ms-result-code": {
"type": "text",
"norms": false
},
"ms-url": {
"type": "text",
"norms": false
},
"ms-host": {
"type": "keyword",
"norms": false
},
"offset": {
"type": "long"
},
"platform": {
"type": "keyword",
"norms": false
},
"prospector": {
"properties": {
"type": {
"type": "text",
"norms": false
}
}
},
"session-id": {
"type": "text",
"norms": false
},
"source": {
"type": "text",
"norms": false
},
"tags": {
"type": "text",
"norms": false
},
"transaction-id": {
"type": "text",
"norms": false
},
"username": {
"type": "text",
"norms": false
},
"user-id": {
"type": "text",
"norms": false
}
}
}
}
}
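As a rough back-of-the-envelope calculation (assuming most of the daily indices follow a similar 5-primary/1-replica layout): 191 indices × 10 shards is roughly 1,900 shards spread over 6 data nodes, i.e. more than 300 shards per node, and if the 12 TB total includes replicas that averages out to only about 6 GB per shard.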
After the daily index rollover at 1 AM, everything works for about an hour and I see data from all 32 log shippers. After that it gradually declines until data from only one shipper trickles in, extremely slowly.
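When the slowdown sets in, a quick way to see whether the data nodes are under heap or CPU pressure:

curl -s 'http://10.222.249.42:9200/_cat/nodes?v&h=name,heap.percent,ram.percent,cpu,load_1m'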
LOGSTASH:
3 Logstash servers with 6 cores and 16 GB of RAM each.
Process:
logstash 35553 1 99 14:49 ? 00:01:08 /bin/java -Xms8g -Xmx8g -XX:+UseParNewGC -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=75 -XX:+UseCMSInitiatingOccupancyOnly -XX:+DisableExplicitGC -Djava.io.tmpdir=/var/tmp -Djava.awt.headless=true -Dfile.encoding=UTF-8 -XX:+HeapDumpOnOutOfMemoryError -cp /usr/share/logstash/logstash-core/lib/jars/animal-sniffer-annotations-1.14.jar:/usr/share/logstash/logstash-core/lib/jars/commons-codec-1.11.jar:/usr/share/logstash/logstash-core/lib/jars/commons-compiler-3.0.8.jar:/usr/share/logstash/logstash-core/lib/jars/error_prone_annotations-2.0.18.jar:/usr/share/logstash/logstash-core/lib/jars/google-java-format-1.1.jar:/usr/share/logstash/logstash-core/lib/jars/gradle-license-report-0.7.1.jar:/usr/share/logstash/logstash-core/lib/jars/guava-22.0.jar:/usr/share/logstash/logstash-core/lib/jars/j2objc-annotations-1.1.jar:/usr/share/logstash/logstash-core/lib/jars/jackson-annotations-2.9.5.jar:/usr/share/logstash/logstash-core/lib/jars/jackson-core-2.9.5.jar:/usr/share/logstash/logstash-core/lib/jars/jackson-databind-2.9.5.jar:/usr/share/logstash/logstash-core/lib/jars/jackson-dataformat-cbor-2.9.5.jar:/usr/share/logstash/logstash-core/lib/jars/janino-3.0.8.jar:/usr/share/logstash/logstash-core/lib/jars/javassist-3.22.0-GA.jar:/usr/share/logstash/logstash-core/lib/jars/jruby-complete-9.1.13.0.jar:/usr/share/logstash/logstash-core/lib/jars/jsr305-1.3.9.jar:/usr/share/logstash/logstash-core/lib/jars/log4j-api-2.9.1.jar:/usr/share/logstash/logstash-core/lib/jars/log4j-core-2.9.1.jar:/usr/share/logstash/logstash-core/lib/jars/log4j-slf4j-impl-2.9.1.jar:/usr/share/logstash/logstash-core/lib/jars/logstash-core.jar:/usr/share/logstash/logstash-core/lib/jars/org.eclipse.core.commands-3.6.0.jar:/usr/share/logstash/logstash-core/lib/jars/org.eclipse.core.contenttype-3.4.100.jar:/usr/share/logstash/logstash-core/lib/jars/org.eclipse.core.expressions-3.4.300.jar:/usr/share/logstash/logstash-core/lib/jars/org.eclipse.core.filesystem-1.3.100.jar:/usr/share/logstash/logstash-core/lib/jars/org.eclipse.core.jobs-3.5.100.jar:/usr/share/logstash/logstash-core/lib/jars/org.eclipse.core.resources-3.7.100.jar:/usr/share/logstash/logstash-core/lib/jars/org.eclipse.core.runtime-3.7.0.jar:/usr/share/logstash/logstash-core/lib/jars/org.eclipse.equinox.app-1.3.100.jar:/usr/share/logstash/logstash-core/lib/jars/org.eclipse.equinox.common-3.6.0.jar:/usr/share/logstash/logstash-core/lib/jars/org.eclipse.equinox.preferences-3.4.1.jar:/usr/share/logstash/logstash-core/lib/jars/org.eclipse.equinox.registry-3.5.101.jar:/usr/share/logstash/logstash-core/lib/jars/org.eclipse.jdt.core-3.10.0.jar:/usr/share/logstash/logstash-core/lib/jars/org.eclipse.osgi-3.7.1.jar:/usr/share/logstash/logstash-core/lib/jars/org.eclipse.text-3.5.101.jar:/usr/share/logstash/logstash-core/lib/jars/slf4j-api-1.7.25.jar org.logstash.Logstash --path.settings /etc/logstash
Is it necessary to post the filters and outputs as well? Output to different ES nodes is done based on tags set by Filebeat.
logstash.yml settings:
path.data: /var/lib/logstash
pipeline.workers: 16
pipeline.batch.size: 200
config.support_escapes: true
queue.type: persisted
log.level: info
path.logs: /var/log/logstash
I tried creating a second pipeline, but that did not help, so for the sake of debugging I reverted to only port 5044.
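Since queue.type is persisted, another thing I can look at when the flow drops is whether the persistent queues on the Logstash nodes are filling up. The node stats API on the default monitoring port 9600 reports per-pipeline event and queue counters, and the on-disk queue sits under path.data/queue by default:

curl -s 'http://localhost:9600/_node/stats/pipelines?pretty'
du -sh /var/lib/logstash/queue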
FILEBEAT:
Settings:
filebeat.inputs:
- type: log
  enabled: true
  paths:
    - /product/AGL/agl-core/logs/agl.log
  exclude_files: ['\.gz$']
  multiline.pattern: '^ts:'
  multiline.negate: true
  multiline.match: after
  tags: ["avs6", "api-log", "apigateway", "asd"]
  ignore_older: 336h
  close_inactive: 5m
  close_removed: true
  clean_removed: true
  harvester_limit: 0
filebeat.config.modules:
  enabled: false
processors:
- drop_fields:
    fields: ["host"]
fields:
  environment: production
queue.mem:
  events: 4096
  flush.min_events: 1024
  flush.timeout: 10s
output.logstash:
  enabled: true
  hosts: ["logstash1:5044","logstash2:5044","logstash3:5044"]
  loadbalance: true
  timeout: 5m
  bulk_max_size: 2048
  slow_start: true
logging:
  level: info
  metrics:
    enabled: false
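To see from the Filebeat side whether the Logstash output is applying back-pressure, the periodic metrics logging that is disabled above could be switched back on, for example:

logging:
  level: info
  metrics:
    enabled: true
    period: 30s

That makes Filebeat log its internal counters (events published and acknowledged, among others) every period, which should show whether events stop being acknowledged when the flow dries up.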
The logs in question grow by 8 to 10 GB a day per server.
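Rough throughput: 32 shippers × 8-10 GB a day is roughly 250-320 GB of raw log per day, which averages out to about 3-4 MB/s spread over the three Logstash nodes (peaks will obviously be higher).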
What am I missing?