elasticsearch cluster: 11 nodes (8 CPU/62GB RAM, i3.2xlarge instances in AWS) version 8.13.2
kibana: 1 node (4CPU, 8GB RAM) c5.xlarge instance in AWS, version 8.13.2
apm-server: 1 node (8CPU, 16GB RAM), c5.2xlarge instance in AWS, version 8.13.2
Agent:
APM jar version 1.50.0
java version 21
Contents of /etc/apm-server/apm-server.yml:
######################### APM Server Configuration #########################
################################ APM Server ################################
apm-server:
  # Everything in this section must be indented under `apm-server:`. At column 0
  # these keys would be read as unrelated top-level settings, and the repeated
  # `host` / `enabled` keys would collide with each other.

  # Defines the host and port the server is listening on. Use "unix:/path/to.sock" to listen on a unix domain socket.
  host: "IP_APM_SERVER_INSTANCE:8200"

  # Maximum permitted size in bytes of a request's header accepted by the server to be processed.
  max_header_size: 1048576

  # Maximum amount of time to wait for the next incoming request before underlying connection is closed.
  idle_timeout: 90s

  # Maximum permitted duration for reading an entire request.
  read_timeout: 300s

  # Maximum permitted duration for writing a response.
  write_timeout: 300s

  # Maximum permitted size in bytes of an event accepted by the server to be processed.
  max_event_size: 1457600

  #---------------------------- APM Server - RUM Real User Monitoring ----------------------------
  rum:
    enabled: true
    event_rate:
      limit: 300
      lru_size: 1000
    allow_origins: ['*']

  #---------------------------- APM Server - Agent Configuration ----------------------------
  kibana:
    # For APM Agent configuration in Kibana, enabled must be true.
    enabled: true
    host: "IP_KIBANA_INSTANCE:5601"
#================================= General =================================
queue:
  # Queue type by name (default 'mem').
  # `mem` and `events` are nested so the setting resolves to `queue.mem.events`.
  mem:
    # Max number of events the queue can buffer.
    events: 100000
#-------------------------- Elasticsearch output --------------------------
output.elasticsearch:
  # All options below belong to the Elasticsearch output and must be indented
  # under `output.elasticsearch:`; at column 0 they are separate top-level keys.

  # Array of hosts to connect to.
  # Scheme and port can be left out and will be set to the default (`http` and `9200`).
  # In case you specify an additional path, the scheme is required: `http://localhost:9200/path`.
  # IPv6 addresses should always be defined as: `https://[2001:db8::1]:9200`.
  hosts: ["DNS_OF_THE_ELASTICSEARCH_ALB:9200"]

  # Boolean flag to enable or disable the output module.
  enabled: true

  # Number of workers per Elasticsearch host.
  worker: 8

  # The maximum number of events to bulk in a single Elasticsearch bulk API index request.
  # The default is 50.
  # NOTE(review): 80000 is far above the default of 50 stated above. Confirm that
  # bulk requests this large are intended, and that this APM Server version still
  # honors `worker` and `bulk_max_size` for the Elasticsearch output.
  bulk_max_size: 80000

  # Configure http request timeout before failing a request to Elasticsearch.
  timeout: 300
#================================= Paths ==================================
# Home path: base directory of the apm-server installation. Every other path
# setting, and the miscellaneous files shipped with the distribution, default
# relative to it. When neither a CLI flag nor this file sets it, the location
# of the binary is used.
path.home: /usr/share/apm-server

# Configuration path: base directory for configuration files, including the
# main YAML configuration file and the Elasticsearch template file. Falls back
# to the home path when not set by a CLI flag or in this file.
path.config: /etc/apm-server

# Data path: base directory for all files in which apm-server stores its data.
# Falls back to a data subdirectory inside the home path when not set by a CLI
# flag or in this file.
path.data: /var/lib/apm-server

# Logs path: directory for apm-server logs. Falls back to a logs subdirectory
# inside the home path when not set by a CLI flag or in this file.
path.logs: /var/log/apm-server
#================================= Logging =================================
# There are three options for the log output: syslog, file, and stderr.
# Windows systems default to file output. All other systems default to syslog.

# Sets the minimum log level. The default log level is info.
# Available log levels are: error, warning, info, or debug.
logging.level: info

# Enable debug output for selected components. To enable all selectors use ["*"].
# Other available selectors are "beat", "publish", or "service".
# Multiple selectors can be chained.
#logging.selectors: [ ]

# Send all logging output to syslog. The default is false.
#logging.to_syslog: true

# If enabled, apm-server periodically logs its internal metrics that have changed
# in the last period. For each metric that changed, the delta from the value at
# the beginning of the period is logged. Also, the total values for
# all non-zero internal metrics are logged on shutdown. The default is false.
#logging.metrics.enabled: false

# The period after which to log the internal metrics. The default is 30s.
#logging.metrics.period: 30s

# Logging to rotating files. When true, writes all logging output to files.
# The log files are automatically rotated when the log file size limit is reached.
logging.to_files: true
logging.files:
  # The file options below must be indented under `logging.files:`; at column 0
  # they would be parsed as unrelated top-level keys.

  # Configure the path where the logs are written. The default is the logs directory
  # under the home path (the binary location).
  path: /var/log/apm-server

  # The name of the files where the logs are written to.
  name: apm-server

  # Configure log file size limit. If limit is reached, log file will be
  # automatically rotated.
  rotateeverybytes: 62914560 # = 60MB

  # Number of rotated log files to keep. Oldest files will be deleted first.
  keepfiles: 7

  # The permissions mask to apply when rotating log files. The default value is 0600.
  # Must be a valid Unix-style file permissions mask expressed in octal notation.
  permissions: 0600

  # Enable log file rotation on time intervals in addition to size-based rotation.
  # Intervals must be at least 1s. Values of 1m, 1h, 24h, 7*24h, 30*24h, and 365*24h
  # are boundary-aligned with minutes, hours, days, weeks, months, and years as
  # reported by the local system clock. All other intervals are calculated from the
  # Unix epoch. Defaults to disabled.
  #interval: 0

# Set to true to log messages in json format.
logging.json: true

# Set to true, to log messages with minimal required Elastic Common Schema (ECS)
# information. Recommended to use in combination with `logging.json=true`.
#logging.ecs: true
With these settings, when a customer sends events, they see the following in the agent logs:
2024-07-04 11:36:34,056 [elastic-apm-server-reporter] INFO co.elastic.apm.agent.report.AbstractIntakeApiHandler - Backing off for 9 seconds (+/-10%)
2024-07-04 11:36:43,592 [elastic-apm-server-reporter] ERROR co.elastic.apm.agent.report.AbstractIntakeApiHandler - Error sending data to APM server: Read timed out, response code is -1
2024-07-04 11:36:43,712 [elastic-apm-server-reporter] ERROR co.elastic.apm.agent.report.AbstractIntakeApiHandler - Error sending data to APM server: Read timed out, response code is -1
On the apm-server side, I see the following in the logs:
{"log.level":"error","@timestamp":"2024-07-08T15:12:35.230Z","log.logger":"request","log.origin":{"function":"github.com/elastic/apm-server/internal/beater/api.apmMiddleware.LogMiddleware.func1.1","file.name":"middleware/log_middleware.go","file.line":59},"message":"request timed out","service.name":"apm-server","url.original":"/intake/v2/events","http.request.method":"POST","user_agent.original":"apm-agent-java/1.50.0 (identity 1.0.0)","source.address":"source_address_ip","http.request.id":"43w4242-c434-42re-bree-b9c16b5719a7432f","event.duration":15005476018,"http.request.body.bytes":2320,"http.response.status_code":503,"error.message":"request timed out","ecs.version":"1.6.0"}
{"log.level":"error","@timestamp":"2024-07-08T15:12:35.252Z","log.logger":"request","log.origin":{"function":"github.com/elastic/apm-server/internal/beater/api.apmMiddleware.LogMiddleware.func1.1","file.name":"middleware/log_middleware.go","file.line":59},"message":"request timed out","service.name":"apm-server","url.original":"/intake/v2/events","http.request.method":"POST","user_agent.original":"apm-agent-java/1.50.0 (identity 1.0.0)","source.address":"source_address_ip","http.request.id":"request_id","event.duration":15007539970,"http.request.body.bytes":2310,"http.response.status_code":503,"error.message":"request timed out","ecs.version":"1.6.0"}
{"log.level":"error","@timestamp":"2024-07-08T15:12:36.359Z","log.logger":"request","log.origin":{"function":"github.com/elastic/apm-server/internal/beater/api.apmMiddleware.LogMiddleware.func1.1","file.name":"middleware/log_middleware.go","file.line":59},"message":"request timed out","service.name":"apm-server","url.original":"/intake/v2/events","http.request.method":"POST","user_agent.original":"apm-agent-java/1.50.0 (identity 1.0.0)","source.address":"source_address_ip","http.request.id":"request_id","event.duration":15012934832,"http.request.body.bytes":3129,"http.response.status_cod
Can you please help me fix this? Thank you!