Elasticsearch killed by oom-killer

Elasticsearch version: 6.2.3
System:

[root@my-host-name]# uname -s -r -v -m -p -i -o
Linux 5.4.8-1.el7.elrepo.x86_64 #1 SMP Sat Jan 4 15:29:03 EST 2020 x86_64 x86_64 x86_64 GNU/Linux

Error message in /var/log/messages:


Jun  1 17:48:09 my-host-name kernel: java invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0
Jun  1 17:48:09 my-host-name kernel: CPU: 1 PID: 45239 Comm: java Tainted: P           O      5.4.8-1.el7.elrepo.x86_64 #1
Jun  1 17:48:09 my-host-name kernel: Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.1-0-g0551a4be2c-prebuilt.qemu-project.org 04/01/2014
Jun  1 17:48:09 my-host-name kernel: Call Trace:
Jun  1 17:48:09 my-host-name kernel: dump_stack+0x6d/0x98
Jun  1 17:48:09 my-host-name kernel: dump_header+0x51/0x210
Jun  1 17:48:09 my-host-name kernel: oom_kill_process+0x105/0x130
Jun  1 17:48:09 my-host-name kernel: out_of_memory+0x105/0x4c0
Jun  1 17:48:09 my-host-name kernel: __alloc_pages_slowpath+0x876/0xb15
Jun  1 17:48:09 my-host-name kernel: __alloc_pages_nodemask+0x2ee/0x320
Jun  1 17:48:09 my-host-name kernel: alloc_pages_current+0x6a/0xb0
Jun  1 17:48:09 my-host-name kernel: __page_cache_alloc+0x73/0x90
Jun  1 17:48:09 my-host-name kernel: pagecache_get_page+0x59/0x280
Jun  1 17:48:09 my-host-name kernel: filemap_fault+0x5c0/0x8c0
Jun  1 17:48:09 my-host-name kernel: ? futex_wait+0x183/0x260
Jun  1 17:48:09 my-host-name kernel: ? xas_load+0xd/0x80
Jun  1 17:48:09 my-host-name kernel: ? xas_find+0x138/0x1c0
Jun  1 17:48:09 my-host-name kernel: ? filemap_map_pages+0xe9/0x3d0
Jun  1 17:48:09 my-host-name kernel: ext4_filemap_fault+0x31/0x50 [ext4]
Jun  1 17:48:09 my-host-name kernel: __do_fault+0x3e/0xb0
Jun  1 17:48:09 my-host-name kernel: __handle_mm_fault+0xbcc/0xe80
Jun  1 17:48:09 my-host-name kernel: handle_mm_fault+0xea/0x200
Jun  1 17:48:09 my-host-name kernel: __do_page_fault+0x225/0x490
Jun  1 17:48:09 my-host-name kernel: do_page_fault+0x36/0x120
Jun  1 17:48:09 my-host-name kernel: do_async_page_fault+0x26/0xd0
Jun  1 17:48:09 my-host-name kernel: ? do_syscall_64+0x185/0x1c0
Jun  1 17:48:09 my-host-name kernel: async_page_fault+0x3e/0x50
Jun  1 17:48:09 my-host-name kernel: RIP: 0033:0x7fd55f51e145
Jun  1 17:48:09 my-host-name kernel: Code: Bad RIP value.
Jun  1 17:48:09 my-host-name kernel: RSP: 002b:00007fd55cf0ed80 EFLAGS: 00010246
Jun  1 17:48:09 my-host-name kernel: RAX: 0000000000000000 RBX: 00007fd558033a00 RCX: 00007fd5604355ea
Jun  1 17:48:09 my-host-name kernel: RDX: 00007fd558033a28 RSI: 0000000000000000 RDI: 00007fd558033a28
Jun  1 17:48:09 my-host-name kernel: RBP: 00007fd55cf0edb0 R08: 0000000000000000 R09: 0000000000000001
Jun  1 17:48:09 my-host-name kernel: R10: 0000000000000000 R11: 0000000000000206 R12: 00007fd558032800
Jun  1 17:48:09 my-host-name kernel: R13: 00007fd55fc1bfec R14: 00007fd558007f50 R15: 00007fd558007f50
Jun  1 17:48:09 my-host-name kernel: Mem-Info:
Jun  1 17:48:09 my-host-name kernel: active_anon:7590615 inactive_anon:446533 isolated_anon:0 active_file:187 inactive_file:115 isolated_file:0 unevictable:0 dirty:1 writeback:0 unstable:0 slab_reclaimable:63908 slab_unreclaimable:10475 mapped:32351 shmem:426512 pagetables:32524 bounce:0 free:49792 free_pcp:1154 free_cma:0
Jun  1 17:48:09 my-host-name kernel: Node 0 active_anon:30362460kB inactive_anon:1786132kB active_file:748kB inactive_file:460kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:129404kB dirty:4kB writeback:0kB shmem:1706048kB shmem_thp: 0kB shmem_pmdmapped: 0kB anon_thp: 18085888kB writeback_tmp:0kB unstable:0kB all_unreclaimable? no
Jun  1 17:48:09 my-host-name kernel: Node 0 DMA free:15876kB min:32kB low:44kB high:56kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:15992kB managed:15908kB mlocked:0kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB
Jun  1 17:48:09 my-host-name kernel: lowmem_reserve[]: 0 2944 32044 32044 32044
Jun  1 17:48:09 my-host-name kernel: Node 0 DMA32 free:122244kB min:6204kB low:9216kB high:12228kB active_anon:2675556kB inactive_anon:123984kB active_file:296kB inactive_file:0kB unevictable:0kB writepending:0kB present:3129180kB managed:3041652kB mlocked:0kB kernel_stack:160kB pagetables:15472kB bounce:0kB free_pcp:1512kB local_pcp:208kB free_cma:0kB
Jun  1 17:48:09 my-host-name kernel: lowmem_reserve[]: 0 0 29100 29100 29100
Jun  1 17:48:09 my-host-name kernel: Node 0 Normal free:61048kB min:61340kB low:91136kB high:120932kB active_anon:27686904kB inactive_anon:1662148kB active_file:688kB inactive_file:0kB unevictable:0kB writepending:4kB present:30408704kB managed:29798500kB mlocked:0kB kernel_stack:7184kB pagetables:114624kB bounce:0kB free_pcp:3104kB local_pcp:1192kB free_cma:0kB
Jun  1 17:48:09 my-host-name kernel: lowmem_reserve[]: 0 0 0 0 0
Jun  1 17:48:09 my-host-name kernel: Node 0 DMA: 1*4kB (U) 0*8kB 0*16kB 0*32kB 2*64kB (U) 1*128kB (U) 1*256kB (U) 0*512kB 1*1024kB (U) 1*2048kB (M) 3*4096kB (M) = 15876kB
Jun  1 17:48:09 my-host-name kernel: Node 0 DMA32: 176*4kB (UME) 489*8kB (UME) 461*16kB (UME) 1132*32kB (UME) 376*64kB (UME) 120*128kB (UME) 81*256kB (UE) 20*512kB (ME) 4*1024kB (UE) 0*2048kB 0*4096kB = 122712kB
Jun  1 17:48:09 my-host-name kernel: Node 0 Normal: 903*4kB (UME) 909*8kB (UME) 510*16kB (UME) 951*32kB (UME) 164*64kB (ME) 12*128kB (ME) 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 61508kB
Jun  1 17:48:09 my-host-name kernel: Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
Jun  1 17:48:09 my-host-name kernel: 426870 total pagecache pages
Jun  1 17:48:09 my-host-name kernel: 0 pages in swap cache
Jun  1 17:48:09 my-host-name kernel: Swap cache stats: add 0, delete 0, find 0/0
Jun  1 17:48:09 my-host-name kernel: Free swap  = 16777212kB
Jun  1 17:48:09 my-host-name kernel: Total swap = 16777212kB
Jun  1 17:48:09 my-host-name kernel: 8388469 pages RAM
Jun  1 17:48:09 my-host-name kernel: 0 pages HighMem/MovableOnly
Jun  1 17:48:09 my-host-name kernel: 174454 pages reserved
Jun  1 17:48:09 my-host-name kernel: 0 pages hwpoisoned
Jun  1 17:48:09 my-host-name kernel: Tasks state (memory values in pages):
Jun  1 17:48:09 my-host-name kernel: [  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name
Jun  1 17:48:09 my-host-name kernel: [   2897]     0  2897    74771    41584   638976        0             0 systemd-journal
Jun  1 17:48:09 my-host-name kernel: [   2925]     0  2925    47594       95   147456        0             0 lvmetad
Jun  1 17:48:09 my-host-name kernel: [   2932]     0  2932    11300      156   118784        0         -1000 systemd-udevd
Jun  1 17:48:09 my-host-name kernel: [   5441]     0  5441    13884      112   131072        0         -1000 auditd
Jun  1 17:48:09 my-host-name kernel: [   5467]     0  5467     6597       82    98304        0             0 systemd-logind
Jun  1 17:48:09 my-host-name kernel: [   5469]    81  5469    14528      141   159744        0          -900 dbus-daemon
Jun  1 17:48:09 my-host-name kernel: [   5503]    32  5503    17336      162   172032        0             0 rpcbind
Jun  1 17:48:09 my-host-name kernel: [   5540]     0  5540     4106       83    73728        0             0 qemu-ga
Jun  1 17:48:09 my-host-name kernel: [   5541]     0  5541     5411       81    86016        0             0 irqbalance
Jun  1 17:48:09 my-host-name kernel: [   5555]   999  5555   153093     1924   290816        0             0 polkitd
Jun  1 17:48:09 my-host-name kernel: [   5631]    38  5631     6434      146    90112        0             0 ntpd
Jun  1 17:48:09 my-host-name kernel: [   5790]     0  5790    48804      117   172032        0             0 gssproxy
Jun  1 17:48:09 my-host-name kernel: [   5798]     0  5798    31608      167   102400        0             0 crond
Jun  1 17:48:09 my-host-name kernel: [   5804]     0  5804    27551       34    65536        0             0 agetty
Jun  1 17:48:09 my-host-name kernel: [   5871]  1001  5871    28238     2556   110592        0             0 elasticsearch_e
Jun  1 17:48:09 my-host-name kernel: [   5873]  1001  5873    28773     3136   102400        0             0 node_exporter
Jun  1 17:48:09 my-host-name kernel: [   5875]     0  5875   143510     2774   425984        0             0 tuned
Jun  1 17:48:09 my-host-name kernel: [   5882]     0  5882   210409    69648  1216512        0             0 rsyslogd
Jun  1 17:48:09 my-host-name kernel: [  46675]     0 46675     6263       55    94208        0             0 xinetd
Jun  1 17:48:09 my-host-name kernel: [  46773]     0 46773    12927      165   135168        0         -1000 sshd
Jun  1 17:48:09 my-host-name kernel: [  40998]  1000 40998 20627928  4517584 104001536        0             0 java
Jun  1 17:48:09 my-host-name kernel: [  44596]     0 44596    29490      265   270336        0             0 sshd
Jun  1 17:48:09 my-host-name kernel: [  44599]  1000 44599    29555      311   266240        0             0 sshd
Jun  1 17:48:09 my-host-name kernel: [  44600]  1000 44600    28888       70    81920        0             0 bash
Jun  1 17:48:09 my-host-name kernel: [  44672]     0 44672    60322      291   315392        0             0 sudo
Jun  1 17:48:09 my-host-name kernel: [  44673]     0 44673    47971      146   233472        0             0 su
Jun  1 17:48:09 my-host-name kernel: [  44675]     0 44675    28921      131    77824        0             0 bash
Jun  1 17:48:09 my-host-name kernel: [  45184]     0 45184  4645951  3033311 24535040        0             0 java
Jun  1 17:48:09 my-host-name kernel: oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),cpuset=/,mems_allowed=0,global_oom,task_memcg=/,task=java,pid=40998,uid=1000
Jun  1 17:48:09 my-host-name kernel: Out of memory: Killed process 40998 (java) total-vm:82511712kB, anon-rss:18070336kB, file-rss:0kB, shmem-rss:0kB, UID:1000 pgtables:104001536kB oom_score_adj:0
Jun  1 17:48:09 my-host-name kernel: oom_reaper: reaped process 40998 (java), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
Jun  1 17:48:09 my-host-name systemd: elasticsearch6.service: main process exited, code=killed, status=9/KILL
Jun  1 17:48:10 my-host-name kill: Usage:
Jun  1 17:48:10 my-host-name kill: kill [options] <pid|name> [...]
Jun  1 17:48:10 my-host-name kill: Options:
Jun  1 17:48:10 my-host-name kill: -a, --all              do not restrict the name-to-pid conversion to processes
Jun  1 17:48:10 my-host-name kill: with the same uid as the present process
Jun  1 17:48:10 my-host-name systemd: elasticsearch6.service: control process exited, code=exited status=1
Jun  1 17:48:10 my-host-name kill: -s, --signal <sig>     send specified signal
Jun  1 17:48:10 my-host-name kill: -q, --queue <sig>      use sigqueue(2) rather than kill(2)
Jun  1 17:48:10 my-host-name kill: -p, --pid              print pids without signaling them
Jun  1 17:48:10 my-host-name kill: -l, --list [=<signal>] list signal names, or convert one to a name
Jun  1 17:48:10 my-host-name kill: -L, --table            list signal names and numbers
Jun  1 17:48:10 my-host-name kill: -h, --help     display this help and exit
Jun  1 17:48:10 my-host-name kill: -V, --version  output version information and exit
Jun  1 17:48:10 my-host-name kill: For more details see kill(1).
Jun  1 17:48:10 my-host-name systemd: Unit elasticsearch6.service entered failed state.
Jun  1 17:48:10 my-host-name systemd: elasticsearch6.service failed.
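
From the task dump above, two large java processes were running when the kill happened: PID 40998 (this Elasticsearch node, roughly 17 GB resident) and PID 45184 (roughly 11.6 GB resident). Together they come close to the host's ~32 GB of RAM, which is why the page cache had been squeezed down to almost nothing. A quick way to re-check this on a live system (a minimal sketch for illustration only):

# List java processes by resident memory, then sum their RSS (ps reports RSS in KiB)
ps -C java -o pid,rss,cmd --sort=-rss
ps -C java -o rss= | awk '{total+=$1} END {printf "total java RSS: %.1f GiB\n", total/1024/1024}'

If that total plus kernel and other overhead exceeds physical memory, the oom-killer will pick the largest process no matter how much swap is still free.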

I also found some useful information in this topic: Elasticsearch 8: new OOM kills in comparison with ES7, by @DavidTurner.

I am confused: is there any relation to swap here?
In my case, swap is enabled:

swapon -s -h
Filename                                Type            Size    Used    Priority
/dev/dm-1                               partition       16777212        0       -2
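
One thing that stands out in the kernel report: swap was completely unused at the moment of the kill (Free swap equals Total swap, about 16 GB), even though anonymous memory had filled RAM. Two quick checks for why the kernel is not swapping (a minimal sketch; x.x.x.x:9210 stands in for the node's HTTP address):

# How willing the kernel is to swap, and the overcommit policy
sysctl vm.swappiness vm.overcommit_memory

# Whether the Elasticsearch process has locked its memory (bootstrap.memory_lock)
curl -s 'http://x.x.x.x:9210/_nodes?filter_path=**.mlockall&pretty'

If vm.swappiness is 0 or the JVM's memory is locked, having swap enabled will not necessarily prevent the oom-killer from firing.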


Welcome to our community! :smiley:

What do your Elasticsearch logs show?
What is your config?

Please note that this version is EOL and no longer supported; you should be looking to upgrade as a matter of urgency.

The logs show nothing useful, just some restart messages.
Here is my config:

cluster.name: search_es6
path.data: /opt/es6/data
path.logs: /opt/es6/logs
network.host: x.x.x.x
http.port: 9210
transport.tcp.port: 9310
discovery.zen.ping.unicast.hosts: [xxxxx]
discovery.zen.minimum_master_nodes: 3
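
As a sanity check for discovery.zen.minimum_master_nodes: 3, this command (a minimal sketch; the host and port are the ones from the config above) shows how many master-eligible nodes the cluster currently sees and their memory pressure:

# Node list with roles, elected master, and heap/RAM usage
curl -s 'http://x.x.x.x:9210/_cat/nodes?v&h=ip,node.role,master,heap.percent,ram.percent'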

In addition, here is the JVM info:

PS: The cluster has been running for almost a year; only this one node was killed by the oom-killer, and the other nodes run fine.

What is in your logs may still be important and it would be useful if you shared them.

Thanks @warkolm, here is the log from that moment. As I said before, there is nothing useful in it: the first line of the log is from 17:48:03, the oom-killer entry appears at 17:48:09, and after that I restarted the node manually.

## Added by Andy: a dictionary plugin fetching a file from a remote server (you can ignore this message)

[2023-06-01T17:48:03,320][ERROR][c.h.d.RemoteMonitor      ] remote dict org.apache.http.conn.HttpHostConnectException: Connect to x.x.x:443 [x.x.x.com/x.x.x.x] failed: Connection timed out (Connection timed out) error!

## Added by Andy: restarting the node manually

[2023-06-01T17:51:03,174][INFO ][o.e.n.Node               ] [x.x.x.x] initializing ...
[2023-06-01T17:51:03,323][INFO ][o.e.e.NodeEnvironment    ] [x.x.x.x] using [1] data paths, mounts [[/opt (/dev/vdc1)]], net usable_space [862.9gb], net total_space [984.1gb], types [ext4]
[2023-06-01T17:51:03,324][INFO ][o.e.e.NodeEnvironment    ] [x.x.x.x] heap size [16gb], compressed ordinary object pointers [true]
[2023-06-01T17:51:04,898][INFO ][o.e.n.Node               ] [x.x.x.x] node name [x.x.x.x], node ID [OpgcpcfjRDi3uIQ__FUhhw]
[2023-06-01T17:51:04,899][INFO ][o.e.n.Node               ] [x.x.x.x] version[6.2.3], pid[45302], build[c59ff00/2018-03-13T10:06:29.741383Z], OS[Linux/5.4.8-1.el7.elrepo.x86_64/amd64], JVM[Oracle Corporation/Java HotSpot(TM) 64-Bit Server VM/1.8.0_231/25.231-b11]
[2023-06-01T17:51:04,899][INFO ][o.e.n.Node               ] [x.x.x.x] JVM arguments [-Xms16g, -Xmx16g, -XX:+UnlockExperimentalVMOptions, -XX:+UseG1GC, -XX:G1NewSizePercent=5, -XX:G1MaxNewSizePercent=30, -XX:MaxGCPauseMillis=300, -XX:G1ReservePercent=25, -XX:InitiatingHeapOccupancyPercent=30, -XX:+AlwaysPreTouch, -Xss1m, -Djava.awt.headless=true, -Dfile.encoding=UTF-8, -Djna.nosys=true, -XX:-OmitStackTraceInFastThrow, -Dio.netty.noUnsafe=true, -Dio.netty.noKeySetOptimization=true, -Dio.netty.recycler.maxCapacityPerThread=0, -Dlog4j.shutdownHookEnabled=false, -Dlog4j2.disable.jmx=true, -Djava.io.tmpdir=/tmp/elasticsearch.FRCodhhp, -XX:+HeapDumpOnOutOfMemoryError, -XX:+PrintGCDetails, -XX:+PrintGCDateStamps, -XX:+PrintTenuringDistribution, -XX:+PrintGCApplicationStoppedTime, -Xloggc:logs/gc.log, -XX:+UseGCLogFileRotation, -XX:NumberOfGCLogFiles=32, -XX:GCLogFileSize=64m, -Des.path.home=/opt/es6/elasticsearch, -Des.path.conf=/opt/es6/elasticsearch/config]
[2023-06-01T17:51:06,546][INFO ][o.e.p.p.PrometheusExporterPlugin] starting Prometheus exporter plugin
[2023-06-01T17:51:06,912][INFO ][o.e.p.PluginsService     ] [x.x.x.x] loaded module [aggs-matrix-stats]
[2023-06-01T17:51:06,912][INFO ][o.e.p.PluginsService     ] [x.x.x.x] loaded module [analysis-common]
[2023-06-01T17:51:06,912][INFO ][o.e.p.PluginsService     ] [x.x.x.x] loaded module [ingest-common]
[2023-06-01T17:51:06,912][INFO ][o.e.p.PluginsService     ] [x.x.x.x] loaded module [lang-expression]
[2023-06-01T17:51:06,912][INFO ][o.e.p.PluginsService     ] [x.x.x.x] loaded module [lang-mustache]
[2023-06-01T17:51:06,912][INFO ][o.e.p.PluginsService     ] [x.x.x.x] loaded module [lang-painless]
[2023-06-01T17:51:06,913][INFO ][o.e.p.PluginsService     ] [x.x.x.x] loaded module [mapper-extras]
[2023-06-01T17:51:06,913][INFO ][o.e.p.PluginsService     ] [x.x.x.x] loaded module [parent-join]
[2023-06-01T17:51:06,913][INFO ][o.e.p.PluginsService     ] [x.x.x.x] loaded module [percolator]
[2023-06-01T17:51:06,914][INFO ][o.e.p.PluginsService     ] [x.x.x.x] loaded module [rank-eval]
[2023-06-01T17:51:06,914][INFO ][o.e.p.PluginsService     ] [x.x.x.x] loaded module [reindex]
[2023-06-01T17:51:06,914][INFO ][o.e.p.PluginsService     ] [x.x.x.x] loaded module [repository-url]
[2023-06-01T17:51:06,914][INFO ][o.e.p.PluginsService     ] [x.x.x.x] loaded module [transport-netty4]
[2023-06-01T17:51:06,914][INFO ][o.e.p.PluginsService     ] [x.x.x.x] loaded module [tribe]
[2023-06-01T17:51:06,914][INFO ][o.e.p.PluginsService     ] [x.x.x.x] loaded plugin [analysis-hanlp]
[2023-06-01T17:51:06,914][INFO ][o.e.p.PluginsService     ] [x.x.x.x] loaded plugin [analysis-ik]
[2023-06-01T17:51:06,915][INFO ][o.e.p.PluginsService     ] [x.x.x.x] loaded plugin [prometheus-exporter]
[2023-06-01T17:51:09,697][INFO ][o.e.d.DiscoveryModule    ] [x.x.x.x] using discovery type [zen]
[2023-06-01T17:51:10,232][INFO ][o.e.n.Node               ] [x.x.x.x] initialized
[2023-06-01T17:51:10,232][INFO ][o.e.n.Node               ] [x.x.x.x] starting ...
[2023-06-01T17:51:10,415][INFO ][o.e.t.TransportService   ] [x.x.x.x] publish_address {x.x.x.x:9310}, bound_addresses {x.x.x.x:9310}
[2023-06-01T17:51:10,479][INFO ][o.e.b.BootstrapChecks    ] [x.x.x.x] bound or publishing to a non-loopback address, enforcing bootstrap checks
[2023-06-01T17:51:13,768][INFO ][o.e.c.s.ClusterApplierService] [x.x.x.x] detected_master {1.2.3.6}{I6LUhVFDR2yP2De2nyEugQ}{uDV540TEQx2NiM-2Mx46Rg}{1.2.3.6}{1.2.3.6:9310}, added {{1.2.3.4}{1GrtPsrkSAW5zl8smoduaw}{bg4cAGxNQdCWpDMVMa7d5g}{1.2.3.4}{1.2.3.4:9310},{1.2.3.6}{I6LUhVFDR2yP2De2nyEugQ}{uDV540TEQx2NiM-2Mx46Rg}{1.2.3.6}{1.2.3.6:9310},{10.4.101.179}{j7gpcqqLT9ei3cnlIAL0Ag}{xT9NITwHTZaW3L8IxwnnVA}{10.4.101.179}{10.4.101.179:9310},{1.2.3.5}{KdafAxKGTZuoSCwZTIz2Uw}{bfGKg8GsR3Gt8TtTo3JYKQ}{1.2.3.5}{1.2.3.5:9310},}, reason: apply cluster state (from master [master {1.2.3.6}{I6LUhVFDR2yP2De2nyEugQ}{uDV540TEQx2NiM-2Mx46Rg}{1.2.3.6}{1.2.3.6:9310} committed version [789145]])
[2023-06-01T17:51:13,813][INFO ][o.e.c.s.ClusterSettings  ] [x.x.x.x] updating [cluster.routing.allocation.cluster_concurrent_rebalance] from [2] to [8]
[2023-06-01T17:51:13,813][INFO ][o.e.c.s.ClusterSettings  ] [x.x.x.x] updating [cluster.routing.allocation.node_concurrent_incoming_recoveries] from [2] to [8]
[2023-06-01T17:51:13,813][INFO ][o.e.c.s.ClusterSettings  ] [x.x.x.x] updating [cluster.routing.allocation.node_concurrent_outgoing_recoveries] from [2] to [8]
[2023-06-01T17:51:13,814][INFO ][o.e.c.s.ClusterSettings  ] [x.x.x.x] updating [action.auto_create_index] from [true] to [es_monitor_*,.*]
[2023-06-01T17:51:13,814][INFO ][o.e.c.s.ClusterSettings  ] [x.x.x.x] updating [search.max_buckets] from [-1] to [65536]
[2023-06-01T17:51:14,821][INFO ][o.e.h.n.Netty4HttpServerTransport] [x.x.x.x] publish_address {x.x.x.x:9210}, bound_addresses {x.x.x.x:9210}
[2023-06-01T17:51:14,821][INFO ][o.e.n.Node               ] [x.x.x.x] started
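
Given the JVM arguments above (-Xms16g, -Xmx16g), the Elasticsearch heap alone takes roughly half of the host's ~32 GB of RAM, and the earlier task dump shows a second java process (PID 45184) using another ~11 GB, which leaves very little headroom for off-heap memory and the filesystem cache. A quick comparison of the configured heap against physical memory (a minimal sketch, assuming jvm.options lives under the config path shown in the JVM arguments):

# Configured heap size vs. memory available on the host
grep -E '^-Xm[sx]' /opt/es6/elasticsearch/config/jvm.options
free -g

The usual guidance is to keep the heap at or below half of physical RAM and to avoid running another large JVM on the same box.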

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.