Hi All,
This is running ELK 7.7.1 on Ubuntu 18.04.4 LTS
I had created an NFS mount on my old 3-node cluster and my new 3-node cluster.
I did manage to make a snapshot from old and restore to new cluster.
However, at some point (I can't remember why) I wiped out the NFS directory (on the NFS server).
Now I cannot make any snapshot; it complains about missing files.
PUT /_snapshot/my_backup
{
"type": "fs",
"settings": {
"location": "/data/shared/"
}
}
POST /_snapshot/my_backup/_verify
{
"nodes" : {
"2geeJiNpQfu7uhLvXM1IcA" : {
"name" : "elk1"
},
"mnyp8rAxRpKZctGkP5-WPA" : {
"name" : "elk3"
},
"EcivYQmTSvWJfA6dQeq5JQ" : {
"name" : "elk2"
}
}
}
PUT /_snapshot/my_backup/snapshot_1?wait_for_completion=true
{
"indices": "lawful-*,snmp-inngate-*",
"ignore_unavailable": false,
"include_global_state": false,
"metadata": {
"taken_by": "michael",
"taken_because": "backup before upgrading"
}
}
I tried the above with "ignore_unavailable": true as well.
I keep getting this type of error:
{
"snapshot" : {
"snapshot" : "snapshot_1",
"uuid" : "iBHEBZSUR1Se5n0u4Kx5-w",
"version_id" : 7070199,
"version" : "7.7.1",
"indices" : [
"lawful-2020.07.02-000003",
"lawful-2020.06.25-000002",
"lawful-2020.06.18-000001"
],
"include_global_state" : false,
"metadata" : {
"taken_by" : "michael",
"taken_because" : "backup before upgrading"
},
"state" : "PARTIAL",
"start_time" : "2020-07-07T04:48:01.286Z",
"start_time_in_millis" : 1594097281286,
"end_time" : "2020-07-07T04:48:01.486Z",
"end_time_in_millis" : 1594097281486,
"duration_in_millis" : 200,
"failures" : [
{
"index" : "lawful-2020.06.25-000002",
"index_uuid" : "lawful-2020.06.25-000002",
"shard_id" : 0,
"reason" : """java.nio.file.NoSuchFileException: /data/shared/indices/HupkQbKOSWOGRY_SYxc8WQ/0/index-SaiQ1CepSWOLuhxgcVkOiw
at java.base/sun.nio.fs.UnixException.translateToIOException(UnixException.java:92)
at java.base/sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:111)
at java.base/sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:116)
at java.base/sun.nio.fs.UnixFileSystemProvider.newByteChannel(UnixFileSystemProvider.java:219)
at java.base/java.nio.file.Files.newByteChannel(Files.java:375)
at java.base/java.nio.file.Files.newByteChannel(Files.java:426)
at java.base/java.nio.file.spi.FileSystemProvider.newInputStream(FileSystemProvider.java:420)
at java.base/java.nio.file.Files.newInputStream(Files.java:160)
at org.elasticsearch.common.blobstore.fs.FsBlobContainer.readBlob(FsBlobContainer.java:149)
at org.elasticsearch.repositories.blobstore.ChecksumBlobStoreFormat.readBlob(ChecksumBlobStoreFormat.java:128)
at org.elasticsearch.repositories.blobstore.ChecksumBlobStoreFormat.read(ChecksumBlobStoreFormat.java:114)
at org.elasticsearch.repositories.blobstore.BlobStoreRepository.buildBlobStoreIndexShardSnapshots(BlobStoreRepository.java:2014)
at org.elasticsearch.repositories.blobstore.BlobStoreRepository.snapshotShard(BlobStoreRepository.java:1575)
at org.elasticsearch.snapshots.SnapshotShardsService.snapshot(SnapshotShardsService.java:345)
at org.elasticsearch.snapshots.SnapshotShardsService.lambda$startNewShards$1(SnapshotShardsService.java:289)
at org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingRunnable.run(ThreadContext.java:633)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1130)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:630)
at java.base/java.lang.Thread.run(Thread.java:832)
""",
"node_id" : "mnyp8rAxRpKZctGkP5-WPA",
"status" : "INTERNAL_SERVER_ERROR"
},
{
"index" : "lawful-2020.07.02-000003",
"index_uuid" : "lawful-2020.07.02-000003",
"shard_id" : 0,
"reason" : """java.nio.file.NoSuchFileException: /data/shared/indices/L7AqnmbrR--1pUHsd8Mi6g/0/index-lxKVp0rTTuKhepg7PhoZJA
at java.base/sun.nio.fs.UnixException.translateToIOException(UnixException.java:92)
at java.base/sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:111)
at java.base/sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:116)
at java.base/sun.nio.fs.UnixFileSystemProvider.newByteChannel(UnixFileSystemProvider.java:219)
at java.base/java.nio.file.Files.newByteChannel(Files.java:375)
at java.base/java.nio.file.Files.newByteChannel(Files.java:426)
at java.base/java.nio.file.spi.FileSystemProvider.newInputStream(FileSystemProvider.java:420)
at java.base/java.nio.file.Files.newInputStream(Files.java:160)
at org.elasticsearch.common.blobstore.fs.FsBlobContainer.readBlob(FsBlobContainer.java:149)
at org.elasticsearch.repositories.blobstore.ChecksumBlobStoreFormat.readBlob(ChecksumBlobStoreFormat.java:128)
at org.elasticsearch.repositories.blobstore.ChecksumBlobStoreFormat.read(ChecksumBlobStoreFormat.java:114)
at org.elasticsearch.repositories.blobstore.BlobStoreRepository.buildBlobStoreIndexShardSnapshots(BlobStoreRepository.java:2014)
at org.elasticsearch.repositories.blobstore.BlobStoreRepository.snapshotShard(BlobStoreRepository.java:1575)
at org.elasticsearch.snapshots.SnapshotShardsService.snapshot(SnapshotShardsService.java:345)
at org.elasticsearch.snapshots.SnapshotShardsService.lambda$startNewShards$1(SnapshotShardsService.java:289)
at org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingRunnable.run(ThreadContext.java:633)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1130)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:630)
at java.base/java.lang.Thread.run(Thread.java:832)
""",
"node_id" : "mnyp8rAxRpKZctGkP5-WPA",
"status" : "INTERNAL_SERVER_ERROR"
},
{
"index" : "lawful-2020.06.18-000001",
"index_uuid" : "lawful-2020.06.18-000001",
"shard_id" : 0,
"reason" : """java.nio.file.NoSuchFileException: /data/shared/indices/HdNL8pleSbm-TES4IBc47A/0/index-CnZD2MfLQ8qhL1UCHZBu7w
at java.base/sun.nio.fs.UnixException.translateToIOException(UnixException.java:92)
at java.base/sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:111)
at java.base/sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:116)
at java.base/sun.nio.fs.UnixFileSystemProvider.newByteChannel(UnixFileSystemProvider.java:219)
at java.base/java.nio.file.Files.newByteChannel(Files.java:375)
at java.base/java.nio.file.Files.newByteChannel(Files.java:426)
at java.base/java.nio.file.spi.FileSystemProvider.newInputStream(FileSystemProvider.java:420)
at java.base/java.nio.file.Files.newInputStream(Files.java:160)
at org.elasticsearch.common.blobstore.fs.FsBlobContainer.readBlob(FsBlobContainer.java:149)
at org.elasticsearch.repositories.blobstore.ChecksumBlobStoreFormat.readBlob(ChecksumBlobStoreFormat.java:128)
at org.elasticsearch.repositories.blobstore.ChecksumBlobStoreFormat.read(ChecksumBlobStoreFormat.java:114)
at org.elasticsearch.repositories.blobstore.BlobStoreRepository.buildBlobStoreIndexShardSnapshots(BlobStoreRepository.java:2014)
at org.elasticsearch.repositories.blobstore.BlobStoreRepository.snapshotShard(BlobStoreRepository.java:1575)
at org.elasticsearch.snapshots.SnapshotShardsService.snapshot(SnapshotShardsService.java:345)
at org.elasticsearch.snapshots.SnapshotShardsService.lambda$startNewShards$1(SnapshotShardsService.java:289)
at org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingRunnable.run(ThreadContext.java:633)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1130)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:630)
at java.base/java.lang.Thread.run(Thread.java:832)
""",
"node_id" : "mnyp8rAxRpKZctGkP5-WPA",
"status" : "INTERNAL_SERVER_ERROR"
}
],
"shards" : {
"total" : 3,
"failed" : 3,
"successful" : 0
}
}
}
I've configured the NFS server as follows:
/shared 10.40.0.0/22(rw,sync,no_root_squash,no_subtree_check) 10.60.1.0/24(rw,sync,no_root_squash,no_subtree_check)
I've mounted it on each ES node as follows:
10.40.3.14:/shared /data/shared nfs defaults 0 0
Which generates:
10.40.3.14:/shared on /data/shared type nfs4 (rw,relatime,vers=4.2,rsize=524288,wsize=524288,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,clientaddr=10.40.3.10,local_lock=none,addr=10.40.3.14)
So what I'm asking is: how do I clean out the snapshots so that it can work again?
The directory of the mounted system is as follows
root@elk1:/data/shared# ls -al
total 20
drwxrwxrwx 3 nobody nogroup 135 Jul 7 04:48 .
drwxr-xr-x 4 root root 60 Jul 2 09:32 ..
-rw-r--r-- 1 elasticsearch elasticsearch 1215 Jul 7 04:48 index-43
-rw-r--r-- 1 elasticsearch elasticsearch 8 Jul 7 04:48 index.latest
drwxr-xr-x 8 elasticsearch elasticsearch 186 Jul 6 04:35 indices
-rw-r--r-- 1 elasticsearch elasticsearch 217 Jul 7 04:48 meta-iBHEBZSUR1Se5n0u4Kx5-w.dat
-rw-r--r-- 1 elasticsearch elasticsearch 6317 Jul 7 04:48 snap-iBHEBZSUR1Se5n0u4Kx5-w.dat
root@elk1:/data/shared# ls -al indices/
total 24
drwxr-xr-x 8 elasticsearch elasticsearch 186 Jul 6 04:35 .
drwxrwxrwx 3 nobody nogroup 135 Jul 7 04:48 ..
drwxr-xr-x 3 elasticsearch elasticsearch 4096 Jul 7 04:33 5oCez34XSZanhsf4NTnuOA
drwxr-xr-x 3 elasticsearch elasticsearch 4096 Jul 7 04:33 BJx3XOImTF6j418JP8zqQA
drwxr-xr-x 3 elasticsearch elasticsearch 4096 Jul 7 04:48 HdNL8pleSbm-TES4IBc47A
drwxr-xr-x 3 elasticsearch elasticsearch 4096 Jul 7 04:48 HupkQbKOSWOGRY_SYxc8WQ
drwxr-xr-x 3 elasticsearch elasticsearch 4096 Jul 7 04:48 L7AqnmbrR--1pUHsd8Mi6g
drwxr-xr-x 3 elasticsearch elasticsearch 4096 Jul 7 04:33 tDrp5NfORf6nNVRv7HSaqQ
For one of the subdirectories
root@elk1:/data/shared# ls -al /data/shared/indices/HdNL8pleSbm-TES4IBc47A/0/
total 4
drwxr-xr-x 2 elasticsearch elasticsearch 6 Jul 6 04:35 .
drwxr-xr-x 3 elasticsearch elasticsearch 4096 Jul 7 04:48 ..
So the files are actually missing. I don't know why it's not creating them instead.
The file system has a lot of space.
root@elk1:/data/shared# df -h
Filesystem Size Used Avail Use% Mounted on
udev 3.9G 0 3.9G 0% /dev
tmpfs 798M 1.3M 797M 1% /run
/dev/sda2 49G 9.2G 38G 20% /
tmpfs 3.9G 53M 3.9G 2% /dev/shm
tmpfs 5.0M 0 5.0M 0% /run/lock
tmpfs 3.9G 0 3.9G 0% /sys/fs/cgroup
/dev/loop0 98M 98M 0 100% /snap/core/9289
/dev/sdb 500G 12G 489G 3% /data
/dev/loop2 97M 97M 0 100% /snap/core/9436
tmpfs 798M 0 798M 0% /run/user/1000
10.40.3.14:/shared 1.0T 1.1G 1023G 1% /data/shared
I've tried to delete the snapshot and clean out the system:
DELETE /_snapshot/my_backup/snapshot_1
{
"acknowledged" : true
}
I've run the cleanup endpoint:
POST /_snapshot/my_backup/_cleanup
{
"results" : {
"deleted_bytes" : 0,
"deleted_blobs" : 0
}
}
I've even deleted the repository config and recreated it.
DELETE _snapshot/my_backup?pretty
{
"acknowledged" : true
}
I'm kinda lost...
Thanks a lot for your advice.
Regards,
Michael