|
|
e3c68b |
From 40ac42501d6bbff7206e753e8e988beefe74f5f4 Mon Sep 17 00:00:00 2001
|
|
|
e3c68b |
From: Krutika Dhananjay <kdhananj@redhat.com>
|
|
|
e3c68b |
Date: Fri, 5 Apr 2019 10:30:23 +0530
|
|
|
e3c68b |
Subject: [PATCH 176/178] features/shard: Fix crash during background shard
|
|
|
e3c68b |
deletion in a specific case
|
|
|
e3c68b |
|
|
|
e3c68b |
Consider the following case -
|
|
|
e3c68b |
1. A file gets FALLOCATE'd such that > "shard-lru-limit" number of
|
|
|
e3c68b |
shards are created.
|
|
|
e3c68b |
2. And then it is deleted after that.
|
|
|
e3c68b |
|
|
|
e3c68b |
The unique thing about FALLOCATE is that unlike WRITE, all of the
|
|
|
e3c68b |
participant shards are resolved and created and fallocated in a single
|
|
|
e3c68b |
batch. This means, in this case, after the first "shard-lru-limit"
|
|
|
e3c68b |
number of shards are resolved and added to lru list, as part of
|
|
|
e3c68b |
resolution of the remaining shards, some of the existing shards in lru
|
|
|
e3c68b |
list will need to be evicted. So these evicted shards will be
|
|
|
e3c68b |
inode_unlink()d as part of eviction. Now once the fop gets to the actual
|
|
|
e3c68b |
FALLOCATE stage, the lru'd-out shards get added to fsync list.
|
|
|
e3c68b |
|
|
|
e3c68b |
2 things to note at this point:
|
|
|
e3c68b |
i. the lru'd out shards are only part of fsync list, so each holds 1 ref
|
|
|
e3c68b |
on base shard
|
|
|
e3c68b |
ii. and the more recently used shards are part of both fsync and lru list.
|
|
|
e3c68b |
So each of these shards holds 2 refs on base inode - one for being
|
|
|
e3c68b |
part of fsync list, and the other for being part of lru list.
|
|
|
e3c68b |
|
|
|
e3c68b |
FALLOCATE completes successfully and then this very file is deleted, and
|
|
|
e3c68b |
background shard deletion launched. Here's where the ref counts get mismatched.
|
|
|
e3c68b |
First as part of inode_resolve()s during the deletion, the lru'd-out inodes
|
|
|
e3c68b |
return NULL, because they are inode_unlink()'d by now. So these inodes need to
|
|
|
e3c68b |
be freshly looked up. But as part of linking them in lookup_cbk (precisely in
|
|
|
e3c68b |
shard_link_block_inode()), inode_link() returns the lru'd-out inode object.
|
|
|
e3c68b |
And its inode ctx is still valid and ctx->base_inode valid from the last
|
|
|
e3c68b |
time it was added to list.
|
|
|
e3c68b |
|
|
|
e3c68b |
But shard_common_lookup_shards_cbk() passes NULL in the place of base_pointer
|
|
|
e3c68b |
to __shard_update_shards_inode_list(). This means, as part of adding the lru'd out
|
|
|
e3c68b |
inode back to lru list, base inode is not ref'd since its NULL.
|
|
|
e3c68b |
|
|
|
e3c68b |
Whereas post unlinking this shard, during shard_unlink_block_inode(),
|
|
|
e3c68b |
ctx->base_inode is accessible and is unref'd because the shard was found to be part
|
|
|
e3c68b |
of LRU list, although the matching ref didn't occur. This at some point leads to
|
|
|
e3c68b |
base_inode refcount becoming 0 and it getting destroyed and released back while some
|
|
|
e3c68b |
of its associated shards are continuing to be unlinked in parallel and the client crashes
|
|
|
e3c68b |
whenever it is accessed next.
|
|
|
e3c68b |
|
|
|
e3c68b |
Fix is to pass base shard correctly, if available, in shard_link_block_inode().
|
|
|
e3c68b |
|
|
|
e3c68b |
Also, the patch fixes the ret value check in tests/bugs/shard/shard-fallocate.c
|
|
|
e3c68b |
|
|
|
e3c68b |
>Change-Id: Ibd0bc4c6952367608e10701473cbad3947d7559f
|
|
|
e3c68b |
>Updates: bz#1696136
|
|
|
e3c68b |
>Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
|
|
|
e3c68b |
|
|
|
e3c68b |
Upstream Patch: https://review.gluster.org/#/c/glusterfs/+/22507/
|
|
|
e3c68b |
|
|
|
e3c68b |
BUG: 1694595
|
|
|
e3c68b |
Change-Id: Ibd0bc4c6952367608e10701473cbad3947d7559f
|
|
|
e3c68b |
Signed-off-by: Sunil Kumar Acharya <sheggodu@redhat.com>
|
|
|
e3c68b |
Reviewed-on: https://code.engineering.redhat.com/gerrit/172856
|
|
|
e3c68b |
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
|
|
e3c68b |
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
|
|
|
e3c68b |
---
|
|
|
e3c68b |
tests/bugs/shard/bug-1696136.c | 121 +++++++++++++++++++++++++++++++++++++
|
|
|
e3c68b |
tests/bugs/shard/bug-1696136.t | 33 ++++++++++
|
|
|
e3c68b |
tests/bugs/shard/shard-fallocate.c | 2 +-
|
|
|
e3c68b |
xlators/features/shard/src/shard.c | 12 +++-
|
|
|
e3c68b |
4 files changed, 164 insertions(+), 4 deletions(-)
|
|
|
e3c68b |
create mode 100644 tests/bugs/shard/bug-1696136.c
|
|
|
e3c68b |
create mode 100644 tests/bugs/shard/bug-1696136.t
|
|
|
e3c68b |
|
|
|
e3c68b |
diff --git a/tests/bugs/shard/bug-1696136.c b/tests/bugs/shard/bug-1696136.c
|
|
|
e3c68b |
new file mode 100644
|
|
|
e3c68b |
index 0000000..b9e8d13
|
|
|
e3c68b |
--- /dev/null
|
|
|
e3c68b |
+++ b/tests/bugs/shard/bug-1696136.c
|
|
|
e3c68b |
@@ -0,0 +1,121 @@
|
|
|
e3c68b |
+#define _GNU_SOURCE
|
|
|
e3c68b |
+#include <fcntl.h>
|
|
|
e3c68b |
+#include <stdio.h>
|
|
|
e3c68b |
+#include <stdlib.h>
|
|
|
e3c68b |
+#include <glusterfs/api/glfs.h>
|
|
|
e3c68b |
+#include <glusterfs/api/glfs-handles.h>
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+enum fallocate_flag {
|
|
|
e3c68b |
+ TEST_FALLOCATE_NONE,
|
|
|
e3c68b |
+ TEST_FALLOCATE_KEEP_SIZE,
|
|
|
e3c68b |
+ TEST_FALLOCATE_ZERO_RANGE,
|
|
|
e3c68b |
+ TEST_FALLOCATE_PUNCH_HOLE,
|
|
|
e3c68b |
+ TEST_FALLOCATE_MAX,
|
|
|
e3c68b |
+};
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+int
|
|
|
e3c68b |
+get_fallocate_flag(int opcode)
|
|
|
e3c68b |
+{
|
|
|
e3c68b |
+ int ret = 0;
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+ switch (opcode) {
|
|
|
e3c68b |
+ case TEST_FALLOCATE_NONE:
|
|
|
e3c68b |
+ ret = 0;
|
|
|
e3c68b |
+ break;
|
|
|
e3c68b |
+ case TEST_FALLOCATE_KEEP_SIZE:
|
|
|
e3c68b |
+ ret = FALLOC_FL_KEEP_SIZE;
|
|
|
e3c68b |
+ break;
|
|
|
e3c68b |
+ case TEST_FALLOCATE_ZERO_RANGE:
|
|
|
e3c68b |
+ ret = FALLOC_FL_ZERO_RANGE;
|
|
|
e3c68b |
+ break;
|
|
|
e3c68b |
+ case TEST_FALLOCATE_PUNCH_HOLE:
|
|
|
e3c68b |
+ ret = FALLOC_FL_PUNCH_HOLE;
|
|
|
e3c68b |
+ break;
|
|
|
e3c68b |
+ default:
|
|
|
e3c68b |
+ ret = -1;
|
|
|
e3c68b |
+ break;
|
|
|
e3c68b |
+ }
|
|
|
e3c68b |
+ return ret;
|
|
|
e3c68b |
+}
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+int
|
|
|
e3c68b |
+main(int argc, char *argv[])
|
|
|
e3c68b |
+{
|
|
|
e3c68b |
+ int ret = 1;
|
|
|
e3c68b |
+ int opcode = -1;
|
|
|
e3c68b |
+ off_t offset = 0;
|
|
|
e3c68b |
+ size_t len = 0;
|
|
|
e3c68b |
+ glfs_t *fs = NULL;
|
|
|
e3c68b |
+ glfs_fd_t *fd = NULL;
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+ if (argc != 8) {
|
|
|
e3c68b |
+ fprintf(stderr,
|
|
|
e3c68b |
+ "Syntax: %s <host> <volname> <opcode> <offset> <len> "
|
|
|
e3c68b |
+ "<file-path> <log-file>\n",
|
|
|
e3c68b |
+ argv[0]);
|
|
|
e3c68b |
+ return 1;
|
|
|
e3c68b |
+ }
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+ fs = glfs_new(argv[2]);
|
|
|
e3c68b |
+ if (!fs) {
|
|
|
e3c68b |
+ fprintf(stderr, "glfs_new: returned NULL\n");
|
|
|
e3c68b |
+ return 1;
|
|
|
e3c68b |
+ }
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+ ret = glfs_set_volfile_server(fs, "tcp", argv[1], 24007);
|
|
|
e3c68b |
+ if (ret != 0) {
|
|
|
e3c68b |
+ fprintf(stderr, "glfs_set_volfile_server: returned %d\n", ret);
|
|
|
e3c68b |
+ goto out;
|
|
|
e3c68b |
+ }
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+ ret = glfs_set_logging(fs, argv[7], 7);
|
|
|
e3c68b |
+ if (ret != 0) {
|
|
|
e3c68b |
+ fprintf(stderr, "glfs_set_logging: returned %d\n", ret);
|
|
|
e3c68b |
+ goto out;
|
|
|
e3c68b |
+ }
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+ ret = glfs_init(fs);
|
|
|
e3c68b |
+ if (ret != 0) {
|
|
|
e3c68b |
+ fprintf(stderr, "glfs_init: returned %d\n", ret);
|
|
|
e3c68b |
+ goto out;
|
|
|
e3c68b |
+ }
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+ opcode = atoi(argv[3]);
|
|
|
e3c68b |
+ opcode = get_fallocate_flag(opcode);
|
|
|
e3c68b |
+ if (opcode < 0) {
|
|
|
e3c68b |
+ fprintf(stderr, "get_fallocate_flag: invalid flag \n");
|
|
|
e3c68b |
+ goto out;
|
|
|
e3c68b |
+ }
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+ offset = atoi(argv[4]);
|
|
|
e3c68b |
+ len = atoi(argv[5]);
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+ fd = glfs_open(fs, argv[6], O_RDWR);
|
|
|
e3c68b |
+ if (fd == NULL) {
|
|
|
e3c68b |
+ fprintf(stderr, "glfs_open: returned NULL\n");
|
|
|
e3c68b |
+ goto out;
|
|
|
e3c68b |
+ }
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+ ret = glfs_fallocate(fd, opcode, offset, len);
|
|
|
e3c68b |
+ if (ret < 0) {
|
|
|
e3c68b |
+ fprintf(stderr, "glfs_fallocate: returned %d\n", ret);
|
|
|
e3c68b |
+ goto out;
|
|
|
e3c68b |
+ }
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+ ret = glfs_unlink(fs, argv[6]);
|
|
|
e3c68b |
+ if (ret < 0) {
|
|
|
e3c68b |
+ fprintf(stderr, "glfs_unlink: returned %d\n", ret);
|
|
|
e3c68b |
+ goto out;
|
|
|
e3c68b |
+ }
|
|
|
e3c68b |
+ /* Sleep for 3s to give enough time for background deletion to complete
|
|
|
e3c68b |
+ * during which if the bug exists, the process will crash.
|
|
|
e3c68b |
+ */
|
|
|
e3c68b |
+ sleep(3);
|
|
|
e3c68b |
+ ret = 0;
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+out:
|
|
|
e3c68b |
+ if (fd)
|
|
|
e3c68b |
+ glfs_close(fd);
|
|
|
e3c68b |
+ glfs_fini(fs);
|
|
|
e3c68b |
+ return ret;
|
|
|
e3c68b |
+}
|
|
|
e3c68b |
diff --git a/tests/bugs/shard/bug-1696136.t b/tests/bugs/shard/bug-1696136.t
|
|
|
e3c68b |
new file mode 100644
|
|
|
e3c68b |
index 0000000..b6dc858
|
|
|
e3c68b |
--- /dev/null
|
|
|
e3c68b |
+++ b/tests/bugs/shard/bug-1696136.t
|
|
|
e3c68b |
@@ -0,0 +1,33 @@
|
|
|
e3c68b |
+#!/bin/bash
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+. $(dirname $0)/../../include.rc
|
|
|
e3c68b |
+. $(dirname $0)/../../volume.rc
|
|
|
e3c68b |
+. $(dirname $0)/../../fallocate.rc
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+cleanup
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+TEST glusterd
|
|
|
e3c68b |
+TEST pidof glusterd
|
|
|
e3c68b |
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
|
|
|
e3c68b |
+TEST $CLI volume set $V0 features.shard on
|
|
|
e3c68b |
+TEST $CLI volume set $V0 features.shard-block-size 4MB
|
|
|
e3c68b |
+TEST $CLI volume set $V0 features.shard-lru-limit 120
|
|
|
e3c68b |
+TEST $CLI volume set $V0 performance.write-behind off
|
|
|
e3c68b |
+TEST $CLI volume start $V0
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+TEST build_tester $(dirname $0)/bug-1696136.c -lgfapi -Wall -O2
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+# Create a file
|
|
|
e3c68b |
+TEST touch $M0/file1
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+# Fallocate a 500M file. This will make sure number of participant shards are > lru-limit
|
|
|
e3c68b |
+TEST $(dirname $0)/bug-1696136 $H0 $V0 "0" "0" "536870912" /file1 `gluster --print-logdir`/glfs-$V0.log
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
|
|
|
e3c68b |
+TEST $CLI volume stop $V0
|
|
|
e3c68b |
+TEST $CLI volume delete $V0
|
|
|
e3c68b |
+rm -f $(dirname $0)/bug-1696136
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+cleanup
|
|
|
e3c68b |
diff --git a/tests/bugs/shard/shard-fallocate.c b/tests/bugs/shard/shard-fallocate.c
|
|
|
e3c68b |
index 3a784d3..45b9ce0 100644
|
|
|
e3c68b |
--- a/tests/bugs/shard/shard-fallocate.c
|
|
|
e3c68b |
+++ b/tests/bugs/shard/shard-fallocate.c
|
|
|
e3c68b |
@@ -97,7 +97,7 @@ main(int argc, char *argv[])
|
|
|
e3c68b |
}
|
|
|
e3c68b |
|
|
|
e3c68b |
ret = glfs_fallocate(fd, opcode, offset, len);
|
|
|
e3c68b |
- if (ret <= 0) {
|
|
|
e3c68b |
+ if (ret < 0) {
|
|
|
e3c68b |
fprintf(stderr, "glfs_fallocate: returned %d\n", ret);
|
|
|
e3c68b |
goto out;
|
|
|
e3c68b |
}
|
|
|
e3c68b |
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
|
|
|
e3c68b |
index fa3564a..3c4bcdc 100644
|
|
|
e3c68b |
--- a/xlators/features/shard/src/shard.c
|
|
|
e3c68b |
+++ b/xlators/features/shard/src/shard.c
|
|
|
e3c68b |
@@ -2213,13 +2213,19 @@ shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode,
|
|
|
e3c68b |
xlator_t *this = NULL;
|
|
|
e3c68b |
inode_t *fsync_inode = NULL;
|
|
|
e3c68b |
shard_priv_t *priv = NULL;
|
|
|
e3c68b |
+ inode_t *base_inode = NULL;
|
|
|
e3c68b |
|
|
|
e3c68b |
this = THIS;
|
|
|
e3c68b |
priv = this->private;
|
|
|
e3c68b |
- if (local->loc.inode)
|
|
|
e3c68b |
+ if (local->loc.inode) {
|
|
|
e3c68b |
gf_uuid_copy(gfid, local->loc.inode->gfid);
|
|
|
e3c68b |
- else
|
|
|
e3c68b |
+ base_inode = local->loc.inode;
|
|
|
e3c68b |
+ } else if (local->resolver_base_inode) {
|
|
|
e3c68b |
+ gf_uuid_copy(gfid, local->resolver_base_inode->gfid);
|
|
|
e3c68b |
+ base_inode = local->resolver_base_inode;
|
|
|
e3c68b |
+ } else {
|
|
|
e3c68b |
gf_uuid_copy(gfid, local->base_gfid);
|
|
|
e3c68b |
+ }
|
|
|
e3c68b |
|
|
|
e3c68b |
shard_make_block_bname(block_num, gfid, block_bname, sizeof(block_bname));
|
|
|
e3c68b |
|
|
|
e3c68b |
@@ -2232,7 +2238,7 @@ shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode,
|
|
|
e3c68b |
LOCK(&priv->lock);
|
|
|
e3c68b |
{
|
|
|
e3c68b |
fsync_inode = __shard_update_shards_inode_list(
|
|
|
e3c68b |
- linked_inode, this, local->loc.inode, block_num, gfid);
|
|
|
e3c68b |
+ linked_inode, this, base_inode, block_num, gfid);
|
|
|
e3c68b |
}
|
|
|
e3c68b |
UNLOCK(&priv->lock);
|
|
|
e3c68b |
if (fsync_inode)
|
|
|
e3c68b |
--
|
|
|
e3c68b |
1.8.3.1
|
|
|
e3c68b |
|