e3c68b
From 40ac42501d6bbff7206e753e8e988beefe74f5f4 Mon Sep 17 00:00:00 2001
e3c68b
From: Krutika Dhananjay <kdhananj@redhat.com>
e3c68b
Date: Fri, 5 Apr 2019 10:30:23 +0530
e3c68b
Subject: [PATCH 176/178] features/shard: Fix crash during background shard
e3c68b
 deletion in a specific case
e3c68b
e3c68b
Consider the following case -
e3c68b
1. A file gets FALLOCATE'd such that > "shard-lru-limit" number of
e3c68b
   shards are created.
e3c68b
2. And then it is deleted after that.
e3c68b
e3c68b
The unique thing about FALLOCATE is that unlike WRITE, all of the
e3c68b
participant shards are resolved and created and fallocated in a single
e3c68b
batch. This means, in this case, after the first "shard-lru-limit"
e3c68b
number of shards are resolved and added to lru list, as part of
e3c68b
resolution of the remaining shards, some of the existing shards in lru
e3c68b
list will need to be evicted. So these evicted shards will be
e3c68b
inode_unlink()d as part of eviction. Now once the fop gets to the actual
e3c68b
FALLOCATE stage, the lru'd-out shards get added to fsync list.
e3c68b
e3c68b
2 things to note at this point:
e3c68b
i. the lru'd out shards are only part of fsync list, so each holds 1 ref
e3c68b
   on base shard
e3c68b
ii. and the more recently used shards are part of both fsync and lru list.
e3c68b
    So each of these shards holds 2 refs on base inode  - one for being
e3c68b
    part of fsync list, and the other for being part of lru list.
e3c68b
e3c68b
FALLOCATE completes successfully and then this very file is deleted, and
e3c68b
background shard deletion launched. Here's where the ref counts get mismatched.
e3c68b
First as part of inode_resolve()s during the deletion, the lru'd-out inodes
e3c68b
return NULL, because they are inode_unlink()'d by now. So these inodes need to
e3c68b
be freshly looked up. But as part of linking them in lookup_cbk (precisely in
e3c68b
shard_link_block_inode()), inode_link() returns the lru'd-out inode object.
e3c68b
And its inode ctx is still valid and ctx->base_inode valid from the last
e3c68b
time it was added to list.
e3c68b
e3c68b
But shard_common_lookup_shards_cbk() passes NULL in the place of base_pointer
e3c68b
to __shard_update_shards_inode_list(). This means, as part of adding the lru'd out
e3c68b
inode back to lru list, base inode is not ref'd since its NULL.
e3c68b
e3c68b
Whereas post unlinking this shard, during shard_unlink_block_inode(),
e3c68b
ctx->base_inode is accessible and is unref'd because the shard was found to be part
e3c68b
of LRU list, although the matching ref didn't occur. This at some point leads to
e3c68b
base_inode refcount becoming 0 and it getting destroyed and released back while some
e3c68b
of its associated shards are continuing to be unlinked in parallel and the client crashes
e3c68b
whenever it is accessed next.
e3c68b
e3c68b
Fix is to pass base shard correctly, if available, in shard_link_block_inode().
e3c68b
e3c68b
Also, the patch fixes the ret value check in tests/bugs/shard/shard-fallocate.c
e3c68b
e3c68b
>Change-Id: Ibd0bc4c6952367608e10701473cbad3947d7559f
e3c68b
>Updates: bz#1696136
e3c68b
>Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
e3c68b
e3c68b
Upstream Patch: https://review.gluster.org/#/c/glusterfs/+/22507/
e3c68b
e3c68b
BUG: 1694595
e3c68b
Change-Id: Ibd0bc4c6952367608e10701473cbad3947d7559f
e3c68b
Signed-off-by: Sunil Kumar Acharya <sheggodu@redhat.com>
e3c68b
Reviewed-on: https://code.engineering.redhat.com/gerrit/172856
e3c68b
Tested-by: RHGS Build Bot <nigelb@redhat.com>
e3c68b
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
e3c68b
---
e3c68b
 tests/bugs/shard/bug-1696136.c     | 121 +++++++++++++++++++++++++++++++++++++
e3c68b
 tests/bugs/shard/bug-1696136.t     |  33 ++++++++++
e3c68b
 tests/bugs/shard/shard-fallocate.c |   2 +-
e3c68b
 xlators/features/shard/src/shard.c |  12 +++-
e3c68b
 4 files changed, 164 insertions(+), 4 deletions(-)
e3c68b
 create mode 100644 tests/bugs/shard/bug-1696136.c
e3c68b
 create mode 100644 tests/bugs/shard/bug-1696136.t
e3c68b
e3c68b
diff --git a/tests/bugs/shard/bug-1696136.c b/tests/bugs/shard/bug-1696136.c
e3c68b
new file mode 100644
e3c68b
index 0000000..b9e8d13
e3c68b
--- /dev/null
e3c68b
+++ b/tests/bugs/shard/bug-1696136.c
e3c68b
@@ -0,0 +1,121 @@
e3c68b
+#define _GNU_SOURCE
e3c68b
+#include <fcntl.h>
e3c68b
+#include <stdio.h>
e3c68b
+#include <stdlib.h>
e3c68b
+#include <glusterfs/api/glfs.h>
e3c68b
+#include <glusterfs/api/glfs-handles.h>
e3c68b
+
e3c68b
+enum fallocate_flag {
e3c68b
+    TEST_FALLOCATE_NONE,
e3c68b
+    TEST_FALLOCATE_KEEP_SIZE,
e3c68b
+    TEST_FALLOCATE_ZERO_RANGE,
e3c68b
+    TEST_FALLOCATE_PUNCH_HOLE,
e3c68b
+    TEST_FALLOCATE_MAX,
e3c68b
+};
e3c68b
+
e3c68b
+int
e3c68b
+get_fallocate_flag(int opcode)
e3c68b
+{
e3c68b
+    int ret = 0;
e3c68b
+
e3c68b
+    switch (opcode) {
e3c68b
+        case TEST_FALLOCATE_NONE:
e3c68b
+            ret = 0;
e3c68b
+            break;
e3c68b
+        case TEST_FALLOCATE_KEEP_SIZE:
e3c68b
+            ret = FALLOC_FL_KEEP_SIZE;
e3c68b
+            break;
e3c68b
+        case TEST_FALLOCATE_ZERO_RANGE:
e3c68b
+            ret = FALLOC_FL_ZERO_RANGE;
e3c68b
+            break;
e3c68b
+        case TEST_FALLOCATE_PUNCH_HOLE:
e3c68b
+            ret = FALLOC_FL_PUNCH_HOLE;
e3c68b
+            break;
e3c68b
+        default:
e3c68b
+            ret = -1;
e3c68b
+            break;
e3c68b
+    }
e3c68b
+    return ret;
e3c68b
+}
e3c68b
+
e3c68b
+int
e3c68b
+main(int argc, char *argv[])
e3c68b
+{
e3c68b
+    int ret = 1;
e3c68b
+    int opcode = -1;
e3c68b
+    off_t offset = 0;
e3c68b
+    size_t len = 0;
e3c68b
+    glfs_t *fs = NULL;
e3c68b
+    glfs_fd_t *fd = NULL;
e3c68b
+
e3c68b
+    if (argc != 8) {
e3c68b
+        fprintf(stderr,
e3c68b
+                "Syntax: %s <host> <volname> <opcode> <offset> <len> "
e3c68b
+                "<file-path> <log-file>\n",
e3c68b
+                argv[0]);
e3c68b
+        return 1;
e3c68b
+    }
e3c68b
+
e3c68b
+    fs = glfs_new(argv[2]);
e3c68b
+    if (!fs) {
e3c68b
+        fprintf(stderr, "glfs_new: returned NULL\n");
e3c68b
+        return 1;
e3c68b
+    }
e3c68b
+
e3c68b
+    ret = glfs_set_volfile_server(fs, "tcp", argv[1], 24007);
e3c68b
+    if (ret != 0) {
e3c68b
+        fprintf(stderr, "glfs_set_volfile_server: returned %d\n", ret);
e3c68b
+        goto out;
e3c68b
+    }
e3c68b
+
e3c68b
+    ret = glfs_set_logging(fs, argv[7], 7);
e3c68b
+    if (ret != 0) {
e3c68b
+        fprintf(stderr, "glfs_set_logging: returned %d\n", ret);
e3c68b
+        goto out;
e3c68b
+    }
e3c68b
+
e3c68b
+    ret = glfs_init(fs);
e3c68b
+    if (ret != 0) {
e3c68b
+        fprintf(stderr, "glfs_init: returned %d\n", ret);
e3c68b
+        goto out;
e3c68b
+    }
e3c68b
+
e3c68b
+    opcode = atoi(argv[3]);
e3c68b
+    opcode = get_fallocate_flag(opcode);
e3c68b
+    if (opcode < 0) {
e3c68b
+        fprintf(stderr, "get_fallocate_flag: invalid flag \n");
e3c68b
+        goto out;
e3c68b
+    }
e3c68b
+
e3c68b
+    offset = atoi(argv[4]);
e3c68b
+    len = atoi(argv[5]);
e3c68b
+
e3c68b
+    fd = glfs_open(fs, argv[6], O_RDWR);
e3c68b
+    if (fd == NULL) {
e3c68b
+        fprintf(stderr, "glfs_open: returned NULL\n");
e3c68b
+        goto out;
e3c68b
+    }
e3c68b
+
e3c68b
+    ret = glfs_fallocate(fd, opcode, offset, len);
e3c68b
+    if (ret < 0) {
e3c68b
+        fprintf(stderr, "glfs_fallocate: returned %d\n", ret);
e3c68b
+        goto out;
e3c68b
+    }
e3c68b
+
e3c68b
+    ret = glfs_unlink(fs, argv[6]);
e3c68b
+    if (ret < 0) {
e3c68b
+        fprintf(stderr, "glfs_unlink: returned %d\n", ret);
e3c68b
+        goto out;
e3c68b
+    }
e3c68b
+    /* Sleep for 3s to give enough time for background deletion to complete
e3c68b
+     * during which if the bug exists, the process will crash.
e3c68b
+     */
e3c68b
+    sleep(3);
e3c68b
+    ret = 0;
e3c68b
+
e3c68b
+out:
e3c68b
+    if (fd)
e3c68b
+        glfs_close(fd);
e3c68b
+    glfs_fini(fs);
e3c68b
+    return ret;
e3c68b
+}
e3c68b
diff --git a/tests/bugs/shard/bug-1696136.t b/tests/bugs/shard/bug-1696136.t
e3c68b
new file mode 100644
e3c68b
index 0000000..b6dc858
e3c68b
--- /dev/null
e3c68b
+++ b/tests/bugs/shard/bug-1696136.t
e3c68b
@@ -0,0 +1,33 @@
e3c68b
+#!/bin/bash
e3c68b
+
e3c68b
+. $(dirname $0)/../../include.rc
e3c68b
+. $(dirname $0)/../../volume.rc
e3c68b
+. $(dirname $0)/../../fallocate.rc
e3c68b
+
e3c68b
+cleanup
e3c68b
+
e3c68b
+TEST glusterd
e3c68b
+TEST pidof glusterd
e3c68b
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
e3c68b
+TEST $CLI volume set $V0 features.shard on
e3c68b
+TEST $CLI volume set $V0 features.shard-block-size 4MB
e3c68b
+TEST $CLI volume set $V0 features.shard-lru-limit 120
e3c68b
+TEST $CLI volume set $V0 performance.write-behind off
e3c68b
+TEST $CLI volume start $V0
e3c68b
+
e3c68b
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
e3c68b
+
e3c68b
+TEST build_tester $(dirname $0)/bug-1696136.c -lgfapi -Wall -O2
e3c68b
+
e3c68b
+# Create a file
e3c68b
+TEST touch $M0/file1
e3c68b
+
e3c68b
+# Fallocate a 500M file. This will make sure number of participant shards are > lru-limit
e3c68b
+TEST $(dirname $0)/bug-1696136 $H0 $V0 "0" "0" "536870912" /file1 `gluster --print-logdir`/glfs-$V0.log
e3c68b
+
e3c68b
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
e3c68b
+TEST $CLI volume stop $V0
e3c68b
+TEST $CLI volume delete $V0
e3c68b
+rm -f $(dirname $0)/bug-1696136
e3c68b
+
e3c68b
+cleanup
e3c68b
diff --git a/tests/bugs/shard/shard-fallocate.c b/tests/bugs/shard/shard-fallocate.c
e3c68b
index 3a784d3..45b9ce0 100644
e3c68b
--- a/tests/bugs/shard/shard-fallocate.c
e3c68b
+++ b/tests/bugs/shard/shard-fallocate.c
e3c68b
@@ -97,7 +97,7 @@ main(int argc, char *argv[])
e3c68b
     }
e3c68b
 
e3c68b
     ret = glfs_fallocate(fd, opcode, offset, len);
e3c68b
-    if (ret <= 0) {
e3c68b
+    if (ret < 0) {
e3c68b
         fprintf(stderr, "glfs_fallocate: returned %d\n", ret);
e3c68b
         goto out;
e3c68b
     }
e3c68b
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
e3c68b
index fa3564a..3c4bcdc 100644
e3c68b
--- a/xlators/features/shard/src/shard.c
e3c68b
+++ b/xlators/features/shard/src/shard.c
e3c68b
@@ -2213,13 +2213,19 @@ shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode,
e3c68b
     xlator_t *this = NULL;
e3c68b
     inode_t *fsync_inode = NULL;
e3c68b
     shard_priv_t *priv = NULL;
e3c68b
+    inode_t *base_inode = NULL;
e3c68b
 
e3c68b
     this = THIS;
e3c68b
     priv = this->private;
e3c68b
-    if (local->loc.inode)
e3c68b
+    if (local->loc.inode) {
e3c68b
         gf_uuid_copy(gfid, local->loc.inode->gfid);
e3c68b
-    else
e3c68b
+        base_inode = local->loc.inode;
e3c68b
+    } else if (local->resolver_base_inode) {
e3c68b
+        gf_uuid_copy(gfid, local->resolver_base_inode->gfid);
e3c68b
+        base_inode = local->resolver_base_inode;
e3c68b
+    } else {
e3c68b
         gf_uuid_copy(gfid, local->base_gfid);
e3c68b
+    }
e3c68b
 
e3c68b
     shard_make_block_bname(block_num, gfid, block_bname, sizeof(block_bname));
e3c68b
 
e3c68b
@@ -2232,7 +2238,7 @@ shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode,
e3c68b
     LOCK(&priv->lock);
e3c68b
     {
e3c68b
         fsync_inode = __shard_update_shards_inode_list(
e3c68b
-            linked_inode, this, local->loc.inode, block_num, gfid);
e3c68b
+            linked_inode, this, base_inode, block_num, gfid);
e3c68b
     }
e3c68b
     UNLOCK(&priv->lock);
e3c68b
     if (fsync_inode)
e3c68b
-- 
e3c68b
1.8.3.1
e3c68b