256ebe
From 40ac42501d6bbff7206e753e8e988beefe74f5f4 Mon Sep 17 00:00:00 2001
256ebe
From: Krutika Dhananjay <kdhananj@redhat.com>
256ebe
Date: Fri, 5 Apr 2019 10:30:23 +0530
256ebe
Subject: [PATCH 176/178] features/shard: Fix crash during background shard
256ebe
 deletion in a specific case
256ebe
256ebe
Consider the following case -
256ebe
1. A file gets FALLOCATE'd such that > "shard-lru-limit" number of
256ebe
   shards are created.
256ebe
2. And then it is deleted after that.
256ebe
256ebe
The unique thing about FALLOCATE is that unlike WRITE, all of the
256ebe
participant shards are resolved and created and fallocated in a single
256ebe
batch. This means, in this case, after the first "shard-lru-limit"
256ebe
number of shards are resolved and added to lru list, as part of
256ebe
resolution of the remaining shards, some of the existing shards in lru
256ebe
list will need to be evicted. So these evicted shards will be
256ebe
inode_unlink()d as part of eviction. Now once the fop gets to the actual
256ebe
FALLOCATE stage, the lru'd-out shards get added to fsync list.
256ebe
256ebe
2 things to note at this point:
256ebe
i. the lru'd out shards are only part of fsync list, so each holds 1 ref
256ebe
   on base shard
256ebe
ii. and the more recently used shards are part of both fsync and lru list.
256ebe
    So each of these shards holds 2 refs on base inode  - one for being
256ebe
    part of fsync list, and the other for being part of lru list.
256ebe
256ebe
FALLOCATE completes successfully and then this very file is deleted, and
256ebe
background shard deletion launched. Here's where the ref counts get mismatched.
256ebe
First as part of inode_resolve()s during the deletion, the lru'd-out inodes
256ebe
return NULL, because they are inode_unlink()'d by now. So these inodes need to
256ebe
be freshly looked up. But as part of linking them in lookup_cbk (precisely in
256ebe
shard_link_block_inode()), inode_link() returns the lru'd-out inode object.
256ebe
And its inode ctx is still valid and ctx->base_inode valid from the last
256ebe
time it was added to list.
256ebe
256ebe
But shard_common_lookup_shards_cbk() passes NULL in the place of base_pointer
256ebe
to __shard_update_shards_inode_list(). This means, as part of adding the lru'd out
256ebe
inode back to lru list, base inode is not ref'd since its NULL.
256ebe
256ebe
Whereas post unlinking this shard, during shard_unlink_block_inode(),
256ebe
ctx->base_inode is accessible and is unref'd because the shard was found to be part
256ebe
of LRU list, although the matching ref didn't occur. This at some point leads to
256ebe
base_inode refcount becoming 0 and it getting destroyed and released back while some
256ebe
of its associated shards are continuing to be unlinked in parallel and the client crashes
256ebe
whenever it is accessed next.
256ebe
256ebe
Fix is to pass base shard correctly, if available, in shard_link_block_inode().
256ebe
256ebe
Also, the patch fixes the ret value check in tests/bugs/shard/shard-fallocate.c
256ebe
256ebe
>Change-Id: Ibd0bc4c6952367608e10701473cbad3947d7559f
256ebe
>Updates: bz#1696136
256ebe
>Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
256ebe
256ebe
Upstream Patch: https://review.gluster.org/#/c/glusterfs/+/22507/
256ebe
256ebe
BUG: 1694595
256ebe
Change-Id: Ibd0bc4c6952367608e10701473cbad3947d7559f
256ebe
Signed-off-by: Sunil Kumar Acharya <sheggodu@redhat.com>
256ebe
Reviewed-on: https://code.engineering.redhat.com/gerrit/172856
256ebe
Tested-by: RHGS Build Bot <nigelb@redhat.com>
256ebe
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
256ebe
---
256ebe
 tests/bugs/shard/bug-1696136.c     | 121 +++++++++++++++++++++++++++++++++++++
256ebe
 tests/bugs/shard/bug-1696136.t     |  33 ++++++++++
256ebe
 tests/bugs/shard/shard-fallocate.c |   2 +-
256ebe
 xlators/features/shard/src/shard.c |  12 +++-
256ebe
 4 files changed, 164 insertions(+), 4 deletions(-)
256ebe
 create mode 100644 tests/bugs/shard/bug-1696136.c
256ebe
 create mode 100644 tests/bugs/shard/bug-1696136.t
256ebe
256ebe
diff --git a/tests/bugs/shard/bug-1696136.c b/tests/bugs/shard/bug-1696136.c
256ebe
new file mode 100644
256ebe
index 0000000..b9e8d13
256ebe
--- /dev/null
256ebe
+++ b/tests/bugs/shard/bug-1696136.c
256ebe
@@ -0,0 +1,121 @@
256ebe
+#define _GNU_SOURCE
256ebe
+#include <fcntl.h>
256ebe
+#include <stdio.h>
256ebe
+#include <stdlib.h>
256ebe
+#include <glusterfs/api/glfs.h>
256ebe
+#include <glusterfs/api/glfs-handles.h>
256ebe
+
256ebe
+enum fallocate_flag {
256ebe
+    TEST_FALLOCATE_NONE,
256ebe
+    TEST_FALLOCATE_KEEP_SIZE,
256ebe
+    TEST_FALLOCATE_ZERO_RANGE,
256ebe
+    TEST_FALLOCATE_PUNCH_HOLE,
256ebe
+    TEST_FALLOCATE_MAX,
256ebe
+};
256ebe
+
256ebe
+int
256ebe
+get_fallocate_flag(int opcode)
256ebe
+{
256ebe
+    int ret = 0;
256ebe
+
256ebe
+    switch (opcode) {
256ebe
+        case TEST_FALLOCATE_NONE:
256ebe
+            ret = 0;
256ebe
+            break;
256ebe
+        case TEST_FALLOCATE_KEEP_SIZE:
256ebe
+            ret = FALLOC_FL_KEEP_SIZE;
256ebe
+            break;
256ebe
+        case TEST_FALLOCATE_ZERO_RANGE:
256ebe
+            ret = FALLOC_FL_ZERO_RANGE;
256ebe
+            break;
256ebe
+        case TEST_FALLOCATE_PUNCH_HOLE:
256ebe
+            ret = FALLOC_FL_PUNCH_HOLE;
256ebe
+            break;
256ebe
+        default:
256ebe
+            ret = -1;
256ebe
+            break;
256ebe
+    }
256ebe
+    return ret;
256ebe
+}
256ebe
+
256ebe
+int
256ebe
+main(int argc, char *argv[])
256ebe
+{
256ebe
+    int ret = 1;
256ebe
+    int opcode = -1;
256ebe
+    off_t offset = 0;
256ebe
+    size_t len = 0;
256ebe
+    glfs_t *fs = NULL;
256ebe
+    glfs_fd_t *fd = NULL;
256ebe
+
256ebe
+    if (argc != 8) {
256ebe
+        fprintf(stderr,
256ebe
+                "Syntax: %s <host> <volname> <opcode> <offset> <len> "
256ebe
+                "<file-path> <log-file>\n",
256ebe
+                argv[0]);
256ebe
+        return 1;
256ebe
+    }
256ebe
+
256ebe
+    fs = glfs_new(argv[2]);
256ebe
+    if (!fs) {
256ebe
+        fprintf(stderr, "glfs_new: returned NULL\n");
256ebe
+        return 1;
256ebe
+    }
256ebe
+
256ebe
+    ret = glfs_set_volfile_server(fs, "tcp", argv[1], 24007);
256ebe
+    if (ret != 0) {
256ebe
+        fprintf(stderr, "glfs_set_volfile_server: returned %d\n", ret);
256ebe
+        goto out;
256ebe
+    }
256ebe
+
256ebe
+    ret = glfs_set_logging(fs, argv[7], 7);
256ebe
+    if (ret != 0) {
256ebe
+        fprintf(stderr, "glfs_set_logging: returned %d\n", ret);
256ebe
+        goto out;
256ebe
+    }
256ebe
+
256ebe
+    ret = glfs_init(fs);
256ebe
+    if (ret != 0) {
256ebe
+        fprintf(stderr, "glfs_init: returned %d\n", ret);
256ebe
+        goto out;
256ebe
+    }
256ebe
+
256ebe
+    opcode = atoi(argv[3]);
256ebe
+    opcode = get_fallocate_flag(opcode);
256ebe
+    if (opcode < 0) {
256ebe
+        fprintf(stderr, "get_fallocate_flag: invalid flag \n");
256ebe
+        goto out;
256ebe
+    }
256ebe
+
256ebe
+    offset = atoi(argv[4]);
256ebe
+    len = atoi(argv[5]);
256ebe
+
256ebe
+    fd = glfs_open(fs, argv[6], O_RDWR);
256ebe
+    if (fd == NULL) {
256ebe
+        fprintf(stderr, "glfs_open: returned NULL\n");
256ebe
+        goto out;
256ebe
+    }
256ebe
+
256ebe
+    ret = glfs_fallocate(fd, opcode, offset, len);
256ebe
+    if (ret < 0) {
256ebe
+        fprintf(stderr, "glfs_fallocate: returned %d\n", ret);
256ebe
+        goto out;
256ebe
+    }
256ebe
+
256ebe
+    ret = glfs_unlink(fs, argv[6]);
256ebe
+    if (ret < 0) {
256ebe
+        fprintf(stderr, "glfs_unlink: returned %d\n", ret);
256ebe
+        goto out;
256ebe
+    }
256ebe
+    /* Sleep for 3s to give enough time for background deletion to complete
256ebe
+     * during which if the bug exists, the process will crash.
256ebe
+     */
256ebe
+    sleep(3);
256ebe
+    ret = 0;
256ebe
+
256ebe
+out:
256ebe
+    if (fd)
256ebe
+        glfs_close(fd);
256ebe
+    glfs_fini(fs);
256ebe
+    return ret;
256ebe
+}
256ebe
diff --git a/tests/bugs/shard/bug-1696136.t b/tests/bugs/shard/bug-1696136.t
256ebe
new file mode 100644
256ebe
index 0000000..b6dc858
256ebe
--- /dev/null
256ebe
+++ b/tests/bugs/shard/bug-1696136.t
256ebe
@@ -0,0 +1,33 @@
256ebe
+#!/bin/bash
256ebe
+
256ebe
+. $(dirname $0)/../../include.rc
256ebe
+. $(dirname $0)/../../volume.rc
256ebe
+. $(dirname $0)/../../fallocate.rc
256ebe
+
256ebe
+cleanup
256ebe
+
256ebe
+TEST glusterd
256ebe
+TEST pidof glusterd
256ebe
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
256ebe
+TEST $CLI volume set $V0 features.shard on
256ebe
+TEST $CLI volume set $V0 features.shard-block-size 4MB
256ebe
+TEST $CLI volume set $V0 features.shard-lru-limit 120
256ebe
+TEST $CLI volume set $V0 performance.write-behind off
256ebe
+TEST $CLI volume start $V0
256ebe
+
256ebe
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
256ebe
+
256ebe
+TEST build_tester $(dirname $0)/bug-1696136.c -lgfapi -Wall -O2
256ebe
+
256ebe
+# Create a file
256ebe
+TEST touch $M0/file1
256ebe
+
256ebe
+# Fallocate a 500M file. This will make sure number of participant shards are > lru-limit
256ebe
+TEST $(dirname $0)/bug-1696136 $H0 $V0 "0" "0" "536870912" /file1 `gluster --print-logdir`/glfs-$V0.log
256ebe
+
256ebe
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
256ebe
+TEST $CLI volume stop $V0
256ebe
+TEST $CLI volume delete $V0
256ebe
+rm -f $(dirname $0)/bug-1696136
256ebe
+
256ebe
+cleanup
256ebe
diff --git a/tests/bugs/shard/shard-fallocate.c b/tests/bugs/shard/shard-fallocate.c
256ebe
index 3a784d3..45b9ce0 100644
256ebe
--- a/tests/bugs/shard/shard-fallocate.c
256ebe
+++ b/tests/bugs/shard/shard-fallocate.c
256ebe
@@ -97,7 +97,7 @@ main(int argc, char *argv[])
256ebe
     }
256ebe
 
256ebe
     ret = glfs_fallocate(fd, opcode, offset, len);
256ebe
-    if (ret <= 0) {
256ebe
+    if (ret < 0) {
256ebe
         fprintf(stderr, "glfs_fallocate: returned %d\n", ret);
256ebe
         goto out;
256ebe
     }
256ebe
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
256ebe
index fa3564a..3c4bcdc 100644
256ebe
--- a/xlators/features/shard/src/shard.c
256ebe
+++ b/xlators/features/shard/src/shard.c
256ebe
@@ -2213,13 +2213,19 @@ shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode,
256ebe
     xlator_t *this = NULL;
256ebe
     inode_t *fsync_inode = NULL;
256ebe
     shard_priv_t *priv = NULL;
256ebe
+    inode_t *base_inode = NULL;
256ebe
 
256ebe
     this = THIS;
256ebe
     priv = this->private;
256ebe
-    if (local->loc.inode)
256ebe
+    if (local->loc.inode) {
256ebe
         gf_uuid_copy(gfid, local->loc.inode->gfid);
256ebe
-    else
256ebe
+        base_inode = local->loc.inode;
256ebe
+    } else if (local->resolver_base_inode) {
256ebe
+        gf_uuid_copy(gfid, local->resolver_base_inode->gfid);
256ebe
+        base_inode = local->resolver_base_inode;
256ebe
+    } else {
256ebe
         gf_uuid_copy(gfid, local->base_gfid);
256ebe
+    }
256ebe
 
256ebe
     shard_make_block_bname(block_num, gfid, block_bname, sizeof(block_bname));
256ebe
 
256ebe
@@ -2232,7 +2238,7 @@ shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode,
256ebe
     LOCK(&priv->lock);
256ebe
     {
256ebe
         fsync_inode = __shard_update_shards_inode_list(
256ebe
-            linked_inode, this, local->loc.inode, block_num, gfid);
256ebe
+            linked_inode, this, base_inode, block_num, gfid);
256ebe
     }
256ebe
     UNLOCK(&priv->lock);
256ebe
     if (fsync_inode)
256ebe
-- 
256ebe
1.8.3.1
256ebe