17b94a
From 40ac42501d6bbff7206e753e8e988beefe74f5f4 Mon Sep 17 00:00:00 2001
17b94a
From: Krutika Dhananjay <kdhananj@redhat.com>
17b94a
Date: Fri, 5 Apr 2019 10:30:23 +0530
17b94a
Subject: [PATCH 176/178] features/shard: Fix crash during background shard
17b94a
 deletion in a specific case
17b94a
17b94a
Consider the following case -
17b94a
1. A file gets FALLOCATE'd such that > "shard-lru-limit" number of
17b94a
   shards are created.
17b94a
2. And then it is deleted after that.
17b94a
17b94a
The unique thing about FALLOCATE is that unlike WRITE, all of the
17b94a
participant shards are resolved and created and fallocated in a single
17b94a
batch. This means, in this case, after the first "shard-lru-limit"
17b94a
number of shards are resolved and added to lru list, as part of
17b94a
resolution of the remaining shards, some of the existing shards in lru
17b94a
list will need to be evicted. So these evicted shards will be
17b94a
inode_unlink()d as part of eviction. Now once the fop gets to the actual
17b94a
FALLOCATE stage, the lru'd-out shards get added to fsync list.
17b94a
17b94a
2 things to note at this point:
17b94a
i. the lru'd out shards are only part of fsync list, so each holds 1 ref
17b94a
   on base shard
17b94a
ii. and the more recently used shards are part of both fsync and lru list.
17b94a
    So each of these shards holds 2 refs on base inode  - one for being
17b94a
    part of fsync list, and the other for being part of lru list.
17b94a
17b94a
FALLOCATE completes successfully and then this very file is deleted, and
17b94a
background shard deletion launched. Here's where the ref counts get mismatched.
17b94a
First as part of inode_resolve()s during the deletion, the lru'd-out inodes
17b94a
return NULL, because they are inode_unlink()'d by now. So these inodes need to
17b94a
be freshly looked up. But as part of linking them in lookup_cbk (precisely in
17b94a
shard_link_block_inode()), inode_link() returns the lru'd-out inode object.
17b94a
And its inode ctx is still valid and ctx->base_inode valid from the last
17b94a
time it was added to list.
17b94a
17b94a
But shard_common_lookup_shards_cbk() passes NULL in the place of base_pointer
17b94a
to __shard_update_shards_inode_list(). This means, as part of adding the lru'd out
17b94a
inode back to lru list, base inode is not ref'd since its NULL.
17b94a
17b94a
Whereas post unlinking this shard, during shard_unlink_block_inode(),
17b94a
ctx->base_inode is accessible and is unref'd because the shard was found to be part
17b94a
of LRU list, although the matching ref didn't occur. This at some point leads to
17b94a
base_inode refcount becoming 0 and it getting destroyed and released back while some
17b94a
of its associated shards are continuing to be unlinked in parallel and the client crashes
17b94a
whenever it is accessed next.
17b94a
17b94a
Fix is to pass base shard correctly, if available, in shard_link_block_inode().
17b94a
17b94a
Also, the patch fixes the ret value check in tests/bugs/shard/shard-fallocate.c
17b94a
17b94a
>Change-Id: Ibd0bc4c6952367608e10701473cbad3947d7559f
17b94a
>Updates: bz#1696136
17b94a
>Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
17b94a
17b94a
Upstream Patch: https://review.gluster.org/#/c/glusterfs/+/22507/
17b94a
17b94a
BUG: 1694595
17b94a
Change-Id: Ibd0bc4c6952367608e10701473cbad3947d7559f
17b94a
Signed-off-by: Sunil Kumar Acharya <sheggodu@redhat.com>
17b94a
Reviewed-on: https://code.engineering.redhat.com/gerrit/172856
17b94a
Tested-by: RHGS Build Bot <nigelb@redhat.com>
17b94a
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
17b94a
---
17b94a
 tests/bugs/shard/bug-1696136.c     | 121 +++++++++++++++++++++++++++++++++++++
17b94a
 tests/bugs/shard/bug-1696136.t     |  33 ++++++++++
17b94a
 tests/bugs/shard/shard-fallocate.c |   2 +-
17b94a
 xlators/features/shard/src/shard.c |  12 +++-
17b94a
 4 files changed, 164 insertions(+), 4 deletions(-)
17b94a
 create mode 100644 tests/bugs/shard/bug-1696136.c
17b94a
 create mode 100644 tests/bugs/shard/bug-1696136.t
17b94a
17b94a
diff --git a/tests/bugs/shard/bug-1696136.c b/tests/bugs/shard/bug-1696136.c
17b94a
new file mode 100644
17b94a
index 0000000..b9e8d13
17b94a
--- /dev/null
17b94a
+++ b/tests/bugs/shard/bug-1696136.c
17b94a
@@ -0,0 +1,121 @@
17b94a
+#define _GNU_SOURCE
17b94a
+#include <fcntl.h>
17b94a
+#include <stdio.h>
17b94a
+#include <stdlib.h>
17b94a
+#include <glusterfs/api/glfs.h>
17b94a
+#include <glusterfs/api/glfs-handles.h>
17b94a
+
17b94a
+enum fallocate_flag {
17b94a
+    TEST_FALLOCATE_NONE,
17b94a
+    TEST_FALLOCATE_KEEP_SIZE,
17b94a
+    TEST_FALLOCATE_ZERO_RANGE,
17b94a
+    TEST_FALLOCATE_PUNCH_HOLE,
17b94a
+    TEST_FALLOCATE_MAX,
17b94a
+};
17b94a
+
17b94a
+int
17b94a
+get_fallocate_flag(int opcode)
17b94a
+{
17b94a
+    int ret = 0;
17b94a
+
17b94a
+    switch (opcode) {
17b94a
+        case TEST_FALLOCATE_NONE:
17b94a
+            ret = 0;
17b94a
+            break;
17b94a
+        case TEST_FALLOCATE_KEEP_SIZE:
17b94a
+            ret = FALLOC_FL_KEEP_SIZE;
17b94a
+            break;
17b94a
+        case TEST_FALLOCATE_ZERO_RANGE:
17b94a
+            ret = FALLOC_FL_ZERO_RANGE;
17b94a
+            break;
17b94a
+        case TEST_FALLOCATE_PUNCH_HOLE:
17b94a
+            ret = FALLOC_FL_PUNCH_HOLE;
17b94a
+            break;
17b94a
+        default:
17b94a
+            ret = -1;
17b94a
+            break;
17b94a
+    }
17b94a
+    return ret;
17b94a
+}
17b94a
+
17b94a
+int
17b94a
+main(int argc, char *argv[])
17b94a
+{
17b94a
+    int ret = 1;
17b94a
+    int opcode = -1;
17b94a
+    off_t offset = 0;
17b94a
+    size_t len = 0;
17b94a
+    glfs_t *fs = NULL;
17b94a
+    glfs_fd_t *fd = NULL;
17b94a
+
17b94a
+    if (argc != 8) {
17b94a
+        fprintf(stderr,
17b94a
+                "Syntax: %s <host> <volname> <opcode> <offset> <len> "
17b94a
+                "<file-path> <log-file>\n",
17b94a
+                argv[0]);
17b94a
+        return 1;
17b94a
+    }
17b94a
+
17b94a
+    fs = glfs_new(argv[2]);
17b94a
+    if (!fs) {
17b94a
+        fprintf(stderr, "glfs_new: returned NULL\n");
17b94a
+        return 1;
17b94a
+    }
17b94a
+
17b94a
+    ret = glfs_set_volfile_server(fs, "tcp", argv[1], 24007);
17b94a
+    if (ret != 0) {
17b94a
+        fprintf(stderr, "glfs_set_volfile_server: returned %d\n", ret);
17b94a
+        goto out;
17b94a
+    }
17b94a
+
17b94a
+    ret = glfs_set_logging(fs, argv[7], 7);
17b94a
+    if (ret != 0) {
17b94a
+        fprintf(stderr, "glfs_set_logging: returned %d\n", ret);
17b94a
+        goto out;
17b94a
+    }
17b94a
+
17b94a
+    ret = glfs_init(fs);
17b94a
+    if (ret != 0) {
17b94a
+        fprintf(stderr, "glfs_init: returned %d\n", ret);
17b94a
+        goto out;
17b94a
+    }
17b94a
+
17b94a
+    opcode = atoi(argv[3]);
17b94a
+    opcode = get_fallocate_flag(opcode);
17b94a
+    if (opcode < 0) {
17b94a
+        fprintf(stderr, "get_fallocate_flag: invalid flag \n");
17b94a
+        goto out;
17b94a
+    }
17b94a
+
17b94a
+    offset = atoi(argv[4]);
17b94a
+    len = atoi(argv[5]);
17b94a
+
17b94a
+    fd = glfs_open(fs, argv[6], O_RDWR);
17b94a
+    if (fd == NULL) {
17b94a
+        fprintf(stderr, "glfs_open: returned NULL\n");
17b94a
+        goto out;
17b94a
+    }
17b94a
+
17b94a
+    ret = glfs_fallocate(fd, opcode, offset, len);
17b94a
+    if (ret < 0) {
17b94a
+        fprintf(stderr, "glfs_fallocate: returned %d\n", ret);
17b94a
+        goto out;
17b94a
+    }
17b94a
+
17b94a
+    ret = glfs_unlink(fs, argv[6]);
17b94a
+    if (ret < 0) {
17b94a
+        fprintf(stderr, "glfs_unlink: returned %d\n", ret);
17b94a
+        goto out;
17b94a
+    }
17b94a
+    /* Sleep for 3s to give enough time for background deletion to complete
17b94a
+     * during which if the bug exists, the process will crash.
17b94a
+     */
17b94a
+    sleep(3);
17b94a
+    ret = 0;
17b94a
+
17b94a
+out:
17b94a
+    if (fd)
17b94a
+        glfs_close(fd);
17b94a
+    glfs_fini(fs);
17b94a
+    return ret;
17b94a
+}
17b94a
diff --git a/tests/bugs/shard/bug-1696136.t b/tests/bugs/shard/bug-1696136.t
17b94a
new file mode 100644
17b94a
index 0000000..b6dc858
17b94a
--- /dev/null
17b94a
+++ b/tests/bugs/shard/bug-1696136.t
17b94a
@@ -0,0 +1,33 @@
17b94a
+#!/bin/bash
17b94a
+
17b94a
+. $(dirname $0)/../../include.rc
17b94a
+. $(dirname $0)/../../volume.rc
17b94a
+. $(dirname $0)/../../fallocate.rc
17b94a
+
17b94a
+cleanup
17b94a
+
17b94a
+TEST glusterd
17b94a
+TEST pidof glusterd
17b94a
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
17b94a
+TEST $CLI volume set $V0 features.shard on
17b94a
+TEST $CLI volume set $V0 features.shard-block-size 4MB
17b94a
+TEST $CLI volume set $V0 features.shard-lru-limit 120
17b94a
+TEST $CLI volume set $V0 performance.write-behind off
17b94a
+TEST $CLI volume start $V0
17b94a
+
17b94a
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
17b94a
+
17b94a
+TEST build_tester $(dirname $0)/bug-1696136.c -lgfapi -Wall -O2
17b94a
+
17b94a
+# Create a file
17b94a
+TEST touch $M0/file1
17b94a
+
17b94a
+# Fallocate a 500M file. This will make sure number of participant shards are > lru-limit
17b94a
+TEST $(dirname $0)/bug-1696136 $H0 $V0 "0" "0" "536870912" /file1 `gluster --print-logdir`/glfs-$V0.log
17b94a
+
17b94a
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
17b94a
+TEST $CLI volume stop $V0
17b94a
+TEST $CLI volume delete $V0
17b94a
+rm -f $(dirname $0)/bug-1696136
17b94a
+
17b94a
+cleanup
17b94a
diff --git a/tests/bugs/shard/shard-fallocate.c b/tests/bugs/shard/shard-fallocate.c
17b94a
index 3a784d3..45b9ce0 100644
17b94a
--- a/tests/bugs/shard/shard-fallocate.c
17b94a
+++ b/tests/bugs/shard/shard-fallocate.c
17b94a
@@ -97,7 +97,7 @@ main(int argc, char *argv[])
17b94a
     }
17b94a
 
17b94a
     ret = glfs_fallocate(fd, opcode, offset, len);
17b94a
-    if (ret <= 0) {
17b94a
+    if (ret < 0) {
17b94a
         fprintf(stderr, "glfs_fallocate: returned %d\n", ret);
17b94a
         goto out;
17b94a
     }
17b94a
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
17b94a
index fa3564a..3c4bcdc 100644
17b94a
--- a/xlators/features/shard/src/shard.c
17b94a
+++ b/xlators/features/shard/src/shard.c
17b94a
@@ -2213,13 +2213,19 @@ shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode,
17b94a
     xlator_t *this = NULL;
17b94a
     inode_t *fsync_inode = NULL;
17b94a
     shard_priv_t *priv = NULL;
17b94a
+    inode_t *base_inode = NULL;
17b94a
 
17b94a
     this = THIS;
17b94a
     priv = this->private;
17b94a
-    if (local->loc.inode)
17b94a
+    if (local->loc.inode) {
17b94a
         gf_uuid_copy(gfid, local->loc.inode->gfid);
17b94a
-    else
17b94a
+        base_inode = local->loc.inode;
17b94a
+    } else if (local->resolver_base_inode) {
17b94a
+        gf_uuid_copy(gfid, local->resolver_base_inode->gfid);
17b94a
+        base_inode = local->resolver_base_inode;
17b94a
+    } else {
17b94a
         gf_uuid_copy(gfid, local->base_gfid);
17b94a
+    }
17b94a
 
17b94a
     shard_make_block_bname(block_num, gfid, block_bname, sizeof(block_bname));
17b94a
 
17b94a
@@ -2232,7 +2238,7 @@ shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode,
17b94a
     LOCK(&priv->lock);
17b94a
     {
17b94a
         fsync_inode = __shard_update_shards_inode_list(
17b94a
-            linked_inode, this, local->loc.inode, block_num, gfid);
17b94a
+            linked_inode, this, base_inode, block_num, gfid);
17b94a
     }
17b94a
     UNLOCK(&priv->lock);
17b94a
     if (fsync_inode)
17b94a
-- 
17b94a
1.8.3.1
17b94a