233933
From 40ac42501d6bbff7206e753e8e988beefe74f5f4 Mon Sep 17 00:00:00 2001
233933
From: Krutika Dhananjay <kdhananj@redhat.com>
233933
Date: Fri, 5 Apr 2019 10:30:23 +0530
233933
Subject: [PATCH 176/178] features/shard: Fix crash during background shard
233933
 deletion in a specific case
233933
233933
Consider the following case -
233933
1. A file gets FALLOCATE'd such that > "shard-lru-limit" number of
233933
   shards are created.
233933
2. And then it is deleted after that.
233933
233933
The unique thing about FALLOCATE is that unlike WRITE, all of the
233933
participant shards are resolved and created and fallocated in a single
233933
batch. This means, in this case, after the first "shard-lru-limit"
233933
number of shards are resolved and added to lru list, as part of
233933
resolution of the remaining shards, some of the existing shards in lru
233933
list will need to be evicted. So these evicted shards will be
233933
inode_unlink()d as part of eviction. Now once the fop gets to the actual
233933
FALLOCATE stage, the lru'd-out shards get added to fsync list.
233933
233933
2 things to note at this point:
233933
i. the lru'd out shards are only part of fsync list, so each holds 1 ref
233933
   on base shard
233933
ii. and the more recently used shards are part of both fsync and lru list.
233933
    So each of these shards holds 2 refs on base inode  - one for being
233933
    part of fsync list, and the other for being part of lru list.
233933
233933
FALLOCATE completes successfully and then this very file is deleted, and
233933
background shard deletion launched. Here's where the ref counts get mismatched.
233933
First as part of inode_resolve()s during the deletion, the lru'd-out inodes
233933
return NULL, because they are inode_unlink()'d by now. So these inodes need to
233933
be freshly looked up. But as part of linking them in lookup_cbk (precisely in
233933
shard_link_block_inode()), inode_link() returns the lru'd-out inode object.
233933
And its inode ctx is still valid and ctx->base_inode valid from the last
233933
time it was added to list.
233933
233933
But shard_common_lookup_shards_cbk() passes NULL in the place of base_pointer
233933
to __shard_update_shards_inode_list(). This means, as part of adding the lru'd out
233933
inode back to lru list, base inode is not ref'd since its NULL.
233933
233933
Whereas post unlinking this shard, during shard_unlink_block_inode(),
233933
ctx->base_inode is accessible and is unref'd because the shard was found to be part
233933
of LRU list, although the matching ref didn't occur. This at some point leads to
233933
base_inode refcount becoming 0 and it getting destroyed and released back while some
233933
of its associated shards are continuing to be unlinked in parallel and the client crashes
233933
whenever it is accessed next.
233933
233933
Fix is to pass base shard correctly, if available, in shard_link_block_inode().
233933
233933
Also, the patch fixes the ret value check in tests/bugs/shard/shard-fallocate.c
233933
233933
>Change-Id: Ibd0bc4c6952367608e10701473cbad3947d7559f
233933
>Updates: bz#1696136
233933
>Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
233933
233933
Upstream Patch: https://review.gluster.org/#/c/glusterfs/+/22507/
233933
233933
BUG: 1694595
233933
Change-Id: Ibd0bc4c6952367608e10701473cbad3947d7559f
233933
Signed-off-by: Sunil Kumar Acharya <sheggodu@redhat.com>
233933
Reviewed-on: https://code.engineering.redhat.com/gerrit/172856
233933
Tested-by: RHGS Build Bot <nigelb@redhat.com>
233933
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
233933
---
233933
 tests/bugs/shard/bug-1696136.c     | 121 +++++++++++++++++++++++++++++++++++++
233933
 tests/bugs/shard/bug-1696136.t     |  33 ++++++++++
233933
 tests/bugs/shard/shard-fallocate.c |   2 +-
233933
 xlators/features/shard/src/shard.c |  12 +++-
233933
 4 files changed, 164 insertions(+), 4 deletions(-)
233933
 create mode 100644 tests/bugs/shard/bug-1696136.c
233933
 create mode 100644 tests/bugs/shard/bug-1696136.t
233933
233933
diff --git a/tests/bugs/shard/bug-1696136.c b/tests/bugs/shard/bug-1696136.c
233933
new file mode 100644
233933
index 0000000..b9e8d13
233933
--- /dev/null
233933
+++ b/tests/bugs/shard/bug-1696136.c
233933
@@ -0,0 +1,121 @@
233933
+#define _GNU_SOURCE
233933
+#include <fcntl.h>
233933
+#include <stdio.h>
233933
+#include <stdlib.h>
233933
+#include <glusterfs/api/glfs.h>
233933
+#include <glusterfs/api/glfs-handles.h>
233933
+
233933
+enum fallocate_flag {
233933
+    TEST_FALLOCATE_NONE,
233933
+    TEST_FALLOCATE_KEEP_SIZE,
233933
+    TEST_FALLOCATE_ZERO_RANGE,
233933
+    TEST_FALLOCATE_PUNCH_HOLE,
233933
+    TEST_FALLOCATE_MAX,
233933
+};
233933
+
233933
+int
233933
+get_fallocate_flag(int opcode)
233933
+{
233933
+    int ret = 0;
233933
+
233933
+    switch (opcode) {
233933
+        case TEST_FALLOCATE_NONE:
233933
+            ret = 0;
233933
+            break;
233933
+        case TEST_FALLOCATE_KEEP_SIZE:
233933
+            ret = FALLOC_FL_KEEP_SIZE;
233933
+            break;
233933
+        case TEST_FALLOCATE_ZERO_RANGE:
233933
+            ret = FALLOC_FL_ZERO_RANGE;
233933
+            break;
233933
+        case TEST_FALLOCATE_PUNCH_HOLE:
233933
+            ret = FALLOC_FL_PUNCH_HOLE;
233933
+            break;
233933
+        default:
233933
+            ret = -1;
233933
+            break;
233933
+    }
233933
+    return ret;
233933
+}
233933
+
233933
+int
233933
+main(int argc, char *argv[])
233933
+{
233933
+    int ret = 1;
233933
+    int opcode = -1;
233933
+    off_t offset = 0;
233933
+    size_t len = 0;
233933
+    glfs_t *fs = NULL;
233933
+    glfs_fd_t *fd = NULL;
233933
+
233933
+    if (argc != 8) {
233933
+        fprintf(stderr,
233933
+                "Syntax: %s <host> <volname> <opcode> <offset> <len> "
233933
+                "<file-path> <log-file>\n",
233933
+                argv[0]);
233933
+        return 1;
233933
+    }
233933
+
233933
+    fs = glfs_new(argv[2]);
233933
+    if (!fs) {
233933
+        fprintf(stderr, "glfs_new: returned NULL\n");
233933
+        return 1;
233933
+    }
233933
+
233933
+    ret = glfs_set_volfile_server(fs, "tcp", argv[1], 24007);
233933
+    if (ret != 0) {
233933
+        fprintf(stderr, "glfs_set_volfile_server: returned %d\n", ret);
233933
+        goto out;
233933
+    }
233933
+
233933
+    ret = glfs_set_logging(fs, argv[7], 7);
233933
+    if (ret != 0) {
233933
+        fprintf(stderr, "glfs_set_logging: returned %d\n", ret);
233933
+        goto out;
233933
+    }
233933
+
233933
+    ret = glfs_init(fs);
233933
+    if (ret != 0) {
233933
+        fprintf(stderr, "glfs_init: returned %d\n", ret);
233933
+        goto out;
233933
+    }
233933
+
233933
+    opcode = atoi(argv[3]);
233933
+    opcode = get_fallocate_flag(opcode);
233933
+    if (opcode < 0) {
233933
+        fprintf(stderr, "get_fallocate_flag: invalid flag \n");
233933
+        goto out;
233933
+    }
233933
+
233933
+    offset = atoi(argv[4]);
233933
+    len = atoi(argv[5]);
233933
+
233933
+    fd = glfs_open(fs, argv[6], O_RDWR);
233933
+    if (fd == NULL) {
233933
+        fprintf(stderr, "glfs_open: returned NULL\n");
233933
+        goto out;
233933
+    }
233933
+
233933
+    ret = glfs_fallocate(fd, opcode, offset, len);
233933
+    if (ret < 0) {
233933
+        fprintf(stderr, "glfs_fallocate: returned %d\n", ret);
233933
+        goto out;
233933
+    }
233933
+
233933
+    ret = glfs_unlink(fs, argv[6]);
233933
+    if (ret < 0) {
233933
+        fprintf(stderr, "glfs_unlink: returned %d\n", ret);
233933
+        goto out;
233933
+    }
233933
+    /* Sleep for 3s to give enough time for background deletion to complete
233933
+     * during which if the bug exists, the process will crash.
233933
+     */
233933
+    sleep(3);
233933
+    ret = 0;
233933
+
233933
+out:
233933
+    if (fd)
233933
+        glfs_close(fd);
233933
+    glfs_fini(fs);
233933
+    return ret;
233933
+}
233933
diff --git a/tests/bugs/shard/bug-1696136.t b/tests/bugs/shard/bug-1696136.t
233933
new file mode 100644
233933
index 0000000..b6dc858
233933
--- /dev/null
233933
+++ b/tests/bugs/shard/bug-1696136.t
233933
@@ -0,0 +1,33 @@
233933
+#!/bin/bash
233933
+
233933
+. $(dirname $0)/../../include.rc
233933
+. $(dirname $0)/../../volume.rc
233933
+. $(dirname $0)/../../fallocate.rc
233933
+
233933
+cleanup
233933
+
233933
+TEST glusterd
233933
+TEST pidof glusterd
233933
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
233933
+TEST $CLI volume set $V0 features.shard on
233933
+TEST $CLI volume set $V0 features.shard-block-size 4MB
233933
+TEST $CLI volume set $V0 features.shard-lru-limit 120
233933
+TEST $CLI volume set $V0 performance.write-behind off
233933
+TEST $CLI volume start $V0
233933
+
233933
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
233933
+
233933
+TEST build_tester $(dirname $0)/bug-1696136.c -lgfapi -Wall -O2
233933
+
233933
+# Create a file
233933
+TEST touch $M0/file1
233933
+
233933
+# Fallocate a 500M file. This will make sure number of participant shards are > lru-limit
233933
+TEST $(dirname $0)/bug-1696136 $H0 $V0 "0" "0" "536870912" /file1 `gluster --print-logdir`/glfs-$V0.log
233933
+
233933
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
233933
+TEST $CLI volume stop $V0
233933
+TEST $CLI volume delete $V0
233933
+rm -f $(dirname $0)/bug-1696136
233933
+
233933
+cleanup
233933
diff --git a/tests/bugs/shard/shard-fallocate.c b/tests/bugs/shard/shard-fallocate.c
233933
index 3a784d3..45b9ce0 100644
233933
--- a/tests/bugs/shard/shard-fallocate.c
233933
+++ b/tests/bugs/shard/shard-fallocate.c
233933
@@ -97,7 +97,7 @@ main(int argc, char *argv[])
233933
     }
233933
 
233933
     ret = glfs_fallocate(fd, opcode, offset, len);
233933
-    if (ret <= 0) {
233933
+    if (ret < 0) {
233933
         fprintf(stderr, "glfs_fallocate: returned %d\n", ret);
233933
         goto out;
233933
     }
233933
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
233933
index fa3564a..3c4bcdc 100644
233933
--- a/xlators/features/shard/src/shard.c
233933
+++ b/xlators/features/shard/src/shard.c
233933
@@ -2213,13 +2213,19 @@ shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode,
233933
     xlator_t *this = NULL;
233933
     inode_t *fsync_inode = NULL;
233933
     shard_priv_t *priv = NULL;
233933
+    inode_t *base_inode = NULL;
233933
 
233933
     this = THIS;
233933
     priv = this->private;
233933
-    if (local->loc.inode)
233933
+    if (local->loc.inode) {
233933
         gf_uuid_copy(gfid, local->loc.inode->gfid);
233933
-    else
233933
+        base_inode = local->loc.inode;
233933
+    } else if (local->resolver_base_inode) {
233933
+        gf_uuid_copy(gfid, local->resolver_base_inode->gfid);
233933
+        base_inode = local->resolver_base_inode;
233933
+    } else {
233933
         gf_uuid_copy(gfid, local->base_gfid);
233933
+    }
233933
 
233933
     shard_make_block_bname(block_num, gfid, block_bname, sizeof(block_bname));
233933
 
233933
@@ -2232,7 +2238,7 @@ shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode,
233933
     LOCK(&priv->lock);
233933
     {
233933
         fsync_inode = __shard_update_shards_inode_list(
233933
-            linked_inode, this, local->loc.inode, block_num, gfid);
233933
+            linked_inode, this, base_inode, block_num, gfid);
233933
     }
233933
     UNLOCK(&priv->lock);
233933
     if (fsync_inode)
233933
-- 
233933
1.8.3.1
233933