50dc83
From 40ac42501d6bbff7206e753e8e988beefe74f5f4 Mon Sep 17 00:00:00 2001
50dc83
From: Krutika Dhananjay <kdhananj@redhat.com>
50dc83
Date: Fri, 5 Apr 2019 10:30:23 +0530
50dc83
Subject: [PATCH 176/178] features/shard: Fix crash during background shard
50dc83
 deletion in a specific case
50dc83
50dc83
Consider the following case -
50dc83
1. A file gets FALLOCATE'd such that > "shard-lru-limit" number of
50dc83
   shards are created.
50dc83
2. And then it is deleted after that.
50dc83
50dc83
The unique thing about FALLOCATE is that unlike WRITE, all of the
50dc83
participant shards are resolved and created and fallocated in a single
50dc83
batch. This means, in this case, after the first "shard-lru-limit"
50dc83
number of shards are resolved and added to lru list, as part of
50dc83
resolution of the remaining shards, some of the existing shards in lru
50dc83
list will need to be evicted. So these evicted shards will be
50dc83
inode_unlink()d as part of eviction. Now once the fop gets to the actual
50dc83
FALLOCATE stage, the lru'd-out shards get added to fsync list.
50dc83
50dc83
2 things to note at this point:
50dc83
i. the lru'd out shards are only part of fsync list, so each holds 1 ref
50dc83
   on base shard
50dc83
ii. and the more recently used shards are part of both fsync and lru list.
50dc83
    So each of these shards holds 2 refs on base inode  - one for being
50dc83
    part of fsync list, and the other for being part of lru list.
50dc83
50dc83
FALLOCATE completes successfully and then this very file is deleted, and
50dc83
background shard deletion launched. Here's where the ref counts get mismatched.
50dc83
First as part of inode_resolve()s during the deletion, the lru'd-out inodes
50dc83
return NULL, because they are inode_unlink()'d by now. So these inodes need to
50dc83
be freshly looked up. But as part of linking them in lookup_cbk (precisely in
50dc83
shard_link_block_inode()), inode_link() returns the lru'd-out inode object.
50dc83
And its inode ctx is still valid and ctx->base_inode valid from the last
50dc83
time it was added to list.
50dc83
50dc83
But shard_common_lookup_shards_cbk() passes NULL in the place of base_pointer
50dc83
to __shard_update_shards_inode_list(). This means, as part of adding the lru'd out
50dc83
inode back to lru list, base inode is not ref'd since its NULL.
50dc83
50dc83
Whereas post unlinking this shard, during shard_unlink_block_inode(),
50dc83
ctx->base_inode is accessible and is unref'd because the shard was found to be part
50dc83
of LRU list, although the matching ref didn't occur. This at some point leads to
50dc83
base_inode refcount becoming 0 and it getting destroyed and released back while some
50dc83
of its associated shards are continuing to be unlinked in parallel and the client crashes
50dc83
whenever it is accessed next.
50dc83
50dc83
Fix is to pass base shard correctly, if available, in shard_link_block_inode().
50dc83
50dc83
Also, the patch fixes the ret value check in tests/bugs/shard/shard-fallocate.c
50dc83
50dc83
>Change-Id: Ibd0bc4c6952367608e10701473cbad3947d7559f
50dc83
>Updates: bz#1696136
50dc83
>Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
50dc83
50dc83
Upstream Patch: https://review.gluster.org/#/c/glusterfs/+/22507/
50dc83
50dc83
BUG: 1694595
50dc83
Change-Id: Ibd0bc4c6952367608e10701473cbad3947d7559f
50dc83
Signed-off-by: Sunil Kumar Acharya <sheggodu@redhat.com>
50dc83
Reviewed-on: https://code.engineering.redhat.com/gerrit/172856
50dc83
Tested-by: RHGS Build Bot <nigelb@redhat.com>
50dc83
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
50dc83
---
50dc83
 tests/bugs/shard/bug-1696136.c     | 121 +++++++++++++++++++++++++++++++++++++
50dc83
 tests/bugs/shard/bug-1696136.t     |  33 ++++++++++
50dc83
 tests/bugs/shard/shard-fallocate.c |   2 +-
50dc83
 xlators/features/shard/src/shard.c |  12 +++-
50dc83
 4 files changed, 164 insertions(+), 4 deletions(-)
50dc83
 create mode 100644 tests/bugs/shard/bug-1696136.c
50dc83
 create mode 100644 tests/bugs/shard/bug-1696136.t
50dc83
50dc83
diff --git a/tests/bugs/shard/bug-1696136.c b/tests/bugs/shard/bug-1696136.c
50dc83
new file mode 100644
50dc83
index 0000000..b9e8d13
50dc83
--- /dev/null
50dc83
+++ b/tests/bugs/shard/bug-1696136.c
50dc83
@@ -0,0 +1,121 @@
50dc83
+#define _GNU_SOURCE
50dc83
+#include <fcntl.h>
50dc83
+#include <stdio.h>
50dc83
+#include <stdlib.h>
50dc83
+#include <glusterfs/api/glfs.h>
50dc83
+#include <glusterfs/api/glfs-handles.h>
50dc83
+
50dc83
+enum fallocate_flag {
50dc83
+    TEST_FALLOCATE_NONE,
50dc83
+    TEST_FALLOCATE_KEEP_SIZE,
50dc83
+    TEST_FALLOCATE_ZERO_RANGE,
50dc83
+    TEST_FALLOCATE_PUNCH_HOLE,
50dc83
+    TEST_FALLOCATE_MAX,
50dc83
+};
50dc83
+
50dc83
+int
50dc83
+get_fallocate_flag(int opcode)
50dc83
+{
50dc83
+    int ret = 0;
50dc83
+
50dc83
+    switch (opcode) {
50dc83
+        case TEST_FALLOCATE_NONE:
50dc83
+            ret = 0;
50dc83
+            break;
50dc83
+        case TEST_FALLOCATE_KEEP_SIZE:
50dc83
+            ret = FALLOC_FL_KEEP_SIZE;
50dc83
+            break;
50dc83
+        case TEST_FALLOCATE_ZERO_RANGE:
50dc83
+            ret = FALLOC_FL_ZERO_RANGE;
50dc83
+            break;
50dc83
+        case TEST_FALLOCATE_PUNCH_HOLE:
50dc83
+            ret = FALLOC_FL_PUNCH_HOLE;
50dc83
+            break;
50dc83
+        default:
50dc83
+            ret = -1;
50dc83
+            break;
50dc83
+    }
50dc83
+    return ret;
50dc83
+}
50dc83
+
50dc83
+int
50dc83
+main(int argc, char *argv[])
50dc83
+{
50dc83
+    int ret = 1;
50dc83
+    int opcode = -1;
50dc83
+    off_t offset = 0;
50dc83
+    size_t len = 0;
50dc83
+    glfs_t *fs = NULL;
50dc83
+    glfs_fd_t *fd = NULL;
50dc83
+
50dc83
+    if (argc != 8) {
50dc83
+        fprintf(stderr,
50dc83
+                "Syntax: %s <host> <volname> <opcode> <offset> <len> "
50dc83
+                "<file-path> <log-file>\n",
50dc83
+                argv[0]);
50dc83
+        return 1;
50dc83
+    }
50dc83
+
50dc83
+    fs = glfs_new(argv[2]);
50dc83
+    if (!fs) {
50dc83
+        fprintf(stderr, "glfs_new: returned NULL\n");
50dc83
+        return 1;
50dc83
+    }
50dc83
+
50dc83
+    ret = glfs_set_volfile_server(fs, "tcp", argv[1], 24007);
50dc83
+    if (ret != 0) {
50dc83
+        fprintf(stderr, "glfs_set_volfile_server: returned %d\n", ret);
50dc83
+        goto out;
50dc83
+    }
50dc83
+
50dc83
+    ret = glfs_set_logging(fs, argv[7], 7);
50dc83
+    if (ret != 0) {
50dc83
+        fprintf(stderr, "glfs_set_logging: returned %d\n", ret);
50dc83
+        goto out;
50dc83
+    }
50dc83
+
50dc83
+    ret = glfs_init(fs);
50dc83
+    if (ret != 0) {
50dc83
+        fprintf(stderr, "glfs_init: returned %d\n", ret);
50dc83
+        goto out;
50dc83
+    }
50dc83
+
50dc83
+    opcode = atoi(argv[3]);
50dc83
+    opcode = get_fallocate_flag(opcode);
50dc83
+    if (opcode < 0) {
50dc83
+        fprintf(stderr, "get_fallocate_flag: invalid flag \n");
50dc83
+        goto out;
50dc83
+    }
50dc83
+
50dc83
+    offset = atoi(argv[4]);
50dc83
+    len = atoi(argv[5]);
50dc83
+
50dc83
+    fd = glfs_open(fs, argv[6], O_RDWR);
50dc83
+    if (fd == NULL) {
50dc83
+        fprintf(stderr, "glfs_open: returned NULL\n");
50dc83
+        goto out;
50dc83
+    }
50dc83
+
50dc83
+    ret = glfs_fallocate(fd, opcode, offset, len);
50dc83
+    if (ret < 0) {
50dc83
+        fprintf(stderr, "glfs_fallocate: returned %d\n", ret);
50dc83
+        goto out;
50dc83
+    }
50dc83
+
50dc83
+    ret = glfs_unlink(fs, argv[6]);
50dc83
+    if (ret < 0) {
50dc83
+        fprintf(stderr, "glfs_unlink: returned %d\n", ret);
50dc83
+        goto out;
50dc83
+    }
50dc83
+    /* Sleep for 3s to give enough time for background deletion to complete
50dc83
+     * during which if the bug exists, the process will crash.
50dc83
+     */
50dc83
+    sleep(3);
50dc83
+    ret = 0;
50dc83
+
50dc83
+out:
50dc83
+    if (fd)
50dc83
+        glfs_close(fd);
50dc83
+    glfs_fini(fs);
50dc83
+    return ret;
50dc83
+}
50dc83
diff --git a/tests/bugs/shard/bug-1696136.t b/tests/bugs/shard/bug-1696136.t
50dc83
new file mode 100644
50dc83
index 0000000..b6dc858
50dc83
--- /dev/null
50dc83
+++ b/tests/bugs/shard/bug-1696136.t
50dc83
@@ -0,0 +1,33 @@
50dc83
+#!/bin/bash
50dc83
+
50dc83
+. $(dirname $0)/../../include.rc
50dc83
+. $(dirname $0)/../../volume.rc
50dc83
+. $(dirname $0)/../../fallocate.rc
50dc83
+
50dc83
+cleanup
50dc83
+
50dc83
+TEST glusterd
50dc83
+TEST pidof glusterd
50dc83
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
50dc83
+TEST $CLI volume set $V0 features.shard on
50dc83
+TEST $CLI volume set $V0 features.shard-block-size 4MB
50dc83
+TEST $CLI volume set $V0 features.shard-lru-limit 120
50dc83
+TEST $CLI volume set $V0 performance.write-behind off
50dc83
+TEST $CLI volume start $V0
50dc83
+
50dc83
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
50dc83
+
50dc83
+TEST build_tester $(dirname $0)/bug-1696136.c -lgfapi -Wall -O2
50dc83
+
50dc83
+# Create a file
50dc83
+TEST touch $M0/file1
50dc83
+
50dc83
+# Fallocate a 500M file. This will make sure number of participant shards are > lru-limit
50dc83
+TEST $(dirname $0)/bug-1696136 $H0 $V0 "0" "0" "536870912" /file1 `gluster --print-logdir`/glfs-$V0.log
50dc83
+
50dc83
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
50dc83
+TEST $CLI volume stop $V0
50dc83
+TEST $CLI volume delete $V0
50dc83
+rm -f $(dirname $0)/bug-1696136
50dc83
+
50dc83
+cleanup
50dc83
diff --git a/tests/bugs/shard/shard-fallocate.c b/tests/bugs/shard/shard-fallocate.c
50dc83
index 3a784d3..45b9ce0 100644
50dc83
--- a/tests/bugs/shard/shard-fallocate.c
50dc83
+++ b/tests/bugs/shard/shard-fallocate.c
50dc83
@@ -97,7 +97,7 @@ main(int argc, char *argv[])
50dc83
     }
50dc83
 
50dc83
     ret = glfs_fallocate(fd, opcode, offset, len);
50dc83
-    if (ret <= 0) {
50dc83
+    if (ret < 0) {
50dc83
         fprintf(stderr, "glfs_fallocate: returned %d\n", ret);
50dc83
         goto out;
50dc83
     }
50dc83
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
50dc83
index fa3564a..3c4bcdc 100644
50dc83
--- a/xlators/features/shard/src/shard.c
50dc83
+++ b/xlators/features/shard/src/shard.c
50dc83
@@ -2213,13 +2213,19 @@ shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode,
50dc83
     xlator_t *this = NULL;
50dc83
     inode_t *fsync_inode = NULL;
50dc83
     shard_priv_t *priv = NULL;
50dc83
+    inode_t *base_inode = NULL;
50dc83
 
50dc83
     this = THIS;
50dc83
     priv = this->private;
50dc83
-    if (local->loc.inode)
50dc83
+    if (local->loc.inode) {
50dc83
         gf_uuid_copy(gfid, local->loc.inode->gfid);
50dc83
-    else
50dc83
+        base_inode = local->loc.inode;
50dc83
+    } else if (local->resolver_base_inode) {
50dc83
+        gf_uuid_copy(gfid, local->resolver_base_inode->gfid);
50dc83
+        base_inode = local->resolver_base_inode;
50dc83
+    } else {
50dc83
         gf_uuid_copy(gfid, local->base_gfid);
50dc83
+    }
50dc83
 
50dc83
     shard_make_block_bname(block_num, gfid, block_bname, sizeof(block_bname));
50dc83
 
50dc83
@@ -2232,7 +2238,7 @@ shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode,
50dc83
     LOCK(&priv->lock);
50dc83
     {
50dc83
         fsync_inode = __shard_update_shards_inode_list(
50dc83
-            linked_inode, this, local->loc.inode, block_num, gfid);
50dc83
+            linked_inode, this, base_inode, block_num, gfid);
50dc83
     }
50dc83
     UNLOCK(&priv->lock);
50dc83
     if (fsync_inode)
50dc83
-- 
50dc83
1.8.3.1
50dc83