74b1de
From 40ac42501d6bbff7206e753e8e988beefe74f5f4 Mon Sep 17 00:00:00 2001
74b1de
From: Krutika Dhananjay <kdhananj@redhat.com>
74b1de
Date: Fri, 5 Apr 2019 10:30:23 +0530
74b1de
Subject: [PATCH 176/178] features/shard: Fix crash during background shard
74b1de
 deletion in a specific case
74b1de
74b1de
Consider the following case -
74b1de
1. A file gets FALLOCATE'd such that > "shard-lru-limit" number of
74b1de
   shards are created.
74b1de
2. And then it is deleted after that.
74b1de
74b1de
The unique thing about FALLOCATE is that unlike WRITE, all of the
74b1de
participant shards are resolved and created and fallocated in a single
74b1de
batch. This means, in this case, after the first "shard-lru-limit"
74b1de
number of shards are resolved and added to lru list, as part of
74b1de
resolution of the remaining shards, some of the existing shards in lru
74b1de
list will need to be evicted. So these evicted shards will be
74b1de
inode_unlink()d as part of eviction. Now once the fop gets to the actual
74b1de
FALLOCATE stage, the lru'd-out shards get added to fsync list.
74b1de
74b1de
2 things to note at this point:
74b1de
i. the lru'd out shards are only part of fsync list, so each holds 1 ref
74b1de
   on base shard
74b1de
ii. and the more recently used shards are part of both fsync and lru list.
74b1de
    So each of these shards holds 2 refs on base inode  - one for being
74b1de
    part of fsync list, and the other for being part of lru list.
74b1de
74b1de
FALLOCATE completes successfully and then this very file is deleted, and
74b1de
background shard deletion launched. Here's where the ref counts get mismatched.
74b1de
First as part of inode_resolve()s during the deletion, the lru'd-out inodes
74b1de
return NULL, because they are inode_unlink()'d by now. So these inodes need to
74b1de
be freshly looked up. But as part of linking them in lookup_cbk (precisely in
74b1de
shard_link_block_inode()), inode_link() returns the lru'd-out inode object.
74b1de
And its inode ctx is still valid and ctx->base_inode valid from the last
74b1de
time it was added to list.
74b1de
74b1de
But shard_common_lookup_shards_cbk() passes NULL in the place of base_pointer
74b1de
to __shard_update_shards_inode_list(). This means, as part of adding the lru'd out
74b1de
inode back to lru list, base inode is not ref'd since its NULL.
74b1de
74b1de
Whereas post unlinking this shard, during shard_unlink_block_inode(),
74b1de
ctx->base_inode is accessible and is unref'd because the shard was found to be part
74b1de
of LRU list, although the matching ref didn't occur. This at some point leads to
74b1de
base_inode refcount becoming 0 and it getting destroyed and released back while some
74b1de
of its associated shards are continuing to be unlinked in parallel and the client crashes
74b1de
whenever it is accessed next.
74b1de
74b1de
Fix is to pass base shard correctly, if available, in shard_link_block_inode().
74b1de
74b1de
Also, the patch fixes the ret value check in tests/bugs/shard/shard-fallocate.c
74b1de
74b1de
>Change-Id: Ibd0bc4c6952367608e10701473cbad3947d7559f
74b1de
>Updates: bz#1696136
74b1de
>Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
74b1de
74b1de
Upstream Patch: https://review.gluster.org/#/c/glusterfs/+/22507/
74b1de
74b1de
BUG: 1694595
74b1de
Change-Id: Ibd0bc4c6952367608e10701473cbad3947d7559f
74b1de
Signed-off-by: Sunil Kumar Acharya <sheggodu@redhat.com>
74b1de
Reviewed-on: https://code.engineering.redhat.com/gerrit/172856
74b1de
Tested-by: RHGS Build Bot <nigelb@redhat.com>
74b1de
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
74b1de
---
74b1de
 tests/bugs/shard/bug-1696136.c     | 121 +++++++++++++++++++++++++++++++++++++
74b1de
 tests/bugs/shard/bug-1696136.t     |  33 ++++++++++
74b1de
 tests/bugs/shard/shard-fallocate.c |   2 +-
74b1de
 xlators/features/shard/src/shard.c |  12 +++-
74b1de
 4 files changed, 164 insertions(+), 4 deletions(-)
74b1de
 create mode 100644 tests/bugs/shard/bug-1696136.c
74b1de
 create mode 100644 tests/bugs/shard/bug-1696136.t
74b1de
74b1de
diff --git a/tests/bugs/shard/bug-1696136.c b/tests/bugs/shard/bug-1696136.c
74b1de
new file mode 100644
74b1de
index 0000000..b9e8d13
74b1de
--- /dev/null
74b1de
+++ b/tests/bugs/shard/bug-1696136.c
74b1de
@@ -0,0 +1,121 @@
74b1de
+#define _GNU_SOURCE
74b1de
+#include <fcntl.h>
74b1de
+#include <stdio.h>
74b1de
+#include <stdlib.h>
74b1de
+#include <glusterfs/api/glfs.h>
74b1de
+#include <glusterfs/api/glfs-handles.h>
74b1de
+
74b1de
+enum fallocate_flag {
74b1de
+    TEST_FALLOCATE_NONE,
74b1de
+    TEST_FALLOCATE_KEEP_SIZE,
74b1de
+    TEST_FALLOCATE_ZERO_RANGE,
74b1de
+    TEST_FALLOCATE_PUNCH_HOLE,
74b1de
+    TEST_FALLOCATE_MAX,
74b1de
+};
74b1de
+
74b1de
+int
74b1de
+get_fallocate_flag(int opcode)
74b1de
+{
74b1de
+    int ret = 0;
74b1de
+
74b1de
+    switch (opcode) {
74b1de
+        case TEST_FALLOCATE_NONE:
74b1de
+            ret = 0;
74b1de
+            break;
74b1de
+        case TEST_FALLOCATE_KEEP_SIZE:
74b1de
+            ret = FALLOC_FL_KEEP_SIZE;
74b1de
+            break;
74b1de
+        case TEST_FALLOCATE_ZERO_RANGE:
74b1de
+            ret = FALLOC_FL_ZERO_RANGE;
74b1de
+            break;
74b1de
+        case TEST_FALLOCATE_PUNCH_HOLE:
74b1de
+            ret = FALLOC_FL_PUNCH_HOLE;
74b1de
+            break;
74b1de
+        default:
74b1de
+            ret = -1;
74b1de
+            break;
74b1de
+    }
74b1de
+    return ret;
74b1de
+}
74b1de
+
74b1de
+int
74b1de
+main(int argc, char *argv[])
74b1de
+{
74b1de
+    int ret = 1;
74b1de
+    int opcode = -1;
74b1de
+    off_t offset = 0;
74b1de
+    size_t len = 0;
74b1de
+    glfs_t *fs = NULL;
74b1de
+    glfs_fd_t *fd = NULL;
74b1de
+
74b1de
+    if (argc != 8) {
74b1de
+        fprintf(stderr,
74b1de
+                "Syntax: %s <host> <volname> <opcode> <offset> <len> "
74b1de
+                "<file-path> <log-file>\n",
74b1de
+                argv[0]);
74b1de
+        return 1;
74b1de
+    }
74b1de
+
74b1de
+    fs = glfs_new(argv[2]);
74b1de
+    if (!fs) {
74b1de
+        fprintf(stderr, "glfs_new: returned NULL\n");
74b1de
+        return 1;
74b1de
+    }
74b1de
+
74b1de
+    ret = glfs_set_volfile_server(fs, "tcp", argv[1], 24007);
74b1de
+    if (ret != 0) {
74b1de
+        fprintf(stderr, "glfs_set_volfile_server: returned %d\n", ret);
74b1de
+        goto out;
74b1de
+    }
74b1de
+
74b1de
+    ret = glfs_set_logging(fs, argv[7], 7);
74b1de
+    if (ret != 0) {
74b1de
+        fprintf(stderr, "glfs_set_logging: returned %d\n", ret);
74b1de
+        goto out;
74b1de
+    }
74b1de
+
74b1de
+    ret = glfs_init(fs);
74b1de
+    if (ret != 0) {
74b1de
+        fprintf(stderr, "glfs_init: returned %d\n", ret);
74b1de
+        goto out;
74b1de
+    }
74b1de
+
74b1de
+    opcode = atoi(argv[3]);
74b1de
+    opcode = get_fallocate_flag(opcode);
74b1de
+    if (opcode < 0) {
74b1de
+        fprintf(stderr, "get_fallocate_flag: invalid flag \n");
74b1de
+        goto out;
74b1de
+    }
74b1de
+
74b1de
+    offset = atoi(argv[4]);
74b1de
+    len = atoi(argv[5]);
74b1de
+
74b1de
+    fd = glfs_open(fs, argv[6], O_RDWR);
74b1de
+    if (fd == NULL) {
74b1de
+        fprintf(stderr, "glfs_open: returned NULL\n");
74b1de
+        goto out;
74b1de
+    }
74b1de
+
74b1de
+    ret = glfs_fallocate(fd, opcode, offset, len);
74b1de
+    if (ret < 0) {
74b1de
+        fprintf(stderr, "glfs_fallocate: returned %d\n", ret);
74b1de
+        goto out;
74b1de
+    }
74b1de
+
74b1de
+    ret = glfs_unlink(fs, argv[6]);
74b1de
+    if (ret < 0) {
74b1de
+        fprintf(stderr, "glfs_unlink: returned %d\n", ret);
74b1de
+        goto out;
74b1de
+    }
74b1de
+    /* Sleep for 3s to give enough time for background deletion to complete
74b1de
+     * during which if the bug exists, the process will crash.
74b1de
+     */
74b1de
+    sleep(3);
74b1de
+    ret = 0;
74b1de
+
74b1de
+out:
74b1de
+    if (fd)
74b1de
+        glfs_close(fd);
74b1de
+    glfs_fini(fs);
74b1de
+    return ret;
74b1de
+}
74b1de
diff --git a/tests/bugs/shard/bug-1696136.t b/tests/bugs/shard/bug-1696136.t
74b1de
new file mode 100644
74b1de
index 0000000..b6dc858
74b1de
--- /dev/null
74b1de
+++ b/tests/bugs/shard/bug-1696136.t
74b1de
@@ -0,0 +1,33 @@
74b1de
+#!/bin/bash
74b1de
+
74b1de
+. $(dirname $0)/../../include.rc
74b1de
+. $(dirname $0)/../../volume.rc
74b1de
+. $(dirname $0)/../../fallocate.rc
74b1de
+
74b1de
+cleanup
74b1de
+
74b1de
+TEST glusterd
74b1de
+TEST pidof glusterd
74b1de
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
74b1de
+TEST $CLI volume set $V0 features.shard on
74b1de
+TEST $CLI volume set $V0 features.shard-block-size 4MB
74b1de
+TEST $CLI volume set $V0 features.shard-lru-limit 120
74b1de
+TEST $CLI volume set $V0 performance.write-behind off
74b1de
+TEST $CLI volume start $V0
74b1de
+
74b1de
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
74b1de
+
74b1de
+TEST build_tester $(dirname $0)/bug-1696136.c -lgfapi -Wall -O2
74b1de
+
74b1de
+# Create a file
74b1de
+TEST touch $M0/file1
74b1de
+
74b1de
+# Fallocate a 500M file. This will make sure number of participant shards are > lru-limit
74b1de
+TEST $(dirname $0)/bug-1696136 $H0 $V0 "0" "0" "536870912" /file1 `gluster --print-logdir`/glfs-$V0.log
74b1de
+
74b1de
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
74b1de
+TEST $CLI volume stop $V0
74b1de
+TEST $CLI volume delete $V0
74b1de
+rm -f $(dirname $0)/bug-1696136
74b1de
+
74b1de
+cleanup
74b1de
diff --git a/tests/bugs/shard/shard-fallocate.c b/tests/bugs/shard/shard-fallocate.c
74b1de
index 3a784d3..45b9ce0 100644
74b1de
--- a/tests/bugs/shard/shard-fallocate.c
74b1de
+++ b/tests/bugs/shard/shard-fallocate.c
74b1de
@@ -97,7 +97,7 @@ main(int argc, char *argv[])
74b1de
     }
74b1de
 
74b1de
     ret = glfs_fallocate(fd, opcode, offset, len);
74b1de
-    if (ret <= 0) {
74b1de
+    if (ret < 0) {
74b1de
         fprintf(stderr, "glfs_fallocate: returned %d\n", ret);
74b1de
         goto out;
74b1de
     }
74b1de
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
74b1de
index fa3564a..3c4bcdc 100644
74b1de
--- a/xlators/features/shard/src/shard.c
74b1de
+++ b/xlators/features/shard/src/shard.c
74b1de
@@ -2213,13 +2213,19 @@ shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode,
74b1de
     xlator_t *this = NULL;
74b1de
     inode_t *fsync_inode = NULL;
74b1de
     shard_priv_t *priv = NULL;
74b1de
+    inode_t *base_inode = NULL;
74b1de
 
74b1de
     this = THIS;
74b1de
     priv = this->private;
74b1de
-    if (local->loc.inode)
74b1de
+    if (local->loc.inode) {
74b1de
         gf_uuid_copy(gfid, local->loc.inode->gfid);
74b1de
-    else
74b1de
+        base_inode = local->loc.inode;
74b1de
+    } else if (local->resolver_base_inode) {
74b1de
+        gf_uuid_copy(gfid, local->resolver_base_inode->gfid);
74b1de
+        base_inode = local->resolver_base_inode;
74b1de
+    } else {
74b1de
         gf_uuid_copy(gfid, local->base_gfid);
74b1de
+    }
74b1de
 
74b1de
     shard_make_block_bname(block_num, gfid, block_bname, sizeof(block_bname));
74b1de
 
74b1de
@@ -2232,7 +2238,7 @@ shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode,
74b1de
     LOCK(&priv->lock);
74b1de
     {
74b1de
         fsync_inode = __shard_update_shards_inode_list(
74b1de
-            linked_inode, this, local->loc.inode, block_num, gfid);
74b1de
+            linked_inode, this, base_inode, block_num, gfid);
74b1de
     }
74b1de
     UNLOCK(&priv->lock);
74b1de
     if (fsync_inode)
74b1de
-- 
74b1de
1.8.3.1
74b1de