14f8ab
From 40ac42501d6bbff7206e753e8e988beefe74f5f4 Mon Sep 17 00:00:00 2001
14f8ab
From: Krutika Dhananjay <kdhananj@redhat.com>
14f8ab
Date: Fri, 5 Apr 2019 10:30:23 +0530
14f8ab
Subject: [PATCH 176/178] features/shard: Fix crash during background shard
14f8ab
 deletion in a specific case
14f8ab
14f8ab
Consider the following case -
14f8ab
1. A file gets FALLOCATE'd such that > "shard-lru-limit" number of
14f8ab
   shards are created.
14f8ab
2. And then it is deleted after that.
14f8ab
14f8ab
The unique thing about FALLOCATE is that unlike WRITE, all of the
14f8ab
participant shards are resolved and created and fallocated in a single
14f8ab
batch. This means, in this case, after the first "shard-lru-limit"
14f8ab
number of shards are resolved and added to lru list, as part of
14f8ab
resolution of the remaining shards, some of the existing shards in lru
14f8ab
list will need to be evicted. So these evicted shards will be
14f8ab
inode_unlink()d as part of eviction. Now once the fop gets to the actual
14f8ab
FALLOCATE stage, the lru'd-out shards get added to fsync list.
14f8ab
14f8ab
2 things to note at this point:
14f8ab
i. the lru'd out shards are only part of fsync list, so each holds 1 ref
14f8ab
   on base shard
14f8ab
ii. and the more recently used shards are part of both fsync and lru list.
14f8ab
    So each of these shards holds 2 refs on base inode  - one for being
14f8ab
    part of fsync list, and the other for being part of lru list.
14f8ab
14f8ab
FALLOCATE completes successfully and then this very file is deleted, and
14f8ab
background shard deletion launched. Here's where the ref counts get mismatched.
14f8ab
First as part of inode_resolve()s during the deletion, the lru'd-out inodes
14f8ab
return NULL, because they are inode_unlink()'d by now. So these inodes need to
14f8ab
be freshly looked up. But as part of linking them in lookup_cbk (precisely in
14f8ab
shard_link_block_inode()), inode_link() returns the lru'd-out inode object.
14f8ab
And its inode ctx is still valid and ctx->base_inode valid from the last
14f8ab
time it was added to list.
14f8ab
14f8ab
But shard_common_lookup_shards_cbk() passes NULL in the place of base_pointer
14f8ab
to __shard_update_shards_inode_list(). This means, as part of adding the lru'd out
14f8ab
inode back to lru list, base inode is not ref'd since its NULL.
14f8ab
14f8ab
Whereas post unlinking this shard, during shard_unlink_block_inode(),
14f8ab
ctx->base_inode is accessible and is unref'd because the shard was found to be part
14f8ab
of LRU list, although the matching ref didn't occur. This at some point leads to
14f8ab
base_inode refcount becoming 0 and it getting destroyed and released back while some
14f8ab
of its associated shards are continuing to be unlinked in parallel and the client crashes
14f8ab
whenever it is accessed next.
14f8ab
14f8ab
Fix is to pass base shard correctly, if available, in shard_link_block_inode().
14f8ab
14f8ab
Also, the patch fixes the ret value check in tests/bugs/shard/shard-fallocate.c
14f8ab
14f8ab
>Change-Id: Ibd0bc4c6952367608e10701473cbad3947d7559f
14f8ab
>Updates: bz#1696136
14f8ab
>Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
14f8ab
14f8ab
Upstream Patch: https://review.gluster.org/#/c/glusterfs/+/22507/
14f8ab
14f8ab
BUG: 1694595
14f8ab
Change-Id: Ibd0bc4c6952367608e10701473cbad3947d7559f
14f8ab
Signed-off-by: Sunil Kumar Acharya <sheggodu@redhat.com>
14f8ab
Reviewed-on: https://code.engineering.redhat.com/gerrit/172856
14f8ab
Tested-by: RHGS Build Bot <nigelb@redhat.com>
14f8ab
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
14f8ab
---
14f8ab
 tests/bugs/shard/bug-1696136.c     | 121 +++++++++++++++++++++++++++++++++++++
14f8ab
 tests/bugs/shard/bug-1696136.t     |  33 ++++++++++
14f8ab
 tests/bugs/shard/shard-fallocate.c |   2 +-
14f8ab
 xlators/features/shard/src/shard.c |  12 +++-
14f8ab
 4 files changed, 164 insertions(+), 4 deletions(-)
14f8ab
 create mode 100644 tests/bugs/shard/bug-1696136.c
14f8ab
 create mode 100644 tests/bugs/shard/bug-1696136.t
14f8ab
14f8ab
diff --git a/tests/bugs/shard/bug-1696136.c b/tests/bugs/shard/bug-1696136.c
14f8ab
new file mode 100644
14f8ab
index 0000000..b9e8d13
14f8ab
--- /dev/null
14f8ab
+++ b/tests/bugs/shard/bug-1696136.c
14f8ab
@@ -0,0 +1,121 @@
14f8ab
+#define _GNU_SOURCE
14f8ab
+#include <fcntl.h>
14f8ab
+#include <stdio.h>
14f8ab
+#include <stdlib.h>
14f8ab
+#include <glusterfs/api/glfs.h>
14f8ab
+#include <glusterfs/api/glfs-handles.h>
14f8ab
+
14f8ab
+enum fallocate_flag {
14f8ab
+    TEST_FALLOCATE_NONE,
14f8ab
+    TEST_FALLOCATE_KEEP_SIZE,
14f8ab
+    TEST_FALLOCATE_ZERO_RANGE,
14f8ab
+    TEST_FALLOCATE_PUNCH_HOLE,
14f8ab
+    TEST_FALLOCATE_MAX,
14f8ab
+};
14f8ab
+
14f8ab
+int
14f8ab
+get_fallocate_flag(int opcode)
14f8ab
+{
14f8ab
+    int ret = 0;
14f8ab
+
14f8ab
+    switch (opcode) {
14f8ab
+        case TEST_FALLOCATE_NONE:
14f8ab
+            ret = 0;
14f8ab
+            break;
14f8ab
+        case TEST_FALLOCATE_KEEP_SIZE:
14f8ab
+            ret = FALLOC_FL_KEEP_SIZE;
14f8ab
+            break;
14f8ab
+        case TEST_FALLOCATE_ZERO_RANGE:
14f8ab
+            ret = FALLOC_FL_ZERO_RANGE;
14f8ab
+            break;
14f8ab
+        case TEST_FALLOCATE_PUNCH_HOLE:
14f8ab
+            ret = FALLOC_FL_PUNCH_HOLE;
14f8ab
+            break;
14f8ab
+        default:
14f8ab
+            ret = -1;
14f8ab
+            break;
14f8ab
+    }
14f8ab
+    return ret;
14f8ab
+}
14f8ab
+
14f8ab
+int
14f8ab
+main(int argc, char *argv[])
14f8ab
+{
14f8ab
+    int ret = 1;
14f8ab
+    int opcode = -1;
14f8ab
+    off_t offset = 0;
14f8ab
+    size_t len = 0;
14f8ab
+    glfs_t *fs = NULL;
14f8ab
+    glfs_fd_t *fd = NULL;
14f8ab
+
14f8ab
+    if (argc != 8) {
14f8ab
+        fprintf(stderr,
14f8ab
+                "Syntax: %s <host> <volname> <opcode> <offset> <len> "
14f8ab
+                "<file-path> <log-file>\n",
14f8ab
+                argv[0]);
14f8ab
+        return 1;
14f8ab
+    }
14f8ab
+
14f8ab
+    fs = glfs_new(argv[2]);
14f8ab
+    if (!fs) {
14f8ab
+        fprintf(stderr, "glfs_new: returned NULL\n");
14f8ab
+        return 1;
14f8ab
+    }
14f8ab
+
14f8ab
+    ret = glfs_set_volfile_server(fs, "tcp", argv[1], 24007);
14f8ab
+    if (ret != 0) {
14f8ab
+        fprintf(stderr, "glfs_set_volfile_server: returned %d\n", ret);
14f8ab
+        goto out;
14f8ab
+    }
14f8ab
+
14f8ab
+    ret = glfs_set_logging(fs, argv[7], 7);
14f8ab
+    if (ret != 0) {
14f8ab
+        fprintf(stderr, "glfs_set_logging: returned %d\n", ret);
14f8ab
+        goto out;
14f8ab
+    }
14f8ab
+
14f8ab
+    ret = glfs_init(fs);
14f8ab
+    if (ret != 0) {
14f8ab
+        fprintf(stderr, "glfs_init: returned %d\n", ret);
14f8ab
+        goto out;
14f8ab
+    }
14f8ab
+
14f8ab
+    opcode = atoi(argv[3]);
14f8ab
+    opcode = get_fallocate_flag(opcode);
14f8ab
+    if (opcode < 0) {
14f8ab
+        fprintf(stderr, "get_fallocate_flag: invalid flag \n");
14f8ab
+        goto out;
14f8ab
+    }
14f8ab
+
14f8ab
+    offset = atoi(argv[4]);
14f8ab
+    len = atoi(argv[5]);
14f8ab
+
14f8ab
+    fd = glfs_open(fs, argv[6], O_RDWR);
14f8ab
+    if (fd == NULL) {
14f8ab
+        fprintf(stderr, "glfs_open: returned NULL\n");
14f8ab
+        goto out;
14f8ab
+    }
14f8ab
+
14f8ab
+    ret = glfs_fallocate(fd, opcode, offset, len);
14f8ab
+    if (ret < 0) {
14f8ab
+        fprintf(stderr, "glfs_fallocate: returned %d\n", ret);
14f8ab
+        goto out;
14f8ab
+    }
14f8ab
+
14f8ab
+    ret = glfs_unlink(fs, argv[6]);
14f8ab
+    if (ret < 0) {
14f8ab
+        fprintf(stderr, "glfs_unlink: returned %d\n", ret);
14f8ab
+        goto out;
14f8ab
+    }
14f8ab
+    /* Sleep for 3s to give enough time for background deletion to complete
14f8ab
+     * during which if the bug exists, the process will crash.
14f8ab
+     */
14f8ab
+    sleep(3);
14f8ab
+    ret = 0;
14f8ab
+
14f8ab
+out:
14f8ab
+    if (fd)
14f8ab
+        glfs_close(fd);
14f8ab
+    glfs_fini(fs);
14f8ab
+    return ret;
14f8ab
+}
14f8ab
diff --git a/tests/bugs/shard/bug-1696136.t b/tests/bugs/shard/bug-1696136.t
14f8ab
new file mode 100644
14f8ab
index 0000000..b6dc858
14f8ab
--- /dev/null
14f8ab
+++ b/tests/bugs/shard/bug-1696136.t
14f8ab
@@ -0,0 +1,33 @@
14f8ab
+#!/bin/bash
14f8ab
+
14f8ab
+. $(dirname $0)/../../include.rc
14f8ab
+. $(dirname $0)/../../volume.rc
14f8ab
+. $(dirname $0)/../../fallocate.rc
14f8ab
+
14f8ab
+cleanup
14f8ab
+
14f8ab
+TEST glusterd
14f8ab
+TEST pidof glusterd
14f8ab
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
14f8ab
+TEST $CLI volume set $V0 features.shard on
14f8ab
+TEST $CLI volume set $V0 features.shard-block-size 4MB
14f8ab
+TEST $CLI volume set $V0 features.shard-lru-limit 120
14f8ab
+TEST $CLI volume set $V0 performance.write-behind off
14f8ab
+TEST $CLI volume start $V0
14f8ab
+
14f8ab
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
14f8ab
+
14f8ab
+TEST build_tester $(dirname $0)/bug-1696136.c -lgfapi -Wall -O2
14f8ab
+
14f8ab
+# Create a file
14f8ab
+TEST touch $M0/file1
14f8ab
+
14f8ab
+# Fallocate a 500M file. This will make sure number of participant shards are > lru-limit
14f8ab
+TEST $(dirname $0)/bug-1696136 $H0 $V0 "0" "0" "536870912" /file1 `gluster --print-logdir`/glfs-$V0.log
14f8ab
+
14f8ab
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
14f8ab
+TEST $CLI volume stop $V0
14f8ab
+TEST $CLI volume delete $V0
14f8ab
+rm -f $(dirname $0)/bug-1696136
14f8ab
+
14f8ab
+cleanup
14f8ab
diff --git a/tests/bugs/shard/shard-fallocate.c b/tests/bugs/shard/shard-fallocate.c
14f8ab
index 3a784d3..45b9ce0 100644
14f8ab
--- a/tests/bugs/shard/shard-fallocate.c
14f8ab
+++ b/tests/bugs/shard/shard-fallocate.c
14f8ab
@@ -97,7 +97,7 @@ main(int argc, char *argv[])
14f8ab
     }
14f8ab
 
14f8ab
     ret = glfs_fallocate(fd, opcode, offset, len);
14f8ab
-    if (ret <= 0) {
14f8ab
+    if (ret < 0) {
14f8ab
         fprintf(stderr, "glfs_fallocate: returned %d\n", ret);
14f8ab
         goto out;
14f8ab
     }
14f8ab
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
14f8ab
index fa3564a..3c4bcdc 100644
14f8ab
--- a/xlators/features/shard/src/shard.c
14f8ab
+++ b/xlators/features/shard/src/shard.c
14f8ab
@@ -2213,13 +2213,19 @@ shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode,
14f8ab
     xlator_t *this = NULL;
14f8ab
     inode_t *fsync_inode = NULL;
14f8ab
     shard_priv_t *priv = NULL;
14f8ab
+    inode_t *base_inode = NULL;
14f8ab
 
14f8ab
     this = THIS;
14f8ab
     priv = this->private;
14f8ab
-    if (local->loc.inode)
14f8ab
+    if (local->loc.inode) {
14f8ab
         gf_uuid_copy(gfid, local->loc.inode->gfid);
14f8ab
-    else
14f8ab
+        base_inode = local->loc.inode;
14f8ab
+    } else if (local->resolver_base_inode) {
14f8ab
+        gf_uuid_copy(gfid, local->resolver_base_inode->gfid);
14f8ab
+        base_inode = local->resolver_base_inode;
14f8ab
+    } else {
14f8ab
         gf_uuid_copy(gfid, local->base_gfid);
14f8ab
+    }
14f8ab
 
14f8ab
     shard_make_block_bname(block_num, gfid, block_bname, sizeof(block_bname));
14f8ab
 
14f8ab
@@ -2232,7 +2238,7 @@ shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode,
14f8ab
     LOCK(&priv->lock);
14f8ab
     {
14f8ab
         fsync_inode = __shard_update_shards_inode_list(
14f8ab
-            linked_inode, this, local->loc.inode, block_num, gfid);
14f8ab
+            linked_inode, this, base_inode, block_num, gfid);
14f8ab
     }
14f8ab
     UNLOCK(&priv->lock);
14f8ab
     if (fsync_inode)
14f8ab
-- 
14f8ab
1.8.3.1
14f8ab