From 72ce80749fca03ab97a63af79d4e6bc76a49ab64 Mon Sep 17 00:00:00 2001
From: Krutika Dhananjay <kdhananj@redhat.com>
Date: Fri, 5 Oct 2018 11:32:21 +0530
Subject: [PATCH 443/444] features/shard: Hold a ref on base inode when adding
 a shard to lru list

    > Upstream: https://review.gluster.org/21454
    > BUG: 1605056
    > Change-Id: Ic15ca41444dd04684a9458bd4a526b1d3e160499

In __shard_update_shards_inode_list(), the shard translator was
previously not holding a ref on the base inode whenever a shard was
added to the lru list. If the base inode was then forgotten and
destroyed, either by fuse due to memory pressure or because the file
was deleted by a different client while this client still had stale
shards in its lru list, the client would crash while locking
lru_base_inode->lock owing to illegal memory access.

So now a ref is held on the base inode in the inode ctx of every shard
that is added to the lru list, and released when the shard is lru'd
out.
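
In essence (a minimal sketch of the new pattern, simplified from the
shard.c hunks below; locking and error handling omitted):

        /* On adding a shard inode to the lru list, pin the base inode
         * so a later eviction can safely take lru_base_inode->lock: */
        ctx->block_num = block_num;
        if (base_inode)
                ctx->base_inode = inode_ref (base_inode);
        list_add_tail (&ctx->ilist, &priv->ilist_head);

        /* On evicting the shard from the lru list, drop that ref: */
        if (lru_base_inode)
                inode_unref (lru_base_inode);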

The patch also handles the case where none of the shards of a file
that is about to be deleted are in the lru list. Here, the unlink at
the beginning of the operation destroys the base inode (since there
are no refkeepers), so all of the shards about to be deleted are
resolved without a base inode in memory. If not handled properly, this
too could lead to a crash.
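
To make that possible, callers now pass the base file's gfid down to
__shard_update_shards_inode_list(), so a shard's ctx records the gfid
even when no base inode exists in memory (again a simplified sketch of
the hunks below):

        if (base_inode)
                gf_uuid_copy (ctx->base_gfid, base_inode->gfid);
        else
                gf_uuid_copy (ctx->base_gfid, gfid); /* passed in by caller */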

Change-Id: Ic15ca41444dd04684a9458bd4a526b1d3e160499
BUG: 1603118
Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/155318
Reviewed-by: Xavi Hernandez <xhernandez@redhat.com>
Tested-by: RHGS Build Bot <nigelb@redhat.com>
---
 tests/bugs/shard/bug-1605056-2.t             | 34 +++++++++++++++
 tests/bugs/shard/bug-1605056.t               | 63 ++++++++++++++++++++++++++++
 tests/bugs/shard/shard-inode-refcount-test.t |  2 +-
 tests/volume.rc                              | 12 ++++--
 xlators/features/shard/src/shard.c           | 48 +++++++++++++++------
 5 files changed, 141 insertions(+), 18 deletions(-)
 create mode 100644 tests/bugs/shard/bug-1605056-2.t
 create mode 100644 tests/bugs/shard/bug-1605056.t

diff --git a/tests/bugs/shard/bug-1605056-2.t b/tests/bugs/shard/bug-1605056-2.t
new file mode 100644
index 0000000..a9c10fe
--- /dev/null
+++ b/tests/bugs/shard/bug-1605056-2.t
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 features.shard on
+TEST $CLI volume set $V0 features.shard-block-size 4MB
+TEST $CLI volume set $V0 features.shard-lru-limit 25
+TEST $CLI volume set $V0 performance.write-behind off
+
+TEST $CLI volume start $V0
+
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
+
+# Perform a write that would cause 25 shards to be created under .shard
+TEST dd if=/dev/zero of=$M0/foo bs=1M count=104
+
+# Write into another file bar to ensure all of foo's shards are evicted from lru list of $M0
+TEST dd if=/dev/zero of=$M0/bar bs=1M count=104
+
+# Delete foo from $M0. If there's a bug, the mount will crash.
+TEST unlink $M0/foo
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+
+TEST $CLI volume stop $V0
+TEST $CLI volume delete $V0
+
+cleanup
diff --git a/tests/bugs/shard/bug-1605056.t b/tests/bugs/shard/bug-1605056.t
new file mode 100644
index 0000000..c2329ea
--- /dev/null
+++ b/tests/bugs/shard/bug-1605056.t
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+SHARD_COUNT_TIME=5
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 features.shard on
+TEST $CLI volume set $V0 features.shard-block-size 4MB
+TEST $CLI volume set $V0 features.shard-lru-limit 25
+TEST $CLI volume set $V0 performance.write-behind off
+
+TEST $CLI volume start $V0
+
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M1
+
+# Perform a write that would cause 25 shards to be created under .shard
+TEST dd if=/dev/zero of=$M0/foo bs=1M count=104
+
+# Read the file from $M1, indirectly filling up the lru list.
+TEST `cat $M1/foo > /dev/null`
+statedump=$(generate_mount_statedump $V0 $M1)
+sleep 1
+EXPECT "25" echo $(grep "inode-count" $statedump | cut -f2 -d'=' | tail -1)
+rm -f $statedump
+
+# Delete foo from $M0.
+TEST unlink $M0/foo
+
+# Send stat on foo from $M1 to force $M1 to "forget" inode associated with foo.
+# Now the ghost shards associated with "foo" are still in lru list of $M1.
+TEST ! stat $M1/foo
+
+# Let's force the ghost shards of "foo" out of lru list by looking up more shards
+# through I/O on a file named "bar" from $M1. This should crash if the base inode
+# had been destroyed by now.
+
+TEST dd if=/dev/zero of=$M1/bar bs=1M count=104
+
+###############################################
+#### Now for some inode ref-leak tests ... ####
+###############################################
+
+# Expect there to be 29 active inodes - 26 belonging to "bar", 1 for .shard,
+# 1 for .shard/remove_me and 1 for '/'
+EXPECT_WITHIN $SHARD_COUNT_TIME `expr 26 + 3` get_mount_active_size_value $V0 $M1
+
+TEST rm -f $M1/bar
+EXPECT_WITHIN $SHARD_COUNT_TIME 3 get_mount_active_size_value $V0 $M1
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M1
+
+TEST $CLI volume stop $V0
+TEST $CLI volume delete $V0
+
+cleanup
diff --git a/tests/bugs/shard/shard-inode-refcount-test.t b/tests/bugs/shard/shard-inode-refcount-test.t
index 087c8ba..3fd181b 100644
--- a/tests/bugs/shard/shard-inode-refcount-test.t
+++ b/tests/bugs/shard/shard-inode-refcount-test.t
@@ -21,7 +21,7 @@ TEST dd if=/dev/zero conv=fsync of=$M0/one-plus-five-shards bs=1M count=23
 ACTIVE_INODES_BEFORE=$(get_mount_active_size_value $V0)
 TEST rm -f $M0/one-plus-five-shards
 # Expect 5 inodes less. But one inode more than before because .remove_me would be created.
-EXPECT_WITHIN $SHARD_COUNT_TIME `expr $ACTIVE_INODES_BEFORE - 5 + 1` get_mount_active_size_value $V0
+EXPECT_WITHIN $SHARD_COUNT_TIME `expr $ACTIVE_INODES_BEFORE - 5 + 1` get_mount_active_size_value $V0 $M0
 
 EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
 TEST $CLI volume stop $V0
diff --git a/tests/volume.rc b/tests/volume.rc
index bba7e4e..6a983fd 100644
--- a/tests/volume.rc
+++ b/tests/volume.rc
@@ -93,7 +93,8 @@ function remove_brick_status_completed_field {
 
 function get_mount_process_pid {
         local vol=$1
-        ps auxww | grep glusterfs | grep -E "volfile-id[ =]/?$vol " | awk '{print $2}' | head -1
+        local mnt=$2
+        ps auxww | grep glusterfs | grep -E "volfile-id[ =]/?$vol .*$mnt" | awk '{print $2}' | head -1
 }
 
 function get_nfs_pid ()
@@ -126,7 +127,8 @@ function generate_statedump {
 
 function generate_mount_statedump {
         local vol=$1
-        generate_statedump $(get_mount_process_pid $vol)
+        local mnt=$2
+        generate_statedump $(get_mount_process_pid $vol $mnt)
 }
 
 function cleanup_mount_statedump {
@@ -850,7 +852,8 @@ function get_active_fd_count {
 
 function get_mount_active_size_value {
         local vol=$1
-        local statedump=$(generate_mount_statedump $vol)
+        local mount=$2
+        local statedump=$(generate_mount_statedump $vol $mount)
         sleep 1
         local val=$(grep "active_size" $statedump | cut -f2 -d'=' | tail -1)
         rm -f $statedump
@@ -859,7 +862,8 @@ function get_mount_active_size_value {
 
 function get_mount_lru_size_value {
         local vol=$1
-        local statedump=$(generate_mount_statedump $vol)
+        local mount=$2
+        local statedump=$(generate_mount_statedump $vol $mount)
         sleep 1
         local val=$(grep "lru_size" $statedump | cut -f2 -d'=' | tail -1)
         rm -f $statedump
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
index eb32168..fb88315 100644
--- a/xlators/features/shard/src/shard.c
+++ b/xlators/features/shard/src/shard.c
@@ -651,7 +651,8 @@ out:
 
 inode_t *
 __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this,
-                                  inode_t *base_inode, int block_num)
+                                  inode_t *base_inode, int block_num,
+                                  uuid_t gfid)
 {
         char                block_bname[256]     = {0,};
         inode_t            *lru_inode            = NULL;
@@ -679,10 +680,13 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this,
                         inode_ref (linked_inode);
                         if (base_inode)
                                 gf_uuid_copy (ctx->base_gfid, base_inode->gfid);
+                        else
+                                gf_uuid_copy(ctx->base_gfid, gfid);
                         ctx->block_num = block_num;
                         list_add_tail (&ctx->ilist, &priv->ilist_head);
                         priv->inode_count++;
-                        ctx->base_inode = base_inode;
+                        if (base_inode)
+                                ctx->base_inode = inode_ref (base_inode);
                 } else {
                 /*If on the other hand there is no available slot for this inode
                  * in the list, delete the lru inode from the head of the list,
@@ -701,6 +705,8 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this,
                          * deleted from fsync list and fsync'd in a new frame,
                          * and then unlinked in memory and forgotten.
                          */
+                        if (!lru_base_inode)
+                                goto after_fsync_check;
                         LOCK (&lru_base_inode->lock);
                         LOCK (&lru_inode->lock);
                         {
@@ -715,6 +721,7 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this,
                         UNLOCK (&lru_inode->lock);
                         UNLOCK (&lru_base_inode->lock);
 
+after_fsync_check:
                         if (!do_fsync) {
                                 shard_make_block_bname (lru_inode_ctx->block_num,
                                                         lru_inode_ctx->base_gfid,
@@ -729,20 +736,31 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this,
                                 inode_forget (lru_inode, 0);
                         } else {
                                 fsync_inode = lru_inode;
-                                inode_unref (lru_base_inode);
+                                if (lru_base_inode)
+                                        inode_unref (lru_base_inode);
                         }
                         /* The following unref corresponds to the ref
                          * held by inode_find() above.
                          */
                         inode_unref (lru_inode);
+
+                        /* The following unref corresponds to the ref held on
+                         * the base shard at the time of adding shard inode to
+                         * lru list
+                        */
+                        if (lru_base_inode)
+                                inode_unref (lru_base_inode);
                         /* For as long as an inode is in lru list, we try to
                          * keep it alive by holding a ref on it.
                          */
                         inode_ref (linked_inode);
                         if (base_inode)
                                 gf_uuid_copy (ctx->base_gfid, base_inode->gfid);
+                        else
+                                gf_uuid_copy (ctx->base_gfid, gfid);
                         ctx->block_num = block_num;
-                        ctx->base_inode = base_inode;
+                        if (base_inode)
+                                ctx->base_inode = inode_ref (base_inode);
                         list_add_tail (&ctx->ilist, &priv->ilist_head);
                 }
         } else {
@@ -1027,7 +1045,7 @@ shard_common_resolve_shards (call_frame_t *frame, xlator_t *this,
                                 fsync_inode = __shard_update_shards_inode_list (inode,
                                                                   this,
                                                                   res_inode,
-                                                                shard_idx_iter);
+                                                                shard_idx_iter, gfid);
                         }
                         UNLOCK(&priv->lock);
                         shard_idx_iter++;
@@ -2173,7 +2191,8 @@ shard_link_block_inode (shard_local_t *local, int block_num, inode_t *inode,
                 fsync_inode = __shard_update_shards_inode_list (linked_inode,
                                                                 this,
                                                                 local->loc.inode,
-                                                                block_num);
+                                                                block_num,
+                                                                gfid);
         }
         UNLOCK(&priv->lock);
         if (fsync_inode)
@@ -2881,6 +2900,7 @@ void
 shard_unlink_block_inode (shard_local_t *local, int shard_block_num)
 {
         char                  block_bname[256]  = {0,};
+        int                   unref_base_inode  = 0;
         uuid_t                gfid              = {0,};
         inode_t              *inode             = NULL;
         inode_t              *base_inode        = NULL;
@@ -2894,11 +2914,12 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num)
         priv = this->private;
 
         inode = local->inode_list[shard_block_num - local->first_block];
-        base_inode = local->resolver_base_inode;
+        shard_inode_ctx_get (inode, this, &ctx);
+        base_inode = ctx->base_inode;
         if (base_inode)
                 gf_uuid_copy (gfid, base_inode->gfid);
         else
-                gf_uuid_copy (gfid, local->base_gfid);
+                gf_uuid_copy (gfid, ctx->base_gfid);
 
         shard_make_block_bname (shard_block_num, gfid,
                                 block_bname, sizeof (block_bname));
@@ -2912,17 +2933,16 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num)
                 if (!list_empty (&ctx->ilist)) {
                         list_del_init (&ctx->ilist);
                         priv->inode_count--;
+                        unref_base_inode++;
                         GF_ASSERT (priv->inode_count >= 0);
                         unlink_unref_forget = _gf_true;
                 }
                 if (ctx->fsync_needed) {
-                        if (base_inode)
-                                inode_unref (base_inode);
+                        unref_base_inode++;
                         list_del_init (&ctx->to_fsync_list);
-                        if (base_inode) {
+                        if (base_inode)
                                 __shard_inode_ctx_get (base_inode, this, &base_ictx);
-                                base_ictx->fsync_count--;
-                        }
+                        base_ictx->fsync_count--;
                 }
         }
         UNLOCK(&inode->lock);
@@ -2933,6 +2953,8 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num)
                 inode_unref (inode);
                 inode_forget (inode, 0);
         }
+        if (base_inode && unref_base_inode)
+                inode_ref_reduce_by_n (base_inode, unref_base_inode);
         UNLOCK(&priv->lock);
 }
 
-- 
1.8.3.1