887953
From 72ce80749fca03ab97a63af79d4e6bc76a49ab64 Mon Sep 17 00:00:00 2001
887953
From: Krutika Dhananjay <kdhananj@redhat.com>
887953
Date: Fri, 5 Oct 2018 11:32:21 +0530
887953
Subject: [PATCH 443/444] features/shard: Hold a ref on base inode when adding
887953
 a shard to lru list
887953
887953
    > Upstream: https://review.gluster.org/21454
887953
    > BUG: 1605056
887953
    > Change-Id: Ic15ca41444dd04684a9458bd4a526b1d3e160499
887953
887953
In __shard_update_shards_inode_list(), previously shard translator
887953
was not holding a ref on the base inode whenever a shard was added to
887953
the lru list. But if the base shard is forgotten and destroyed either
887953
by fuse due to memory pressure or due to the file being deleted at some
887953
point by a different client with this client still containing stale
887953
shards in its lru list, the client would crash at the time of locking
887953
lru_base_inode->lock owing to illegal memory access.
887953
887953
So now the base shard is ref'd into the inode ctx of every shard that
887953
is added to lru list until it gets lru'd out.
887953
887953
The patch also handles the case where none of the shards associated
887953
with a file that is about to be deleted are part of the LRU list and
887953
where an unlink at the beginning of the operation destroys the base
887953
inode (because there are no refkeepers) and hence all of the shards
887953
that are about to be deleted will be resolved without the existence
887953
of a base shard in-memory. This, if not handled properly, could lead
887953
to a crash.
887953
887953
Change-Id: Ic15ca41444dd04684a9458bd4a526b1d3e160499
887953
BUG: 1603118
887953
Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
887953
Reviewed-on: https://code.engineering.redhat.com/gerrit/155318
887953
Reviewed-by: Xavi Hernandez <xhernandez@redhat.com>
887953
Tested-by: RHGS Build Bot <nigelb@redhat.com>
887953
---
887953
 tests/bugs/shard/bug-1605056-2.t             | 34 +++++++++++++++
887953
 tests/bugs/shard/bug-1605056.t               | 63 ++++++++++++++++++++++++++++
887953
 tests/bugs/shard/shard-inode-refcount-test.t |  2 +-
887953
 tests/volume.rc                              | 12 ++++--
887953
 xlators/features/shard/src/shard.c           | 48 +++++++++++++++------
887953
 5 files changed, 141 insertions(+), 18 deletions(-)
887953
 create mode 100644 tests/bugs/shard/bug-1605056-2.t
887953
 create mode 100644 tests/bugs/shard/bug-1605056.t
887953
887953
diff --git a/tests/bugs/shard/bug-1605056-2.t b/tests/bugs/shard/bug-1605056-2.t
887953
new file mode 100644
887953
index 0000000..a9c10fe
887953
--- /dev/null
887953
+++ b/tests/bugs/shard/bug-1605056-2.t
887953
@@ -0,0 +1,34 @@
887953
+#!/bin/bash
887953
+
887953
+. $(dirname $0)/../../include.rc
887953
+. $(dirname $0)/../../volume.rc
887953
+
887953
+cleanup
887953
+
887953
+TEST glusterd
887953
+TEST pidof glusterd
887953
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
887953
+TEST $CLI volume set $V0 features.shard on
887953
+TEST $CLI volume set $V0 features.shard-block-size 4MB
887953
+TEST $CLI volume set $V0 features.shard-lru-limit 25
887953
+TEST $CLI volume set $V0 performance.write-behind off
887953
+
887953
+TEST $CLI volume start $V0
887953
+
887953
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
887953
+
887953
+# Perform a write that would cause 25 shards to be created under .shard
887953
+TEST dd if=/dev/zero of=$M0/foo bs=1M count=104
887953
+
887953
+# Write into another file bar to ensure all of foo's shards are evicted from lru list of $M0
887953
+TEST dd if=/dev/zero of=$M0/bar bs=1M count=104
887953
+
887953
+# Delete foo from $M0. If there's a bug, the mount will crash.
887953
+TEST unlink $M0/foo
887953
+
887953
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
887953
+
887953
+TEST $CLI volume stop $V0
887953
+TEST $CLI volume delete $V0
887953
+
887953
+cleanup
887953
diff --git a/tests/bugs/shard/bug-1605056.t b/tests/bugs/shard/bug-1605056.t
887953
new file mode 100644
887953
index 0000000..c2329ea
887953
--- /dev/null
887953
+++ b/tests/bugs/shard/bug-1605056.t
887953
@@ -0,0 +1,63 @@
887953
+#!/bin/bash
887953
+
887953
+. $(dirname $0)/../../include.rc
887953
+. $(dirname $0)/../../volume.rc
887953
+
887953
+SHARD_COUNT_TIME=5
887953
+
887953
+cleanup
887953
+
887953
+TEST glusterd
887953
+TEST pidof glusterd
887953
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
887953
+TEST $CLI volume set $V0 features.shard on
887953
+TEST $CLI volume set $V0 features.shard-block-size 4MB
887953
+TEST $CLI volume set $V0 features.shard-lru-limit 25
887953
+TEST $CLI volume set $V0 performance.write-behind off
887953
+
887953
+TEST $CLI volume start $V0
887953
+
887953
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
887953
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M1
887953
+
887953
+# Perform a write that would cause 25 shards to be created under .shard
887953
+TEST dd if=/dev/zero of=$M0/foo bs=1M count=104
887953
+
887953
+# Read the file from $M1, indirectly filling up the lru list.
887953
+TEST `cat $M1/foo > /dev/null`
887953
+statedump=$(generate_mount_statedump $V0 $M1)
887953
+sleep 1
887953
+EXPECT "25" echo $(grep "inode-count" $statedump | cut -f2 -d'=' | tail -1)
887953
+rm -f $statedump
887953
+
887953
+# Delete foo from $M0.
887953
+TEST unlink $M0/foo
887953
+
887953
+# Send stat on foo from $M1 to force $M1 to "forget" inode associated with foo.
887953
+# Now the ghost shards associated with "foo" are still in lru list of $M1.
887953
+TEST ! stat $M1/foo
887953
+
887953
+# Let's force the ghost shards of "foo" out of lru list by looking up more shards
887953
+# through I/O on a file named "bar" from $M1. This should crash if the base inode
887953
+# had been destroyed by now.
887953
+
887953
+TEST dd if=/dev/zero of=$M1/bar bs=1M count=104
887953
+
887953
+###############################################
887953
+#### Now for some inode ref-leak tests ... ####
887953
+###############################################
887953
+
887953
+# Expect there to be 29 active inodes - 26 belonging to "bar", 1 for .shard,
887953
+# 1 for .shard/remove_me and 1 for '/'
887953
+EXPECT_WITHIN $SHARD_COUNT_TIME `expr 26 + 3` get_mount_active_size_value $V0 $M1
887953
+
887953
+TEST rm -f $M1/bar
887953
+EXPECT_WITHIN $SHARD_COUNT_TIME 3 get_mount_active_size_value $V0 $M1
887953
+
887953
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
887953
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M1
887953
+
887953
+TEST $CLI volume stop $V0
887953
+TEST $CLI volume delete $V0
887953
+
887953
+cleanup
887953
diff --git a/tests/bugs/shard/shard-inode-refcount-test.t b/tests/bugs/shard/shard-inode-refcount-test.t
887953
index 087c8ba..3fd181b 100644
887953
--- a/tests/bugs/shard/shard-inode-refcount-test.t
887953
+++ b/tests/bugs/shard/shard-inode-refcount-test.t
887953
@@ -21,7 +21,7 @@ TEST dd if=/dev/zero conv=fsync of=$M0/one-plus-five-shards bs=1M count=23
887953
 ACTIVE_INODES_BEFORE=$(get_mount_active_size_value $V0)
887953
 TEST rm -f $M0/one-plus-five-shards
887953
 # Expect 5 inodes less. But one inode more than before because .remove_me would be created.
887953
-EXPECT_WITHIN $SHARD_COUNT_TIME `expr $ACTIVE_INODES_BEFORE - 5 + 1` get_mount_active_size_value $V0
887953
+EXPECT_WITHIN $SHARD_COUNT_TIME `expr $ACTIVE_INODES_BEFORE - 5 + 1` get_mount_active_size_value $V0 $M0
887953
 
887953
 EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
887953
 TEST $CLI volume stop $V0
887953
diff --git a/tests/volume.rc b/tests/volume.rc
887953
index bba7e4e..6a983fd 100644
887953
--- a/tests/volume.rc
887953
+++ b/tests/volume.rc
887953
@@ -93,7 +93,8 @@ function remove_brick_status_completed_field {
887953
 
887953
 function get_mount_process_pid {
887953
         local vol=$1
887953
-        ps auxww | grep glusterfs | grep -E "volfile-id[ =]/?$vol " | awk '{print $2}' | head -1
887953
+        local mnt=$2
887953
+        ps auxww | grep glusterfs | grep -E "volfile-id[ =]/?$vol .*$mnt" | awk '{print $2}' | head -1
887953
 }
887953
 
887953
 function get_nfs_pid ()
887953
@@ -126,7 +127,8 @@ function generate_statedump {
887953
 
887953
 function generate_mount_statedump {
887953
         local vol=$1
887953
-        generate_statedump $(get_mount_process_pid $vol)
887953
+        local mnt=$2
887953
+        generate_statedump $(get_mount_process_pid $vol $mnt)
887953
 }
887953
 
887953
 function cleanup_mount_statedump {
887953
@@ -850,7 +852,8 @@ function get_active_fd_count {
887953
 
887953
 function get_mount_active_size_value {
887953
         local vol=$1
887953
-        local statedump=$(generate_mount_statedump $vol)
887953
+        local mount=$2
887953
+        local statedump=$(generate_mount_statedump $vol $mount)
887953
         sleep 1
887953
         local val=$(grep "active_size" $statedump | cut -f2 -d'=' | tail -1)
887953
         rm -f $statedump
887953
@@ -859,7 +862,8 @@ function get_mount_active_size_value {
887953
 
887953
 function get_mount_lru_size_value {
887953
         local vol=$1
887953
-        local statedump=$(generate_mount_statedump $vol)
887953
+        local mount=$2
887953
+        local statedump=$(generate_mount_statedump $vol $mount)
887953
         sleep 1
887953
         local val=$(grep "lru_size" $statedump | cut -f2 -d'=' | tail -1)
887953
         rm -f $statedump
887953
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
887953
index eb32168..fb88315 100644
887953
--- a/xlators/features/shard/src/shard.c
887953
+++ b/xlators/features/shard/src/shard.c
887953
@@ -651,7 +651,8 @@ out:
887953
 
887953
 inode_t *
887953
 __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this,
887953
-                                  inode_t *base_inode, int block_num)
887953
+                                  inode_t *base_inode, int block_num,
887953
+                                  uuid_t gfid)
887953
 {
887953
         char                block_bname[256]     = {0,};
887953
         inode_t            *lru_inode            = NULL;
887953
@@ -679,10 +680,13 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this,
887953
                         inode_ref (linked_inode);
887953
                         if (base_inode)
887953
                                 gf_uuid_copy (ctx->base_gfid, base_inode->gfid);
887953
+                        else
887953
+                                gf_uuid_copy(ctx->base_gfid, gfid);
887953
                         ctx->block_num = block_num;
887953
                         list_add_tail (&ctx->ilist, &priv->ilist_head);
887953
                         priv->inode_count++;
887953
-                        ctx->base_inode = base_inode;
887953
+                        if (base_inode)
887953
+                                ctx->base_inode = inode_ref (base_inode);
887953
                 } else {
887953
                 /*If on the other hand there is no available slot for this inode
887953
                  * in the list, delete the lru inode from the head of the list,
887953
@@ -701,6 +705,8 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this,
887953
                          * deleted from fsync list and fsync'd in a new frame,
887953
                          * and then unlinked in memory and forgotten.
887953
                          */
887953
+                        if (!lru_base_inode)
887953
+                                goto after_fsync_check;
887953
                         LOCK (&lru_base_inode->lock);
887953
                         LOCK (&lru_inode->lock);
887953
                         {
887953
@@ -715,6 +721,7 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this,
887953
                         UNLOCK (&lru_inode->lock);
887953
                         UNLOCK (&lru_base_inode->lock);
887953
 
887953
+after_fsync_check:
887953
                         if (!do_fsync) {
887953
                                 shard_make_block_bname (lru_inode_ctx->block_num,
887953
                                                         lru_inode_ctx->base_gfid,
887953
@@ -729,20 +736,31 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this,
887953
                                 inode_forget (lru_inode, 0);
887953
                         } else {
887953
                                 fsync_inode = lru_inode;
887953
-                                inode_unref (lru_base_inode);
887953
+                                if (lru_base_inode)
887953
+                                        inode_unref (lru_base_inode);
887953
                         }
887953
                         /* The following unref corresponds to the ref
887953
                          * held by inode_find() above.
887953
                          */
887953
                         inode_unref (lru_inode);
887953
+
887953
+                        /* The following unref corresponds to the ref held on
887953
+                         * the base shard at the time of adding shard inode to
887953
+                         * lru list
887953
+                        */
887953
+                        if (lru_base_inode)
887953
+                                inode_unref (lru_base_inode);
887953
                         /* For as long as an inode is in lru list, we try to
887953
                          * keep it alive by holding a ref on it.
887953
                          */
887953
                         inode_ref (linked_inode);
887953
                         if (base_inode)
887953
                                 gf_uuid_copy (ctx->base_gfid, base_inode->gfid);
887953
+                        else
887953
+                                gf_uuid_copy (ctx->base_gfid, gfid);
887953
                         ctx->block_num = block_num;
887953
-                        ctx->base_inode = base_inode;
887953
+                        if (base_inode)
887953
+                                ctx->base_inode = inode_ref (base_inode);
887953
                         list_add_tail (&ctx->ilist, &priv->ilist_head);
887953
                 }
887953
         } else {
887953
@@ -1027,7 +1045,7 @@ shard_common_resolve_shards (call_frame_t *frame, xlator_t *this,
887953
                                 fsync_inode = __shard_update_shards_inode_list (inode,
887953
                                                                   this,
887953
                                                                   res_inode,
887953
-                                                                shard_idx_iter);
887953
+                                                                shard_idx_iter, gfid);
887953
                         }
887953
                         UNLOCK(&priv->lock);
887953
                         shard_idx_iter++;
887953
@@ -2173,7 +2191,8 @@ shard_link_block_inode (shard_local_t *local, int block_num, inode_t *inode,
887953
                 fsync_inode = __shard_update_shards_inode_list (linked_inode,
887953
                                                                 this,
887953
                                                                 local->loc.inode,
887953
-                                                                block_num);
887953
+                                                                block_num,
887953
+                                                                gfid);
887953
         }
887953
         UNLOCK(&priv->lock);
887953
         if (fsync_inode)
887953
@@ -2881,6 +2900,7 @@ void
887953
 shard_unlink_block_inode (shard_local_t *local, int shard_block_num)
887953
 {
887953
         char                  block_bname[256]  = {0,};
887953
+        int                   unref_base_inode  = 0;
887953
         uuid_t                gfid              = {0,};
887953
         inode_t              *inode             = NULL;
887953
         inode_t              *base_inode        = NULL;
887953
@@ -2894,11 +2914,12 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num)
887953
         priv = this->private;
887953
 
887953
         inode = local->inode_list[shard_block_num - local->first_block];
887953
-        base_inode = local->resolver_base_inode;
887953
+        shard_inode_ctx_get (inode, this, &ctx;;
887953
+        base_inode = ctx->base_inode;
887953
         if (base_inode)
887953
                 gf_uuid_copy (gfid, base_inode->gfid);
887953
         else
887953
-                gf_uuid_copy (gfid, local->base_gfid);
887953
+                gf_uuid_copy (gfid, ctx->base_gfid);
887953
 
887953
         shard_make_block_bname (shard_block_num, gfid,
887953
                                 block_bname, sizeof (block_bname));
887953
@@ -2912,17 +2933,16 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num)
887953
                 if (!list_empty (&ctx->ilist)) {
887953
                         list_del_init (&ctx->ilist);
887953
                         priv->inode_count--;
887953
+                        unref_base_inode++;
887953
                         GF_ASSERT (priv->inode_count >= 0);
887953
                         unlink_unref_forget = _gf_true;
887953
                 }
887953
                 if (ctx->fsync_needed) {
887953
-                        if (base_inode)
887953
-                                inode_unref (base_inode);
887953
+                        unref_base_inode++;
887953
                         list_del_init (&ctx->to_fsync_list);
887953
-                        if (base_inode) {
887953
+                        if (base_inode)
887953
                                 __shard_inode_ctx_get (base_inode, this, &base_ictx);
887953
-                                base_ictx->fsync_count--;
887953
-                        }
887953
+                        base_ictx->fsync_count--;
887953
                 }
887953
         }
887953
         UNLOCK(&inode->lock);
887953
@@ -2933,6 +2953,8 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num)
887953
                 inode_unref (inode);
887953
                 inode_forget (inode, 0);
887953
         }
887953
+        if (base_inode && unref_base_inode)
887953
+                inode_ref_reduce_by_n (base_inode, unref_base_inode);
887953
         UNLOCK(&priv->lock);
887953
 }
887953
 
887953
-- 
887953
1.8.3.1
887953