From f4d1a1683882a4da81240413dae1f6a390ee2442 Mon Sep 17 00:00:00 2001 From: Krutika Dhananjay Date: Thu, 24 Jan 2019 14:14:39 +0530 Subject: [PATCH 510/510] features/shard: Ref shard inode while adding to fsync list > Upstream: https://review.gluster.org/22091 > BUG: 1669077 > Change-Id: Iab460667d091b8388322f59b6cb27ce69299b1b2 PROBLEM: Lot of the earlier changes in the management of shards in lru, fsync lists assumed that if a given shard exists in fsync list, it must be part of lru list as well. This was found to be not true. Consider this - a file is FALLOCATE'd to a size which would make the number of participant shards to be greater than the lru list size. In this case, some of the resolved shards that are to participate in this fop will be evicted from lru list to give way to the rest of the shards. And once FALLOCATE completes, these shards are added to fsync list but without a ref. After the fop completes, these shard inodes are unref'd and destroyed while their inode ctxs are still part of fsync list. Now when an FSYNC is called on the base file and the fsync-list traversed, the client crashes due to illegal memory access. FIX: Hold a ref on the shard inode when adding to fsync list as well. And unref under following conditions: 1. when the shard is evicted from lru list 2. when the base file is fsync'd 3. when the shards are deleted. Change-Id: Iab460667d091b8388322f59b6cb27ce69299b1b2 BUG: 1668304 Signed-off-by: Krutika Dhananjay Reviewed-on: https://code.engineering.redhat.com/gerrit/161397 Tested-by: RHGS Build Bot Reviewed-by: Xavi Hernandez --- tests/bugs/shard/bug-1669077.t | 29 +++++++++++++++++++++++++++++ xlators/features/shard/src/shard.c | 29 +++++++++++++++++++++-------- 2 files changed, 50 insertions(+), 8 deletions(-) create mode 100644 tests/bugs/shard/bug-1669077.t diff --git a/tests/bugs/shard/bug-1669077.t b/tests/bugs/shard/bug-1669077.t new file mode 100644 index 000000000..8d3a67a36 --- /dev/null +++ b/tests/bugs/shard/bug-1669077.t @@ -0,0 +1,29 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +SHARD_COUNT_TIME=5 + +cleanup + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} +TEST $CLI volume set $V0 features.shard on +TEST $CLI volume set $V0 features.shard-block-size 4MB +TEST $CLI volume set $V0 features.shard-lru-limit 25 + +TEST $CLI volume start $V0 + +TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 + +# If the bug still exists, client should crash during fallocate below +TEST fallocate -l 200M $M0/foo + +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 + +TEST $CLI volume stop $V0 +TEST $CLI volume delete $V0 + +cleanup diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c index 19dd3e4ba..cd388500d 100644 --- a/xlators/features/shard/src/shard.c +++ b/xlators/features/shard/src/shard.c @@ -272,6 +272,7 @@ shard_inode_ctx_add_to_fsync_list (inode_t *base_inode, xlator_t *this, * of the to_fsync_list. */ inode_ref (base_inode); + inode_ref(shard_inode); LOCK (&base_inode->lock); LOCK (&shard_inode->lock); @@ -285,8 +286,10 @@ shard_inode_ctx_add_to_fsync_list (inode_t *base_inode, xlator_t *this, /* Unref the base inode corresponding to the ref above, if the shard is * found to be already part of the fsync list. */ - if (ret != 0) + if (ret != 0) { inode_unref (base_inode); + inode_unref(shard_inode); + } return ret; } @@ -735,6 +738,10 @@ after_fsync_check: block_bname); inode_forget (lru_inode, 0); } else { + /* The following unref corresponds to the ref + * held when the shard was added to fsync list. + */ + inode_unref(lru_inode); fsync_inode = lru_inode; if (lru_base_inode) inode_unref (lru_base_inode); @@ -2947,7 +2954,7 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num) shard_priv_t *priv = NULL; shard_inode_ctx_t *ctx = NULL; shard_inode_ctx_t *base_ictx = NULL; - gf_boolean_t unlink_unref_forget = _gf_false; + int unref_shard_inode = 0; this = THIS; priv = this->private; @@ -2973,11 +2980,12 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num) list_del_init (&ctx->ilist); priv->inode_count--; unref_base_inode++; + unref_shard_inode++; GF_ASSERT (priv->inode_count >= 0); - unlink_unref_forget = _gf_true; } if (ctx->fsync_needed) { unref_base_inode++; + unref_shard_inode++; list_del_init (&ctx->to_fsync_list); if (base_inode) { __shard_inode_ctx_get (base_inode, this, &base_ictx); @@ -2988,11 +2996,11 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num) UNLOCK(&inode->lock); if (base_inode) UNLOCK(&base_inode->lock); - if (unlink_unref_forget) { - inode_unlink (inode, priv->dot_shard_inode, block_bname); - inode_unref (inode); - inode_forget (inode, 0); - } + + inode_unlink(inode, priv->dot_shard_inode, block_bname); + inode_ref_reduce_by_n(inode, unref_shard_inode); + inode_forget(inode, 0); + if (base_inode && unref_base_inode) inode_ref_reduce_by_n (base_inode, unref_base_inode); UNLOCK(&priv->lock); @@ -5824,6 +5832,7 @@ shard_fsync_shards_cbk (call_frame_t *frame, void *cookie, xlator_t *this, shard_inode_ctx_t *ctx = NULL; shard_inode_ctx_t *base_ictx = NULL; inode_t *base_inode = NULL; + gf_boolean_t unref_shard_inode = _gf_false; local = frame->local; base_inode = local->fd->inode; @@ -5858,11 +5867,15 @@ out: list_add_tail (&ctx->to_fsync_list, &base_ictx->to_fsync_list); base_ictx->fsync_count++; + } else { + unref_shard_inode = _gf_true; } } UNLOCK (&anon_fd->inode->lock); UNLOCK (&base_inode->lock); } + if (unref_shard_inode) + inode_unref(anon_fd->inode); if (anon_fd) fd_unref (anon_fd); -- 2.20.1