From f4d1a1683882a4da81240413dae1f6a390ee2442 Mon Sep 17 00:00:00 2001
From: Krutika Dhananjay <kdhananj@redhat.com>
Date: Thu, 24 Jan 2019 14:14:39 +0530
Subject: [PATCH 510/510] features/shard: Ref shard inode while adding to fsync
list
> Upstream: https://review.gluster.org/22091
> BUG: 1669077
> Change-Id: Iab460667d091b8388322f59b6cb27ce69299b1b2
PROBLEM:
Lot of the earlier changes in the management of shards in lru, fsync
lists assumed that if a given shard exists in fsync list, it must be
part of lru list as well. This was found to be not true.
Consider this - a file is FALLOCATE'd to a size which would make the
number of participant shards to be greater than the lru list size.
In this case, some of the resolved shards that are to participate in
this fop will be evicted from lru list to give way to the rest of the
shards. And once FALLOCATE completes, these shards are added to fsync
list but without a ref. After the fop completes, these shard inodes
are unref'd and destroyed while their inode ctxs are still part of
fsync list. Now when an FSYNC is called on the base file and the
fsync-list traversed, the client crashes due to illegal memory access.
FIX:
Hold a ref on the shard inode when adding to fsync list as well.
And unref under following conditions:
1. when the shard is evicted from lru list
2. when the base file is fsync'd
3. when the shards are deleted.
Change-Id: Iab460667d091b8388322f59b6cb27ce69299b1b2
BUG: 1668304
Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/161397
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Xavi Hernandez <xhernandez@redhat.com>
---
tests/bugs/shard/bug-1669077.t | 29 +++++++++++++++++++++++++++++
xlators/features/shard/src/shard.c | 29 +++++++++++++++++++++--------
2 files changed, 50 insertions(+), 8 deletions(-)
create mode 100644 tests/bugs/shard/bug-1669077.t
diff --git a/tests/bugs/shard/bug-1669077.t b/tests/bugs/shard/bug-1669077.t
new file mode 100644
index 000000000..8d3a67a36
--- /dev/null
+++ b/tests/bugs/shard/bug-1669077.t
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+SHARD_COUNT_TIME=5
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 features.shard on
+TEST $CLI volume set $V0 features.shard-block-size 4MB
+TEST $CLI volume set $V0 features.shard-lru-limit 25
+
+TEST $CLI volume start $V0
+
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
+
+# If the bug still exists, client should crash during fallocate below
+TEST fallocate -l 200M $M0/foo
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+
+TEST $CLI volume stop $V0
+TEST $CLI volume delete $V0
+
+cleanup
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
index 19dd3e4ba..cd388500d 100644
--- a/xlators/features/shard/src/shard.c
+++ b/xlators/features/shard/src/shard.c
@@ -272,6 +272,7 @@ shard_inode_ctx_add_to_fsync_list (inode_t *base_inode, xlator_t *this,
* of the to_fsync_list.
*/
inode_ref (base_inode);
+ inode_ref(shard_inode);
LOCK (&base_inode->lock);
LOCK (&shard_inode->lock);
@@ -285,8 +286,10 @@ shard_inode_ctx_add_to_fsync_list (inode_t *base_inode, xlator_t *this,
/* Unref the base inode corresponding to the ref above, if the shard is
* found to be already part of the fsync list.
*/
- if (ret != 0)
+ if (ret != 0) {
inode_unref (base_inode);
+ inode_unref(shard_inode);
+ }
return ret;
}
@@ -735,6 +738,10 @@ after_fsync_check:
block_bname);
inode_forget (lru_inode, 0);
} else {
+ /* The following unref corresponds to the ref
+ * held when the shard was added to fsync list.
+ */
+ inode_unref(lru_inode);
fsync_inode = lru_inode;
if (lru_base_inode)
inode_unref (lru_base_inode);
@@ -2947,7 +2954,7 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num)
shard_priv_t *priv = NULL;
shard_inode_ctx_t *ctx = NULL;
shard_inode_ctx_t *base_ictx = NULL;
- gf_boolean_t unlink_unref_forget = _gf_false;
+ int unref_shard_inode = 0;
this = THIS;
priv = this->private;
@@ -2973,11 +2980,12 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num)
list_del_init (&ctx->ilist);
priv->inode_count--;
unref_base_inode++;
+ unref_shard_inode++;
GF_ASSERT (priv->inode_count >= 0);
- unlink_unref_forget = _gf_true;
}
if (ctx->fsync_needed) {
unref_base_inode++;
+ unref_shard_inode++;
list_del_init (&ctx->to_fsync_list);
if (base_inode) {
__shard_inode_ctx_get (base_inode, this, &base_ictx);
@@ -2988,11 +2996,11 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num)
UNLOCK(&inode->lock);
if (base_inode)
UNLOCK(&base_inode->lock);
- if (unlink_unref_forget) {
- inode_unlink (inode, priv->dot_shard_inode, block_bname);
- inode_unref (inode);
- inode_forget (inode, 0);
- }
+
+ inode_unlink(inode, priv->dot_shard_inode, block_bname);
+ inode_ref_reduce_by_n(inode, unref_shard_inode);
+ inode_forget(inode, 0);
+
if (base_inode && unref_base_inode)
inode_ref_reduce_by_n (base_inode, unref_base_inode);
UNLOCK(&priv->lock);
@@ -5824,6 +5832,7 @@ shard_fsync_shards_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
shard_inode_ctx_t *ctx = NULL;
shard_inode_ctx_t *base_ictx = NULL;
inode_t *base_inode = NULL;
+ gf_boolean_t unref_shard_inode = _gf_false;
local = frame->local;
base_inode = local->fd->inode;
@@ -5858,11 +5867,15 @@ out:
list_add_tail (&ctx->to_fsync_list,
&base_ictx->to_fsync_list);
base_ictx->fsync_count++;
+ } else {
+ unref_shard_inode = _gf_true;
}
}
UNLOCK (&anon_fd->inode->lock);
UNLOCK (&base_inode->lock);
}
+ if (unref_shard_inode)
+ inode_unref(anon_fd->inode);
if (anon_fd)
fd_unref (anon_fd);
--
2.20.1