From 8a3c0fb64c8798ecf5a3635fe0922e3cfd476817 Mon Sep 17 00:00:00 2001
From: Pranith Kumar K <pkarampu@redhat.com>
Date: Mon, 27 Aug 2018 12:40:16 +0530
Subject: [PATCH 425/444] cluster/afr: Delegate name-heal when possible
Problem:
When name-self-heal is triggered on the mount, it blocks
lookup until name-self-heal completes. But that can lead
to hangs when lot of clients are accessing a directory which
needs name heal and all of them trigger heals waiting
for other clients to complete heal.
Fix:
When a name-heal is needed but quorum number of names have the
file and pending xattrs exist on the parent, then better to
delegate the heal to SHD which will be completed as part of
entry-heal of the parent directory. We could also do the same
for quorum-number of names not present but we don't have
any known use-case where this is a frequent occurrence so
not changing that part at the moment. When there is a gfid
mismatch or missing gfid it is important to complete the heal
so that next rename doesn't assume everything is fine and
perform a rename etc
BUG: 1619357
Upstream Patch: https://review.gluster.org/c/glusterfs/+/21087
Change-Id: I8b002c85dffc6eb6f2833e742684a233daefeb2c
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/155029
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
tests/afr.rc | 8 ++
tests/basic/afr/name-self-heal.t | 112 +++++++++++++++++++++++++++
xlators/cluster/afr/src/afr-common.c | 100 ++++++++++++++++++------
xlators/cluster/afr/src/afr-self-heal-name.c | 12 ++-
4 files changed, 205 insertions(+), 27 deletions(-)
create mode 100644 tests/basic/afr/name-self-heal.t
diff --git a/tests/afr.rc b/tests/afr.rc
index 1fd0310..a1e8a44 100644
--- a/tests/afr.rc
+++ b/tests/afr.rc
@@ -89,3 +89,11 @@ function count_index_entries()
{
ls $1/.glusterfs/indices/xattrop | wc -l
}
+
+function get_quorum_type()
+{
+ local m="$1"
+ local v="$2"
+ local repl_id="$3"
+ cat $m/.meta/graphs/active/$v-replicate-$repl_id/private|grep quorum-type|awk '{print $3}'
+}
diff --git a/tests/basic/afr/name-self-heal.t b/tests/basic/afr/name-self-heal.t
new file mode 100644
index 0000000..50fc2ec
--- /dev/null
+++ b/tests/basic/afr/name-self-heal.t
@@ -0,0 +1,112 @@
+#!/bin/bash
+#Self-heal tests
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../afr.rc
+cleanup;
+
+#Check that when quorum is not enabled name-heal happens correctly
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 2 $H0:$B0/brick{0,1}
+TEST $CLI volume set $V0 performance.stat-prefetch off
+TEST $CLI volume start $V0
+TEST $CLI volume heal $V0 disable
+TEST $CLI volume set $V0 cluster.entry-self-heal off
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
+
+TEST touch $M0/a
+TEST touch $M0/c
+TEST kill_brick $V0 $H0 $B0/brick0
+TEST touch $M0/b
+TEST rm -f $M0/a
+TEST rm -f $M0/c
+TEST touch $M0/c #gfid mismatch case
+c_gfid=$(gf_get_gfid_xattr $B0/brick1/c)
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
+TEST ! stat $M0/a
+TEST ! stat $B0/brick0/a
+TEST ! stat $B0/brick1/a
+
+TEST stat $M0/b
+TEST stat $B0/brick0/b
+TEST stat $B0/brick1/b
+TEST [[ "$(gf_get_gfid_xattr $B0/brick0/b)" == "$(gf_get_gfid_xattr $B0/brick1/b)" ]]
+
+TEST stat $M0/c
+TEST stat $B0/brick0/c
+TEST stat $B0/brick1/c
+TEST [[ "$(gf_get_gfid_xattr $B0/brick0/c)" == "$c_gfid" ]]
+
+cleanup;
+
+#Check that when quorum is enabled name-heal happens as expected
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/brick{0,1,2}
+TEST $CLI volume set $V0 performance.stat-prefetch off
+TEST $CLI volume start $V0
+TEST $CLI volume heal $V0 disable
+TEST $CLI volume set $V0 cluster.entry-self-heal off
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
+
+TEST touch $M0/a
+TEST touch $M0/c
+TEST kill_brick $V0 $H0 $B0/brick0
+TEST touch $M0/b
+TEST rm -f $M0/a
+TEST rm -f $M0/c
+TEST touch $M0/c #gfid mismatch case
+c_gfid=$(gf_get_gfid_xattr $B0/brick1/c)
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
+TEST ! stat $M0/a
+TEST ! stat $B0/brick0/a
+TEST ! stat $B0/brick1/a
+TEST ! stat $B0/brick2/a
+
+TEST stat $M0/b
+TEST ! stat $B0/brick0/b #Name heal shouldn't be triggered
+TEST stat $B0/brick1/b
+TEST stat $B0/brick2/b
+
+TEST stat $M0/c
+TEST stat $B0/brick0/c
+TEST stat $B0/brick1/c
+TEST stat $B0/brick2/c
+TEST [[ "$(gf_get_gfid_xattr $B0/brick0/c)" == "$c_gfid" ]]
+
+TEST $CLI volume set $V0 cluster.quorum-type none
+EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "none" get_quorum_type $M0 $V0 0
+TEST stat $M0/b
+TEST stat $B0/brick0/b #Name heal should be triggered
+TEST stat $B0/brick1/b
+TEST stat $B0/brick2/b
+TEST [[ "$(gf_get_gfid_xattr $B0/brick0/b)" == "$(gf_get_gfid_xattr $B0/brick1/b)" ]]
+TEST $CLI volume set $V0 cluster.quorum-type auto
+EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "auto" get_quorum_type $M0 $V0 0
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+
+#Missing parent xattrs cases
+TEST $CLI volume heal $V0 enable
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "0" get_pending_heal_count $V0
+
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
+TEST $CLI volume heal $V0 disable
+#In cases where a good parent doesn't have pending xattrs and a file,
+#name-heal will be triggered
+TEST gf_rm_file_and_gfid_link $B0/brick1 c
+TEST stat $M0/c
+TEST stat $B0/brick0/c
+TEST stat $B0/brick1/c
+TEST stat $B0/brick2/c
+TEST [[ "$(gf_get_gfid_xattr $B0/brick0/c)" == "$c_gfid" ]]
+cleanup
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index e74fdec..ce2b17a 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -2302,8 +2302,6 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
*/
for (i = 0; i < priv->child_count; i++) {
if (!replies[i].valid || replies[i].op_ret == -1) {
- if (priv->child_up[i])
- can_interpret = _gf_false;
continue;
}
@@ -2742,21 +2740,52 @@ afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this)
afr_private_t *priv = NULL;
call_frame_t *heal = NULL;
int i = 0, first = -1;
- gf_boolean_t need_heal = _gf_false;
+ gf_boolean_t name_state_mismatch = _gf_false;
struct afr_reply *replies = NULL;
int ret = 0;
+ unsigned char *par_readables = NULL;
+ unsigned char *success = NULL;
+ int32_t op_errno = 0;
+ uuid_t gfid = {0};
local = frame->local;
replies = local->replies;
priv = this->private;
+ par_readables = alloca0(priv->child_count);
+ success = alloca0(priv->child_count);
+
+ ret = afr_inode_read_subvol_get (local->loc.parent, this, par_readables,
+ NULL, NULL);
+ if (ret < 0 || AFR_COUNT (par_readables, priv->child_count) == 0) {
+ /* In this case set par_readables to all 1 so that name_heal
+ * need checks at the end of this function will flag missing
+ * entry when name state mismatches*/
+ memset (par_readables, 1, priv->child_count);
+ }
for (i = 0; i < priv->child_count; i++) {
if (!replies[i].valid)
continue;
+ if (replies[i].op_ret == 0) {
+ if (uuid_is_null (gfid)) {
+ gf_uuid_copy (gfid,
+ replies[i].poststat.ia_gfid);
+ }
+ success[i] = 1;
+ } else {
+ if ((replies[i].op_errno != ENOTCONN) &&
+ (replies[i].op_errno != ENOENT) &&
+ (replies[i].op_errno != ESTALE)) {
+ op_errno = replies[i].op_errno;
+ }
+ }
+
+ /*gfid is missing, needs heal*/
if ((replies[i].op_ret == -1) &&
- (replies[i].op_errno == ENODATA))
- need_heal = _gf_true;
+ (replies[i].op_errno == ENODATA)) {
+ goto name_heal;
+ }
if (first == -1) {
first = i;
@@ -2764,30 +2793,53 @@ afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this)
}
if (replies[i].op_ret != replies[first].op_ret) {
- need_heal = _gf_true;
- break;
+ name_state_mismatch = _gf_true;
}
- if (gf_uuid_compare (replies[i].poststat.ia_gfid,
- replies[first].poststat.ia_gfid)) {
- need_heal = _gf_true;
- break;
- }
+ if (replies[i].op_ret == 0) {
+ /* Rename after this lookup may succeed if we don't do
+ * a name-heal and the destination may not have pending xattrs
+ * to indicate which name is good and which is bad so always do
+ * this heal*/
+ if (gf_uuid_compare (replies[i].poststat.ia_gfid,
+ gfid)) {
+ goto name_heal;
+ }
+ }
}
- if (need_heal) {
- heal = afr_frame_create (this, NULL);
- if (!heal)
- goto metadata_heal;
-
- ret = synctask_new (this->ctx->env, afr_lookup_selfheal_wrap,
- afr_refresh_selfheal_done, heal, frame);
- if (ret) {
- AFR_STACK_DESTROY (heal);
- goto metadata_heal;
+ if (name_state_mismatch) {
+ if (!priv->quorum_count)
+ goto name_heal;
+ if (!afr_has_quorum (success, this))
+ goto name_heal;
+ if (op_errno)
+ goto name_heal;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+ if (par_readables[i] && replies[i].op_ret < 0 &&
+ replies[i].op_errno != ENOTCONN) {
+ goto name_heal;
+ }
}
- return ret;
- }
+ }
+
+ goto metadata_heal;
+
+name_heal:
+ heal = afr_frame_create (this, NULL);
+ if (!heal)
+ goto metadata_heal;
+
+ ret = synctask_new (this->ctx->env, afr_lookup_selfheal_wrap,
+ afr_refresh_selfheal_done, heal, frame);
+ if (ret) {
+ AFR_STACK_DESTROY (heal);
+ goto metadata_heal;
+ }
+ return ret;
+
metadata_heal:
ret = afr_lookup_metadata_heal_check (frame, this);
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
index bcd0e60..0a5be29 100644
--- a/xlators/cluster/afr/src/afr-self-heal-name.c
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
@@ -634,20 +634,26 @@ afr_selfheal_name_unlocked_inspect (call_frame_t *frame, xlator_t *this,
continue;
if ((replies[i].op_ret == -1) &&
- (replies[i].op_errno == ENODATA))
+ (replies[i].op_errno == ENODATA)) {
*need_heal = _gf_true;
+ break;
+ }
if (first_idx == -1) {
first_idx = i;
continue;
}
- if (replies[i].op_ret != replies[first_idx].op_ret)
+ if (replies[i].op_ret != replies[first_idx].op_ret) {
*need_heal = _gf_true;
+ break;
+ }
if (gf_uuid_compare (replies[i].poststat.ia_gfid,
- replies[first_idx].poststat.ia_gfid))
+ replies[first_idx].poststat.ia_gfid)) {
*need_heal = _gf_true;
+ break;
+ }
}
if (inode)
--
1.8.3.1