From 8a3c0fb64c8798ecf5a3635fe0922e3cfd476817 Mon Sep 17 00:00:00 2001
From: Pranith Kumar K <pkarampu@redhat.com>
Date: Mon, 27 Aug 2018 12:40:16 +0530
Subject: [PATCH 425/444] cluster/afr: Delegate name-heal when possible

Problem:
When name-self-heal is triggered on the mount, it blocks the
lookup until name-self-heal completes. This can lead to hangs
when a lot of clients access a directory that needs name heal
and all of them trigger heals while waiting for the other
clients to complete the heal.

Fix:
When a name-heal is needed but a quorum number of names have the
file and pending xattrs exist on the parent, it is better to
delegate the heal to SHD, which will complete it as part of the
entry-heal of the parent directory. The same could be done when
a quorum number of names is not present, but there is no known
use-case where that is a frequent occurrence, so that part is
left unchanged for now. When there is a gfid mismatch or a
missing gfid, it is important to complete the heal right away,
so that a subsequent rename does not assume everything is fine
and simply go ahead with the rename.
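
For readers skimming the diff, a rough, self-contained sketch of the
decision afr_lookup_entry_heal() makes after this change is given
below. The helper name and the boolean parameters are illustrative
only (they reduce AFR internals such as the lookup replies, parent
readables and quorum state to plain flags) and do not exist in the
codebase:

    #include <stdbool.h>

    /* Illustrative sketch: true means "heal the name as part of this
     * lookup", false means "leave it to SHD via entry-heal of the
     * parent directory". */
    static bool
    heal_name_from_lookup (bool gfid_missing, bool gfid_mismatch,
                           bool name_state_mismatch, bool quorum_enabled,
                           bool successes_have_quorum, bool unexpected_errno,
                           bool readable_parent_missing_entry)
    {
            if (gfid_missing || gfid_mismatch)
                    return true;  /* a later rename must not trust this name */
            if (!name_state_mismatch)
                    return false; /* nothing to heal */
            if (!quorum_enabled || !successes_have_quorum ||
                unexpected_errno || readable_parent_missing_entry)
                    return true;  /* not safe to delegate */
            return false;         /* delegate to SHD */
    }
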
BUG: 1619357
Upstream Patch: https://review.gluster.org/c/glusterfs/+/21087
Change-Id: I8b002c85dffc6eb6f2833e742684a233daefeb2c
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/155029
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
 tests/afr.rc                                 |   8 ++
 tests/basic/afr/name-self-heal.t             | 112 +++++++++++++++++++++++++++
 xlators/cluster/afr/src/afr-common.c         | 100 ++++++++++++++++++------
 xlators/cluster/afr/src/afr-self-heal-name.c |  12 ++-
 4 files changed, 205 insertions(+), 27 deletions(-)
 create mode 100644 tests/basic/afr/name-self-heal.t
diff --git a/tests/afr.rc b/tests/afr.rc
index 1fd0310..a1e8a44 100644
--- a/tests/afr.rc
+++ b/tests/afr.rc
@@ -89,3 +89,11 @@ function count_index_entries()
 {
     ls $1/.glusterfs/indices/xattrop | wc -l
 }
+
+function get_quorum_type()
+{
+        local m="$1"
+        local v="$2"
+        local repl_id="$3"
+        cat $m/.meta/graphs/active/$v-replicate-$repl_id/private|grep quorum-type|awk '{print $3}'
+}
diff --git a/tests/basic/afr/name-self-heal.t b/tests/basic/afr/name-self-heal.t
new file mode 100644
index 0000000..50fc2ec
--- /dev/null
+++ b/tests/basic/afr/name-self-heal.t
@@ -0,0 +1,112 @@
+#!/bin/bash
+#Self-heal tests
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../afr.rc
+cleanup;
+
+#Check that when quorum is not enabled name-heal happens correctly
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 2 $H0:$B0/brick{0,1}
+TEST $CLI volume set $V0 performance.stat-prefetch off
+TEST $CLI volume start $V0
+TEST $CLI volume heal $V0 disable
+TEST $CLI volume set $V0 cluster.entry-self-heal off
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
+
+TEST touch $M0/a
+TEST touch $M0/c
+TEST kill_brick $V0 $H0 $B0/brick0
+TEST touch $M0/b
+TEST rm -f $M0/a
+TEST rm -f $M0/c
+TEST touch $M0/c #gfid mismatch case
+c_gfid=$(gf_get_gfid_xattr $B0/brick1/c)
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
+TEST ! stat $M0/a
+TEST ! stat $B0/brick0/a
+TEST ! stat $B0/brick1/a
+
+TEST stat $M0/b
+TEST stat $B0/brick0/b
+TEST stat $B0/brick1/b
+TEST [[ "$(gf_get_gfid_xattr $B0/brick0/b)" == "$(gf_get_gfid_xattr $B0/brick1/b)" ]]
+
+TEST stat $M0/c
+TEST stat $B0/brick0/c
+TEST stat $B0/brick1/c
+TEST [[ "$(gf_get_gfid_xattr $B0/brick0/c)" == "$c_gfid" ]]
+
+cleanup;
+
+#Check that when quorum is enabled name-heal happens as expected
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/brick{0,1,2}
+TEST $CLI volume set $V0 performance.stat-prefetch off
+TEST $CLI volume start $V0
+TEST $CLI volume heal $V0 disable
+TEST $CLI volume set $V0 cluster.entry-self-heal off
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
+
+TEST touch $M0/a
+TEST touch $M0/c
+TEST kill_brick $V0 $H0 $B0/brick0
+TEST touch $M0/b
+TEST rm -f $M0/a
+TEST rm -f $M0/c
+TEST touch $M0/c #gfid mismatch case
+c_gfid=$(gf_get_gfid_xattr $B0/brick1/c)
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
+TEST ! stat $M0/a
+TEST ! stat $B0/brick0/a
+TEST ! stat $B0/brick1/a
+TEST ! stat $B0/brick2/a
+
+TEST stat $M0/b
+TEST ! stat $B0/brick0/b #Name heal shouldn't be triggered
+TEST stat $B0/brick1/b
+TEST stat $B0/brick2/b
+
+TEST stat $M0/c
+TEST stat $B0/brick0/c
+TEST stat $B0/brick1/c
+TEST stat $B0/brick2/c
+TEST [[ "$(gf_get_gfid_xattr $B0/brick0/c)" == "$c_gfid" ]]
+
+TEST $CLI volume set $V0 cluster.quorum-type none
+EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "none" get_quorum_type $M0 $V0 0
+TEST stat $M0/b
+TEST stat $B0/brick0/b #Name heal should be triggered
+TEST stat $B0/brick1/b
+TEST stat $B0/brick2/b
+TEST [[ "$(gf_get_gfid_xattr $B0/brick0/b)" == "$(gf_get_gfid_xattr $B0/brick1/b)" ]]
+TEST $CLI volume set $V0 cluster.quorum-type auto
+EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "auto" get_quorum_type $M0 $V0 0
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+
+#Missing parent xattrs cases
+TEST $CLI volume heal $V0 enable
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "0" get_pending_heal_count $V0
+
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
+TEST $CLI volume heal $V0 disable
+#In cases where a good parent doesn't have pending xattrs and a file,
+#name-heal will be triggered
+TEST gf_rm_file_and_gfid_link $B0/brick1 c
+TEST stat $M0/c
+TEST stat $B0/brick0/c
+TEST stat $B0/brick1/c
+TEST stat $B0/brick2/c
+TEST [[ "$(gf_get_gfid_xattr $B0/brick0/c)" == "$c_gfid" ]]
+cleanup
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index e74fdec..ce2b17a 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -2302,8 +2302,6 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
 	*/
 	for (i = 0; i < priv->child_count; i++) {
 		if (!replies[i].valid || replies[i].op_ret == -1) {
-			if (priv->child_up[i])
-				can_interpret = _gf_false;
 			continue;
 		}
 
@@ -2742,21 +2740,52 @@ afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this)
 	afr_private_t *priv = NULL;
 	call_frame_t *heal = NULL;
 	int i = 0, first = -1;
-	gf_boolean_t need_heal = _gf_false;
+	gf_boolean_t name_state_mismatch = _gf_false;
 	struct afr_reply *replies = NULL;
 	int ret = 0;
+        unsigned char *par_readables = NULL;
+        unsigned char *success = NULL;
+        int32_t op_errno = 0;
+        uuid_t gfid = {0};
 
 	local = frame->local;
 	replies = local->replies;
 	priv = this->private;
+        par_readables = alloca0(priv->child_count);
+        success = alloca0(priv->child_count);
+
+        ret = afr_inode_read_subvol_get (local->loc.parent, this, par_readables,
+                                         NULL, NULL);
+        if (ret < 0 || AFR_COUNT (par_readables, priv->child_count) == 0) {
+                /* In this case set par_readables to all 1 so that name_heal
+                 * need checks at the end of this function will flag missing
+                 * entry when name state mismatches*/
+                memset (par_readables, 1, priv->child_count);
+        }
 
 	for (i = 0; i < priv->child_count; i++) {
 		if (!replies[i].valid)
 			continue;
 
+                if (replies[i].op_ret == 0) {
+                        if (uuid_is_null (gfid)) {
+                                gf_uuid_copy (gfid,
+                                              replies[i].poststat.ia_gfid);
+                        }
+                        success[i] = 1;
+                } else {
+                        if ((replies[i].op_errno != ENOTCONN) &&
+                            (replies[i].op_errno != ENOENT) &&
+                            (replies[i].op_errno != ESTALE)) {
+                                op_errno = replies[i].op_errno;
+                        }
+                }
+
+                /*gfid is missing, needs heal*/
                 if ((replies[i].op_ret == -1) &&
-                    (replies[i].op_errno == ENODATA))
-                        need_heal = _gf_true;
+                    (replies[i].op_errno == ENODATA)) {
+                        goto name_heal;
+                }
 
 		if (first == -1) {
 			first = i;
@@ -2764,30 +2793,53 @@ afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this)
 		}
 
 		if (replies[i].op_ret != replies[first].op_ret) {
-			need_heal = _gf_true;
-			break;
+                        name_state_mismatch = _gf_true;
 		}
 
-		if (gf_uuid_compare (replies[i].poststat.ia_gfid,
-				  replies[first].poststat.ia_gfid)) {
-			need_heal = _gf_true;
-			break;
-		}
+                if (replies[i].op_ret == 0) {
+                        /* Rename after this lookup may succeed if we don't do
+                         * a name-heal and the destination may not have pending xattrs
+                         * to indicate which name is good and which is bad so always do
+                         * this heal*/
+                        if (gf_uuid_compare (replies[i].poststat.ia_gfid,
+                                             gfid)) {
+                                goto name_heal;
+                        }
+                }
 	}
 
-	if (need_heal) {
-		heal = afr_frame_create (this, NULL);
-		if (!heal)
-                        goto metadata_heal;
-
-		ret = synctask_new (this->ctx->env, afr_lookup_selfheal_wrap,
-				    afr_refresh_selfheal_done, heal, frame);
-		if (ret) {
-                        AFR_STACK_DESTROY (heal);
-			goto metadata_heal;
+        if (name_state_mismatch) {
+                if (!priv->quorum_count)
+                        goto name_heal;
+                if (!afr_has_quorum (success, this))
+                        goto name_heal;
+                if (op_errno)
+                        goto name_heal;
+                for (i = 0; i < priv->child_count; i++) {
+                        if (!replies[i].valid)
+                                continue;
+                        if (par_readables[i] && replies[i].op_ret < 0 &&
+                            replies[i].op_errno != ENOTCONN) {
+                                goto name_heal;
+                        }
                 }
-                return ret;
-	}
+        }
+
+        goto metadata_heal;
+
+name_heal:
+        heal = afr_frame_create (this, NULL);
+        if (!heal)
+                goto metadata_heal;
+
+        ret = synctask_new (this->ctx->env, afr_lookup_selfheal_wrap,
+                            afr_refresh_selfheal_done, heal, frame);
+        if (ret) {
+                AFR_STACK_DESTROY (heal);
+                goto metadata_heal;
+        }
+        return ret;
+
 metadata_heal:
         ret = afr_lookup_metadata_heal_check (frame, this);
 
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
index bcd0e60..0a5be29 100644
--- a/xlators/cluster/afr/src/afr-self-heal-name.c
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
@@ -634,20 +634,26 @@ afr_selfheal_name_unlocked_inspect (call_frame_t *frame, xlator_t *this,
 			continue;
 
                 if ((replies[i].op_ret == -1) &&
-                    (replies[i].op_errno == ENODATA))
+                    (replies[i].op_errno == ENODATA)) {
                         *need_heal = _gf_true;
+                        break;
+                }
 
 		if (first_idx == -1) {
 			first_idx = i;
 			continue;
 		}
 
-		if (replies[i].op_ret != replies[first_idx].op_ret)
+		if (replies[i].op_ret != replies[first_idx].op_ret) {
 			*need_heal = _gf_true;
+                        break;
+                }
 
 		if (gf_uuid_compare (replies[i].poststat.ia_gfid,
-				  replies[first_idx].poststat.ia_gfid))
+				  replies[first_idx].poststat.ia_gfid)) {
 			*need_heal = _gf_true;
+                        break;
+                }
 	}
 
 	if (inode)
-- 
1.8.3.1