e7a346
From 8a3c0fb64c8798ecf5a3635fe0922e3cfd476817 Mon Sep 17 00:00:00 2001
e7a346
From: Pranith Kumar K <pkarampu@redhat.com>
e7a346
Date: Mon, 27 Aug 2018 12:40:16 +0530
e7a346
Subject: [PATCH 425/444] cluster/afr: Delegate name-heal when possible
e7a346
e7a346
Problem:
e7a346
When name-self-heal is triggered on the mount, it blocks
e7a346
lookup until name-self-heal completes. But that can lead
e7a346
to hangs when lot of clients are accessing a directory which
e7a346
needs name heal and all of them trigger heals waiting
e7a346
for other clients to complete heal.
e7a346
e7a346
Fix:
e7a346
When a name-heal is needed but quorum number of names have the
e7a346
file and pending xattrs exist on the parent, then better to
e7a346
delegate the heal to SHD which will be completed as part of
e7a346
entry-heal of the parent directory. We could also do the same
e7a346
for quorum-number of names not present but we don't have
e7a346
any known use-case where this is a frequent occurrence so
e7a346
not changing that part at the moment. When there is a gfid
e7a346
mismatch or missing gfid it is important to complete the heal
e7a346
so that next rename doesn't assume everything is fine and
e7a346
perform a rename etc
e7a346
e7a346
BUG: 1619357
e7a346
Upstream Patch: https://review.gluster.org/c/glusterfs/+/21087
e7a346
Change-Id: I8b002c85dffc6eb6f2833e742684a233daefeb2c
e7a346
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
e7a346
Reviewed-on: https://code.engineering.redhat.com/gerrit/155029
e7a346
Tested-by: RHGS Build Bot <nigelb@redhat.com>
e7a346
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
e7a346
---
e7a346
 tests/afr.rc                                 |   8 ++
e7a346
 tests/basic/afr/name-self-heal.t             | 112 +++++++++++++++++++++++++++
e7a346
 xlators/cluster/afr/src/afr-common.c         | 100 ++++++++++++++++++------
e7a346
 xlators/cluster/afr/src/afr-self-heal-name.c |  12 ++-
e7a346
 4 files changed, 205 insertions(+), 27 deletions(-)
e7a346
 create mode 100644 tests/basic/afr/name-self-heal.t
e7a346
e7a346
diff --git a/tests/afr.rc b/tests/afr.rc
e7a346
index 1fd0310..a1e8a44 100644
e7a346
--- a/tests/afr.rc
e7a346
+++ b/tests/afr.rc
e7a346
@@ -89,3 +89,11 @@ function count_index_entries()
e7a346
 {
e7a346
     ls $1/.glusterfs/indices/xattrop | wc -l
e7a346
 }
e7a346
+
e7a346
+function get_quorum_type()
e7a346
+{
e7a346
+        local m="$1"
e7a346
+        local v="$2"
e7a346
+        local repl_id="$3"
e7a346
+        cat $m/.meta/graphs/active/$v-replicate-$repl_id/private|grep quorum-type|awk '{print $3}'
e7a346
+}
e7a346
diff --git a/tests/basic/afr/name-self-heal.t b/tests/basic/afr/name-self-heal.t
e7a346
new file mode 100644
e7a346
index 0000000..50fc2ec
e7a346
--- /dev/null
e7a346
+++ b/tests/basic/afr/name-self-heal.t
e7a346
@@ -0,0 +1,112 @@
e7a346
+#!/bin/bash
e7a346
+#Self-heal tests
e7a346
+
e7a346
+. $(dirname $0)/../../include.rc
e7a346
+. $(dirname $0)/../../volume.rc
e7a346
+. $(dirname $0)/../../afr.rc
e7a346
+cleanup;
e7a346
+
e7a346
+#Check that when quorum is not enabled name-heal happens correctly
e7a346
+TEST glusterd
e7a346
+TEST pidof glusterd
e7a346
+TEST $CLI volume create $V0 replica 2 $H0:$B0/brick{0,1}
e7a346
+TEST $CLI volume set $V0 performance.stat-prefetch off
e7a346
+TEST $CLI volume start $V0
e7a346
+TEST $CLI volume heal $V0 disable
e7a346
+TEST $CLI volume set $V0 cluster.entry-self-heal off
e7a346
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
e7a346
+
e7a346
+TEST touch $M0/a
e7a346
+TEST touch $M0/c
e7a346
+TEST kill_brick $V0 $H0 $B0/brick0
e7a346
+TEST touch $M0/b
e7a346
+TEST rm -f $M0/a
e7a346
+TEST rm -f $M0/c
e7a346
+TEST touch $M0/c #gfid mismatch case
e7a346
+c_gfid=$(gf_get_gfid_xattr $B0/brick1/c)
e7a346
+TEST $CLI volume start $V0 force
e7a346
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
e7a346
+TEST ! stat $M0/a
e7a346
+TEST ! stat $B0/brick0/a
e7a346
+TEST ! stat $B0/brick1/a
e7a346
+
e7a346
+TEST stat $M0/b
e7a346
+TEST stat $B0/brick0/b
e7a346
+TEST stat $B0/brick1/b
e7a346
+TEST [[ "$(gf_get_gfid_xattr $B0/brick0/b)" == "$(gf_get_gfid_xattr $B0/brick1/b)" ]]
e7a346
+
e7a346
+TEST stat $M0/c
e7a346
+TEST stat $B0/brick0/c
e7a346
+TEST stat $B0/brick1/c
e7a346
+TEST [[ "$(gf_get_gfid_xattr $B0/brick0/c)" == "$c_gfid" ]]
e7a346
+
e7a346
+cleanup;
e7a346
+
e7a346
+#Check that when quorum is enabled name-heal happens as expected
e7a346
+TEST glusterd
e7a346
+TEST pidof glusterd
e7a346
+TEST $CLI volume create $V0 replica 3 $H0:$B0/brick{0,1,2}
e7a346
+TEST $CLI volume set $V0 performance.stat-prefetch off
e7a346
+TEST $CLI volume start $V0
e7a346
+TEST $CLI volume heal $V0 disable
e7a346
+TEST $CLI volume set $V0 cluster.entry-self-heal off
e7a346
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
e7a346
+
e7a346
+TEST touch $M0/a
e7a346
+TEST touch $M0/c
e7a346
+TEST kill_brick $V0 $H0 $B0/brick0
e7a346
+TEST touch $M0/b
e7a346
+TEST rm -f $M0/a
e7a346
+TEST rm -f $M0/c
e7a346
+TEST touch $M0/c #gfid mismatch case
e7a346
+c_gfid=$(gf_get_gfid_xattr $B0/brick1/c)
e7a346
+TEST $CLI volume start $V0 force
e7a346
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
e7a346
+TEST ! stat $M0/a
e7a346
+TEST ! stat $B0/brick0/a
e7a346
+TEST ! stat $B0/brick1/a
e7a346
+TEST ! stat $B0/brick2/a
e7a346
+
e7a346
+TEST stat $M0/b
e7a346
+TEST ! stat $B0/brick0/b #Name heal shouldn't be triggered
e7a346
+TEST stat $B0/brick1/b
e7a346
+TEST stat $B0/brick2/b
e7a346
+
e7a346
+TEST stat $M0/c
e7a346
+TEST stat $B0/brick0/c
e7a346
+TEST stat $B0/brick1/c
e7a346
+TEST stat $B0/brick2/c
e7a346
+TEST [[ "$(gf_get_gfid_xattr $B0/brick0/c)" == "$c_gfid" ]]
e7a346
+
e7a346
+TEST $CLI volume set $V0 cluster.quorum-type none
e7a346
+EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "none" get_quorum_type $M0 $V0 0
e7a346
+TEST stat $M0/b
e7a346
+TEST stat $B0/brick0/b #Name heal should be triggered
e7a346
+TEST stat $B0/brick1/b
e7a346
+TEST stat $B0/brick2/b
e7a346
+TEST [[ "$(gf_get_gfid_xattr $B0/brick0/b)" == "$(gf_get_gfid_xattr $B0/brick1/b)" ]]
e7a346
+TEST $CLI volume set $V0 cluster.quorum-type auto
e7a346
+EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "auto" get_quorum_type $M0 $V0 0
e7a346
+
e7a346
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
e7a346
+
e7a346
+#Missing parent xattrs cases
e7a346
+TEST $CLI volume heal $V0 enable
e7a346
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
e7a346
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
e7a346
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
e7a346
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
e7a346
+TEST $CLI volume heal $V0
e7a346
+EXPECT_WITHIN $HEAL_TIMEOUT "0" get_pending_heal_count $V0
e7a346
+
e7a346
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
e7a346
+TEST $CLI volume heal $V0 disable
e7a346
+#In cases where a good parent doesn't have pending xattrs and a file,
e7a346
+#name-heal will be triggered
e7a346
+TEST gf_rm_file_and_gfid_link $B0/brick1 c
e7a346
+TEST stat $M0/c
e7a346
+TEST stat $B0/brick0/c
e7a346
+TEST stat $B0/brick1/c
e7a346
+TEST stat $B0/brick2/c
e7a346
+TEST [[ "$(gf_get_gfid_xattr $B0/brick0/c)" == "$c_gfid" ]]
e7a346
+cleanup
e7a346
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
e7a346
index e74fdec..ce2b17a 100644
e7a346
--- a/xlators/cluster/afr/src/afr-common.c
e7a346
+++ b/xlators/cluster/afr/src/afr-common.c
e7a346
@@ -2302,8 +2302,6 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
e7a346
 	*/
e7a346
 	for (i = 0; i < priv->child_count; i++) {
e7a346
 		if (!replies[i].valid || replies[i].op_ret == -1) {
e7a346
-			if (priv->child_up[i])
e7a346
-				can_interpret = _gf_false;
e7a346
 			continue;
e7a346
 		}
e7a346
 
e7a346
@@ -2742,21 +2740,52 @@ afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this)
e7a346
 	afr_private_t *priv = NULL;
e7a346
 	call_frame_t *heal = NULL;
e7a346
 	int i = 0, first = -1;
e7a346
-	gf_boolean_t need_heal = _gf_false;
e7a346
+	gf_boolean_t name_state_mismatch = _gf_false;
e7a346
 	struct afr_reply *replies = NULL;
e7a346
 	int ret = 0;
e7a346
+        unsigned char *par_readables = NULL;
e7a346
+        unsigned char *success = NULL;
e7a346
+        int32_t op_errno = 0;
e7a346
+        uuid_t gfid = {0};
e7a346
 
e7a346
 	local = frame->local;
e7a346
 	replies = local->replies;
e7a346
 	priv = this->private;
e7a346
+        par_readables = alloca0(priv->child_count);
e7a346
+        success = alloca0(priv->child_count);
e7a346
+
e7a346
+        ret = afr_inode_read_subvol_get (local->loc.parent, this, par_readables,
e7a346
+                                         NULL, NULL);
e7a346
+        if (ret < 0 || AFR_COUNT (par_readables, priv->child_count) == 0) {
e7a346
+                /* In this case set par_readables to all 1 so that name_heal
e7a346
+                 * need checks at the end of this function will flag missing
e7a346
+                 * entry when name state mismatches*/
e7a346
+                memset (par_readables, 1, priv->child_count);
e7a346
+        }
e7a346
 
e7a346
 	for (i = 0; i < priv->child_count; i++) {
e7a346
 		if (!replies[i].valid)
e7a346
 			continue;
e7a346
 
e7a346
+                if (replies[i].op_ret == 0) {
e7a346
+                        if (uuid_is_null (gfid)) {
e7a346
+                                gf_uuid_copy (gfid,
e7a346
+                                              replies[i].poststat.ia_gfid);
e7a346
+                        }
e7a346
+                        success[i] = 1;
e7a346
+                } else {
e7a346
+                        if ((replies[i].op_errno != ENOTCONN) &&
e7a346
+                            (replies[i].op_errno != ENOENT) &&
e7a346
+                            (replies[i].op_errno != ESTALE)) {
e7a346
+                                op_errno = replies[i].op_errno;
e7a346
+                        }
e7a346
+                }
e7a346
+
e7a346
+                /*gfid is missing, needs heal*/
e7a346
                 if ((replies[i].op_ret == -1) &&
e7a346
-                    (replies[i].op_errno == ENODATA))
e7a346
-                        need_heal = _gf_true;
e7a346
+                    (replies[i].op_errno == ENODATA)) {
e7a346
+                        goto name_heal;
e7a346
+                }
e7a346
 
e7a346
 		if (first == -1) {
e7a346
 			first = i;
e7a346
@@ -2764,30 +2793,53 @@ afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this)
e7a346
 		}
e7a346
 
e7a346
 		if (replies[i].op_ret != replies[first].op_ret) {
e7a346
-			need_heal = _gf_true;
e7a346
-			break;
e7a346
+                        name_state_mismatch = _gf_true;
e7a346
 		}
e7a346
 
e7a346
-		if (gf_uuid_compare (replies[i].poststat.ia_gfid,
e7a346
-				  replies[first].poststat.ia_gfid)) {
e7a346
-			need_heal = _gf_true;
e7a346
-			break;
e7a346
-		}
e7a346
+                if (replies[i].op_ret == 0) {
e7a346
+                        /* Rename after this lookup may succeed if we don't do
e7a346
+                         * a name-heal and the destination may not have pending xattrs
e7a346
+                         * to indicate which name is good and which is bad so always do
e7a346
+                         * this heal*/
e7a346
+                        if (gf_uuid_compare (replies[i].poststat.ia_gfid,
e7a346
+                                             gfid)) {
e7a346
+                                goto name_heal;
e7a346
+                        }
e7a346
+                }
e7a346
 	}
e7a346
 
e7a346
-	if (need_heal) {
e7a346
-		heal = afr_frame_create (this, NULL);
e7a346
-		if (!heal)
e7a346
-                        goto metadata_heal;
e7a346
-
e7a346
-		ret = synctask_new (this->ctx->env, afr_lookup_selfheal_wrap,
e7a346
-				    afr_refresh_selfheal_done, heal, frame);
e7a346
-		if (ret) {
e7a346
-                        AFR_STACK_DESTROY (heal);
e7a346
-			goto metadata_heal;
e7a346
+        if (name_state_mismatch) {
e7a346
+                if (!priv->quorum_count)
e7a346
+                        goto name_heal;
e7a346
+                if (!afr_has_quorum (success, this))
e7a346
+                        goto name_heal;
e7a346
+                if (op_errno)
e7a346
+                        goto name_heal;
e7a346
+                for (i = 0; i < priv->child_count; i++) {
e7a346
+                        if (!replies[i].valid)
e7a346
+                                continue;
e7a346
+                        if (par_readables[i] && replies[i].op_ret < 0 &&
e7a346
+                            replies[i].op_errno != ENOTCONN) {
e7a346
+                                goto name_heal;
e7a346
+                        }
e7a346
                 }
e7a346
-                return ret;
e7a346
-	}
e7a346
+        }
e7a346
+
e7a346
+        goto metadata_heal;
e7a346
+
e7a346
+name_heal:
e7a346
+        heal = afr_frame_create (this, NULL);
e7a346
+        if (!heal)
e7a346
+                goto metadata_heal;
e7a346
+
e7a346
+        ret = synctask_new (this->ctx->env, afr_lookup_selfheal_wrap,
e7a346
+                            afr_refresh_selfheal_done, heal, frame);
e7a346
+        if (ret) {
e7a346
+                AFR_STACK_DESTROY (heal);
e7a346
+                goto metadata_heal;
e7a346
+        }
e7a346
+        return ret;
e7a346
+
e7a346
 metadata_heal:
e7a346
         ret = afr_lookup_metadata_heal_check (frame, this);
e7a346
 
e7a346
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
e7a346
index bcd0e60..0a5be29 100644
e7a346
--- a/xlators/cluster/afr/src/afr-self-heal-name.c
e7a346
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
e7a346
@@ -634,20 +634,26 @@ afr_selfheal_name_unlocked_inspect (call_frame_t *frame, xlator_t *this,
e7a346
 			continue;
e7a346
 
e7a346
                 if ((replies[i].op_ret == -1) &&
e7a346
-                    (replies[i].op_errno == ENODATA))
e7a346
+                    (replies[i].op_errno == ENODATA)) {
e7a346
                         *need_heal = _gf_true;
e7a346
+                        break;
e7a346
+                }
e7a346
 
e7a346
 		if (first_idx == -1) {
e7a346
 			first_idx = i;
e7a346
 			continue;
e7a346
 		}
e7a346
 
e7a346
-		if (replies[i].op_ret != replies[first_idx].op_ret)
e7a346
+		if (replies[i].op_ret != replies[first_idx].op_ret) {
e7a346
 			*need_heal = _gf_true;
e7a346
+                        break;
e7a346
+                }
e7a346
 
e7a346
 		if (gf_uuid_compare (replies[i].poststat.ia_gfid,
e7a346
-				  replies[first_idx].poststat.ia_gfid))
e7a346
+				  replies[first_idx].poststat.ia_gfid)) {
e7a346
 			*need_heal = _gf_true;
e7a346
+                        break;
e7a346
+                }
e7a346
 	}
e7a346
 
e7a346
 	if (inode)
e7a346
-- 
e7a346
1.8.3.1
e7a346