Blob Blame History Raw
From afc090ba37104c6a478cdb41661b96d94ff2d089 Mon Sep 17 00:00:00 2001
From: Ravishankar N <ravishankar@redhat.com>
Date: Fri, 19 Jun 2015 15:29:06 +0530
Subject: [PATCH 123/129] afr: complete conservative merge even in case of gfid split-brain.

Patch URL in master: http://review.gluster.org/#/c/9429/
Patch URL in release 3.7: http://review.gluster.org/#/c/11327/

Problem:
While performing conservative merge, we bail out of the merge if we encounter a
file with mismatching gfid or type. What this means is all entries that come
after the mismatching file (during the merge) never get healed, no matter how
many index heals are done.

Fix:
Continue merging the rest of the entries even if a gfid/type mismatch is
found, but ensure that post-op does not happen on the parent dir in such a case.

Change-Id: I620911e14cf1a9c4482e44469fb00386dd4fe7ee
BUG: 1230517
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/51116
---
 tests/bugs/replicate/bug-1180545.t            |   48 +++++++++++++++++++++++++
 xlators/cluster/afr/src/afr-self-heal-entry.c |   20 +++++++++-
 2 files changed, 66 insertions(+), 2 deletions(-)
 create mode 100644 tests/bugs/replicate/bug-1180545.t

diff --git a/tests/bugs/replicate/bug-1180545.t b/tests/bugs/replicate/bug-1180545.t
new file mode 100644
index 0000000..748d5de
--- /dev/null
+++ b/tests/bugs/replicate/bug-1180545.t
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+#Create a gfid mismatch on one file in a directory and check that the
+#conservative merge of the directory still heals the other entries.
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../afr.rc
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+
+TEST $CLI volume create $V0 replica 2 $H0:$B0/brick{0,1}
+TEST $CLI volume set $V0 cluster.heal-timeout 60
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST $CLI volume start $V0
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
+
+#Create files with alternate bricks down. FILE is created once per brick, so its gfid mismatches.
+TEST mkdir $M0/DIR
+
+TEST kill_brick $V0 $H0 $B0/brick1
+TEST touch $M0/DIR/FILE
+TEST touch $M0/DIR/file{1..5}
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
+
+TEST kill_brick $V0 $H0 $B0/brick0
+TEST touch $M0/DIR/FILE
+TEST touch $M0/DIR/file{6..10}
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
+
+#Re-enable the self-heal daemon, trigger heal and verify number of entries in backend
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+TEST $CLI volume heal $V0
+
+EXPECT_WITHIN $HEAL_TIMEOUT '2' count_sh_entries $B0/brick0
+EXPECT_WITHIN $HEAL_TIMEOUT '2' count_sh_entries $B0/brick1
+#Expected pending heal count: two entries for DIR and two for FILE
+EXPECT_WITHIN $HEAL_TIMEOUT "4" afr_get_pending_heal_count $V0
+TEST diff <(ls $B0/brick0/DIR) <(ls $B0/brick1/DIR)
+cleanup
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index ab210b3..e64b6e4 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -508,6 +508,7 @@ afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this,
 	call_frame_t *iter_frame = NULL;
 	xlator_t *subvol = NULL;
 	afr_private_t *priv = NULL;
+        gf_boolean_t mismatch = _gf_false;
 
 	priv = this->private;
 	subvol = priv->children[child];
@@ -537,6 +538,11 @@ afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this,
                                                          entry->d_name);
 			AFR_STACK_RESET (iter_frame);
 
+                        if (ret == -1) {
+                                /* gfid or type mismatch. */
+                                mismatch = _gf_true;
+                                ret = 0;
+                        }
 			if (ret)
 				break;
 		}
@@ -547,6 +553,9 @@ afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this,
 	}
 
 	AFR_STACK_DESTROY (iter_frame);
+        if (mismatch == _gf_true)
+                /* undo pending will be skipped */
+                ret = -1;
 	return ret;
 }
 
@@ -557,6 +566,7 @@ afr_selfheal_entry_do (call_frame_t *frame, xlator_t *this, fd_t *fd,
 {
 	int i = 0;
 	afr_private_t *priv = NULL;
+        gf_boolean_t mismatch = _gf_false;
 	int ret = 0;
 
 	priv = this->private;
@@ -568,14 +578,20 @@ afr_selfheal_entry_do (call_frame_t *frame, xlator_t *this, fd_t *fd,
 		if (i != source && !healed_sinks[i])
 			continue;
 		ret = afr_selfheal_entry_do_subvol (frame, this, fd, i);
+                if (ret == -1) {
+                        /* gfid or type mismatch. */
+                        mismatch = _gf_true;
+                        continue;
+                }
 		if (ret)
 			break;
 	}
+        if (mismatch == _gf_true)
+                /* undo pending will be skipped */
+                ret = -1;
 	return ret;
 }
 
-
-
 static int
 __afr_selfheal_entry (call_frame_t *frame, xlator_t *this, fd_t *fd,
 		      unsigned char *locked_on)
-- 
1.7.1