From 6229320bc25ff24bb76f990c8e5411b6f1aa476c Mon Sep 17 00:00:00 2001
From: Ravishankar N <ravishankar@redhat.com>
Date: Sun, 28 Jan 2018 13:50:47 +0530
Subject: [PATCH 155/180] afr: don't treat all cases of all bricks being blamed
 as split-brain

Backport of https://review.gluster.org/#/c/19349/

Problem:
We currently don't have a roll-back/undoing of post-ops if quorum is not
met. Though the FOP is still unwound with failure, the xattrs remain on
the disk. Due to these partial post-ops and partial heals (healing only
when 2 bricks are up), we can end up in split-brain purely from the afr
xattrs' point of view, i.e. each brick is blamed by at least one of the
others. These scenarios are hit when there is frequent connect/disconnect
of the client/shd to the bricks while I/O or heal is in progress.

Fix:
Instead of undoing the post-op, pick a source based on the xattr values.
If two bricks blame one, the blamed brick must be treated as the sink.
If there is no majority, all are sources. Once we pick a source,
self-heal will then do the heal instead of erroring out due to
split-brain.
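The rule can be seen in isolation with a small stand-alone sketch (purely
illustrative; it ignores the self-accusal and locked_on handling that
afr_selfheal_find_direction() and afr_selfheal_post_op_failure_accounting()
in the patch below perform):

#include <stdio.h>

#define CHILD_COUNT 3

/* blames[i][j] == 1 means brick i's afr changelog xattrs accuse brick j. */
static void
pick_sources (int blames[CHILD_COUNT][CHILD_COUNT], int *sources)
{
        int accused[CHILD_COUNT] = {0};
        int quorum = CHILD_COUNT/2 + 1;
        int i = 0;
        int j = 0;

        for (i = 0; i < CHILD_COUNT; i++)
                for (j = 0; j < CHILD_COUNT; j++)
                        if (blames[i][j] && i != j)
                                accused[j] += 1;

        /* Only a brick accused by a quorum of its peers becomes a sink;
         * if nobody reaches quorum, every brick stays a source. */
        for (i = 0; i < CHILD_COUNT; i++)
                sources[i] = (accused[i] < quorum) ? 1 : 0;
}

int
main (void)
{
        /* Bricks 0 and 2 blame 1, brick 1 blames 0 and 2: only brick 1
         * is accused by a quorum (2 of 3), so it is the lone sink. */
        int two_blame_one[CHILD_COUNT][CHILD_COUNT] = {{0, 1, 0},
                                                       {1, 0, 1},
                                                       {0, 1, 0}};
        /* Cyclic blame 0->1->2->0: nobody reaches quorum, all are sources. */
        int cyclic[CHILD_COUNT][CHILD_COUNT] = {{0, 1, 0},
                                                {0, 0, 1},
                                                {1, 0, 0}};
        int sources[CHILD_COUNT] = {0};
        int i = 0;

        pick_sources (two_blame_one, sources);
        for (i = 0; i < CHILD_COUNT; i++)
                printf ("two-blame-one: brick%d source=%d\n", i, sources[i]);

        pick_sources (cyclic, sources);
        for (i = 0; i < CHILD_COUNT; i++)
                printf ("cyclic: brick%d source=%d\n", i, sources[i]);

        return 0;
}

Compiled stand-alone, the first case marks only brick 1 as a sink while the
cyclic case keeps all three bricks as sources instead of declaring
split-brain.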
Change-Id: I3d0224b883eb0945785ade0e9697a1c828aec0ae
BUG: 1384983
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/129245
Tested-by: RHGS Build Bot <nigelb@redhat.com>
---
 tests/basic/afr/arbiter-add-brick.t                | 16 ++++
 .../replicate/bug-1539358-split-brain-detection.t  | 89 ++++++++++++++++++++++
 tests/bugs/replicate/bug-802417.t                  | 12 +++
 xlators/cluster/afr/src/afr-self-heal-common.c     | 51 +++++++++++--
 xlators/cluster/afr/src/afr-self-heal-data.c       |  6 +-
 5 files changed, 165 insertions(+), 9 deletions(-)
 create mode 100755 tests/bugs/replicate/bug-1539358-split-brain-detection.t
diff --git a/tests/basic/afr/arbiter-add-brick.t b/tests/basic/afr/arbiter-add-brick.t
index fe919de..77b93d9 100644
--- a/tests/basic/afr/arbiter-add-brick.t
+++ b/tests/basic/afr/arbiter-add-brick.t
@@ -12,6 +12,8 @@ TEST $CLI volume set $V0 performance.stat-prefetch off
 TEST $CLI volume start $V0
 TEST $CLI volume set $V0 self-heal-daemon off
 TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
 TEST mkdir  $M0/dir1
 TEST dd if=/dev/urandom of=$M0/file1 bs=1024 count=1
 
@@ -24,6 +26,7 @@ TEST dd if=/dev/urandom of=$M0/file1 bs=1024 count=1024
 #convert replica 2 to arbiter volume
 TEST $CLI volume start $V0 force
 EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
 
 #syntax check for add-brick.
 TEST ! $CLI volume add-brick $V0 replica 2 arbiter 1 $H0:$B0/${V0}2
@@ -31,6 +34,19 @@ TEST ! $CLI volume add-brick $V0 replica 3 arbiter 2 $H0:$B0/${V0}2
 
 TEST $CLI volume add-brick $V0 replica 3 arbiter 1 $H0:$B0/${V0}2
 EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2
+
+#Trigger name heals from client. If we just rely on index heal, the first index
+#crawl on B0 fails for /, dir2 and /file either due to lock collision or files
+#not being present on the other 2 bricks yet. It is getting healed only in the
+#next crawl after priv->shd.timeout (600 seconds) or by manually launching
+#index heal again.
+TEST $CLI volume set $V0 data-self-heal off
+TEST $CLI volume set $V0 metadata-self-heal off
+TEST $CLI volume set $V0 entry-self-heal off
+TEST stat $M0/dir1
+TEST stat $M0/dir2
+TEST stat $M0/file1
 
 #Heal files
 TEST $CLI volume set $V0 self-heal-daemon on
diff --git a/tests/bugs/replicate/bug-1539358-split-brain-detection.t b/tests/bugs/replicate/bug-1539358-split-brain-detection.t
new file mode 100755
index 0000000..7b71a7a
--- /dev/null
+++ b/tests/bugs/replicate/bug-1539358-split-brain-detection.t
@@ -0,0 +1,89 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../afr.rc
+
+cleanup;
+
+## Start and create a volume
+TEST glusterd;
+TEST pidof glusterd;
+TEST $CLI volume info;
+
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2};
+TEST $CLI volume start $V0;
+EXPECT 'Started' volinfo_field $V0 'Status';
+TEST $CLI volume set $V0 self-heal-daemon off
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
+
+###############################################################################yy
+# Case of 2 bricks blaming the third and the third blaming the other two.
+
+TEST `echo "hello" >> $M0/file`
+
+# B0 and B2 must blame B1
+TEST kill_brick $V0 $H0 $B0/$V0"1"
+TEST `echo "append" >> $M0/file`
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}0/file trusted.afr.$V0-client-1 data
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}2/file trusted.afr.$V0-client-1 data
+CLIENT_MD5=$(md5sum $M0/file | cut -d\  -f1)
+
+# B1 must blame B0 and B2
+setfattr -n trusted.afr.$V0-client-0 -v 0x000000010000000000000000 $B0/$V0"1"/file
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000010000000000000000 $B0/$V0"1"/file
+
+# Launch heal
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
+TEST $CLI volume set $V0 self-heal-daemon on
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+B0_MD5=$(md5sum $B0/${V0}0/file | cut -d\  -f1)
+B1_MD5=$(md5sum $B0/${V0}1/file | cut -d\  -f1)
+B2_MD5=$(md5sum $B0/${V0}2/file | cut -d\  -f1)
+TEST [ "$CLIENT_MD5" == "$B0_MD5" ]
+TEST [ "$CLIENT_MD5" == "$B1_MD5" ]
+TEST [ "$CLIENT_MD5" == "$B2_MD5" ]
+
+TEST rm $M0/file
+
+###############################################################################yy
+# Case of each brick blaming the next one in a cyclic manner
+
+TEST `echo "hello" >> $M0/file`
+# Mark cyclic xattrs and modify file content directly on the bricks.
+TEST $CLI volume set $V0 self-heal-daemon off
+setfattr -n trusted.afr.$V0-client-1 -v 0x000000010000000000000000 $B0/$V0"0"/file
+setfattr -n trusted.afr.dirty -v 0x000000010000000000000000 $B0/$V0"0"/file
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000010000000000000000 $B0/$V0"1"/file
+setfattr -n trusted.afr.dirty -v 0x000000010000000000000000 $B0/$V0"1"/file
+setfattr -n trusted.afr.$V0-client-0 -v 0x000000010000000000000000 $B0/$V0"2"/file
+setfattr -n trusted.afr.dirty -v 0x000000010000000000000000 $B0/$V0"2"/file
+
+TEST `echo "ab" >> $B0/$V0"0"/file`
+TEST `echo "cdef" >> $B0/$V0"1"/file`
+TEST `echo "ghi" >> $B0/$V0"2"/file`
+
+# Add entry to xattrop dir to trigger index heal.
+xattrop_dir0=$(afr_get_index_path $B0/$V0"0")
+base_entry_b0=`ls $xattrop_dir0`
+gfid_str=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/$V0"0"/file))
+ln $xattrop_dir0/$base_entry_b0 $xattrop_dir0/$gfid_str
+EXPECT_WITHIN $HEAL_TIMEOUT "^1$" get_pending_heal_count $V0
+
+# Launch heal
+TEST $CLI volume set $V0 self-heal-daemon on
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+B0_MD5=$(md5sum $B0/${V0}0/file | cut -d\  -f1)
+B1_MD5=$(md5sum $B0/${V0}1/file | cut -d\  -f1)
+B2_MD5=$(md5sum $B0/${V0}2/file | cut -d\  -f1)
+TEST [ "$B0_MD5" == "$B1_MD5" ]
+TEST [ "$B0_MD5" == "$B2_MD5" ]
+###############################################################################yy
+cleanup
diff --git a/tests/bugs/replicate/bug-802417.t b/tests/bugs/replicate/bug-802417.t
index c5ba98b..f213439 100755
--- a/tests/bugs/replicate/bug-802417.t
+++ b/tests/bugs/replicate/bug-802417.t
@@ -10,6 +10,18 @@ function write_file()
 }
 
 cleanup;
+
+#####################################################
+# We are currently not triggering data heal unless all bricks of the replica are
+# up. We will need to modify this .t once the fix for preventing stale reads
+# being served to clients for files in spurious split-brains is done. Spurious
+# split-brains here means afr xattrs indicates sbrain but it is actually not.
+# Self-heal will heal such files automatically but before the heal completes,
+# reads can be served which needs fixing.
+#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=000000
+#G_TESTDEF_TEST_STATUS_CENTOS6=BAD_TEST,BUG=000000
+######################################################
+
 TEST glusterd
 TEST pidof glusterd
 TEST $CLI volume info;
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 26d3860..f61b237 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -1455,6 +1455,36 @@ afr_does_witness_exist (xlator_t *this, uint64_t *witness)
         return _gf_false;
 }
 
+unsigned int
+afr_get_quorum_count (afr_private_t *priv)
+{
+        if (priv->quorum_count == AFR_QUORUM_AUTO) {
+                return priv->child_count/2 + 1;
+        } else {
+                return priv->quorum_count;
+        }
+}
+
+void
+afr_selfheal_post_op_failure_accounting (afr_private_t *priv, char *accused,
+                                         unsigned char *sources,
+                                         unsigned char *locked_on)
+{
+        int i = 0;
+        unsigned int  quorum_count = 0;
+
+        if (AFR_COUNT (sources, priv->child_count) != 0)
+                return;
+
+        quorum_count = afr_get_quorum_count (priv);
+        for (i = 0; i < priv->child_count; i++) {
+                if ((accused[i] < quorum_count) && locked_on[i]) {
+                        sources[i] = 1;
+                }
+        }
+        return;
+}
+
 /*
  * This function determines if a self-heal is required for a given inode,
  * and if needed, in what direction.
@@ -1490,6 +1520,7 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,
         char *accused = NULL;/* Accused others without any self-accusal */
         char *pending = NULL;/* Have pending operations on others */
         char *self_accused = NULL; /* Accused itself */
+        int min_participants = -1;
 
 	priv = this->private;
 
@@ -1513,8 +1544,13 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,
                 }
         }
 
+        if (type == AFR_DATA_TRANSACTION) {
+                min_participants = priv->child_count;
+        } else {
+                min_participants = AFR_SH_MIN_PARTICIPANTS;
+        }
         if (afr_success_count (replies,
-                               priv->child_count) < AFR_SH_MIN_PARTICIPANTS) {
+                               priv->child_count) < min_participants) {
                 /* Treat this just like locks not being acquired */
                 return -ENOTCONN;
         }
@@ -1530,11 +1566,10 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,
 	for (i = 0; i < priv->child_count; i++) {
 		for (j = 0; j < priv->child_count; j++) {
                         if (matrix[i][j]) {
-                                 if (!self_accused[i])
-                                         accused[j] = 1;
-
-                                 if (i != j)
-                                         pending[i] = 1;
+                                if (!self_accused[i])
+                                        accused[j] += 1;
+                                if (i != j)
+                                        pending[i] += 1;
                          }
 		}
 	}
@@ -1575,6 +1610,10 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,
                 }
         }
 
+        if (type == AFR_DATA_TRANSACTION)
+                afr_selfheal_post_op_failure_accounting (priv, accused,
+                                                         sources, locked_on);
+
          /* If no sources, all locked nodes are sinks - split brain */
          if (AFR_COUNT (sources, priv->child_count) == 0) {
                 for (i = 0; i < priv->child_count; i++) {
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index 8cf43f2..bcd0dec 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -684,7 +684,7 @@ __afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd,
 	ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name, 0, 0,
 				    data_lock);
 	{
-		if (ret < AFR_SH_MIN_PARTICIPANTS) {
+		if (ret < priv->child_count) {
                         gf_msg_debug (this->name, 0, "%s: Skipping "
                                       "self-heal as only %d number "
                                       "of subvolumes "
@@ -749,7 +749,7 @@ restore_time:
         if (!is_arbiter_the_only_sink) {
                 ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name,
                                             0, 0, data_lock);
-                if (ret < AFR_SH_MIN_PARTICIPANTS) {
+                if (ret < priv->child_count) {
                         ret = -ENOTCONN;
                         did_sh = _gf_false;
                         goto skip_undo_pending;
@@ -878,7 +878,7 @@ afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode)
 	                                        priv->sh_domain, 0, 0,
 				                locked_on);
 	{
-		if (ret < AFR_SH_MIN_PARTICIPANTS) {
+		if (ret < priv->child_count) {
                         gf_msg_debug (this->name, 0, "%s: Skipping "
                                       "self-heal as only %d number of "
                                       "subvolumes could be locked",
-- 
1.8.3.1