d1681e
From 6229320bc25ff24bb76f990c8e5411b6f1aa476c Mon Sep 17 00:00:00 2001
d1681e
From: Ravishankar N <ravishankar@redhat.com>
d1681e
Date: Sun, 28 Jan 2018 13:50:47 +0530
d1681e
Subject: [PATCH 155/180] afr: don't treat all cases of all bricks being blamed as
d1681e
 split-brain
d1681e
d1681e
Backport of https://review.gluster.org/#/c/19349/
d1681e
d1681e
Problem:
d1681e
We currently don't have a roll-back/undoing of post-ops if quorum is not
d1681e
met. Though the FOP is still unwound with failure, the xattrs remain on
d1681e
the disk.  Due to these partial post-ops and partial heals (healing only when
d1681e
2 bricks are up), we can end up in split-brain purely from the afr
d1681e
xattrs point of view, i.e. each brick is blamed by at least one of the
d1681e
others. These scenarios are hit when there is frequent
d1681e
connect/disconnect of the client/shd to the bricks while I/O or heal
d1681e
are in progress.
d1681e
d1681e
Fix:
d1681e
Instead of undoing the post-op, pick a source based on the xattr values.
d1681e
If 2 bricks blame one, the blamed one must be treated as sink.
d1681e
If there is no majority, all are sources. Once we pick a source,
d1681e
self-heal will then do the heal instead of erroring out due to
d1681e
split-brain.
d1681e
d1681e
Change-Id: I3d0224b883eb0945785ade0e9697a1c828aec0ae
d1681e
BUG: 1384983
d1681e
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
d1681e
Reviewed-on: https://code.engineering.redhat.com/gerrit/129245
d1681e
Tested-by: RHGS Build Bot <nigelb@redhat.com>
d1681e
---
d1681e
 tests/basic/afr/arbiter-add-brick.t                | 16 ++++
d1681e
 .../replicate/bug-1539358-split-brain-detection.t  | 89 ++++++++++++++++++++++
d1681e
 tests/bugs/replicate/bug-802417.t                  | 12 +++
d1681e
 xlators/cluster/afr/src/afr-self-heal-common.c     | 51 +++++++++++--
d1681e
 xlators/cluster/afr/src/afr-self-heal-data.c       |  6 +-
d1681e
 5 files changed, 165 insertions(+), 9 deletions(-)
d1681e
 create mode 100755 tests/bugs/replicate/bug-1539358-split-brain-detection.t
d1681e
d1681e
diff --git a/tests/basic/afr/arbiter-add-brick.t b/tests/basic/afr/arbiter-add-brick.t
d1681e
index fe919de..77b93d9 100644
d1681e
--- a/tests/basic/afr/arbiter-add-brick.t
d1681e
+++ b/tests/basic/afr/arbiter-add-brick.t
d1681e
@@ -12,6 +12,8 @@ TEST $CLI volume set $V0 performance.stat-prefetch off
d1681e
 TEST $CLI volume start $V0
d1681e
 TEST $CLI volume set $V0 self-heal-daemon off
d1681e
 TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
d1681e
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
d1681e
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
d1681e
 TEST mkdir  $M0/dir1
d1681e
 TEST dd if=/dev/urandom of=$M0/file1 bs=1024 count=1
d1681e
 
d1681e
@@ -24,6 +26,7 @@ TEST dd if=/dev/urandom of=$M0/file1 bs=1024 count=1024
d1681e
 #convert replica 2 to arbiter volume
d1681e
 TEST $CLI volume start $V0 force
d1681e
 EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
d1681e
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
d1681e
 
d1681e
 #syntax check for add-brick.
d1681e
 TEST ! $CLI volume add-brick $V0 replica 2 arbiter 1 $H0:$B0/${V0}2
d1681e
@@ -31,6 +34,19 @@ TEST ! $CLI volume add-brick $V0 replica 3 arbiter 2 $H0:$B0/${V0}2
d1681e
 
d1681e
 TEST $CLI volume add-brick $V0 replica 3 arbiter 1 $H0:$B0/${V0}2
d1681e
 EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2
d1681e
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2
d1681e
+
d1681e
+#Trigger name heals from client. If we just rely on index heal, the first index
d1681e
+#crawl on B0 fails for /, dir2 and /file either due to lock collision or files
d1681e
+#not being present on the other 2 bricks yet. It is getting healed only in the
d1681e
+#next crawl after priv->shd.timeout (600 seconds) or by manually launching
d1681e
+#index heal again.
d1681e
+TEST $CLI volume set $V0 data-self-heal off
d1681e
+TEST $CLI volume set $V0 metadata-self-heal off
d1681e
+TEST $CLI volume set $V0 entry-self-heal off
d1681e
+TEST stat $M0/dir1
d1681e
+TEST stat $M0/dir2
d1681e
+TEST stat $M0/file1
d1681e
 
d1681e
 #Heal files
d1681e
 TEST $CLI volume set $V0 self-heal-daemon on
d1681e
diff --git a/tests/bugs/replicate/bug-1539358-split-brain-detection.t b/tests/bugs/replicate/bug-1539358-split-brain-detection.t
d1681e
new file mode 100755
d1681e
index 0000000..7b71a7a
d1681e
--- /dev/null
d1681e
+++ b/tests/bugs/replicate/bug-1539358-split-brain-detection.t
d1681e
@@ -0,0 +1,89 @@
d1681e
+#!/bin/bash
d1681e
+
d1681e
+. $(dirname $0)/../../include.rc
d1681e
+. $(dirname $0)/../../volume.rc
d1681e
+. $(dirname $0)/../../afr.rc
d1681e
+
d1681e
+cleanup;
d1681e
+
d1681e
+## Start and create a volume
d1681e
+TEST glusterd;
d1681e
+TEST pidof glusterd;
d1681e
+TEST $CLI volume info;
d1681e
+
d1681e
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2};
d1681e
+TEST $CLI volume start $V0;
d1681e
+EXPECT 'Started' volinfo_field $V0 'Status';
d1681e
+TEST $CLI volume set $V0 self-heal-daemon off
d1681e
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
d1681e
+
d1681e
+###############################################################################yy
d1681e
+# Case of 2 bricks blaming the third and the third blaming the other two.
d1681e
+
d1681e
+TEST `echo "hello" >> $M0/file`
d1681e
+
d1681e
+# B0 and B2 must blame B1
d1681e
+TEST kill_brick $V0 $H0 $B0/$V0"1"
d1681e
+TEST `echo "append" >> $M0/file`
d1681e
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}0/file trusted.afr.$V0-client-1 data
d1681e
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}2/file trusted.afr.$V0-client-1 data
d1681e
+CLIENT_MD5=$(md5sum $M0/file | cut -d\  -f1)
d1681e
+
d1681e
+# B1 must blame B0 and B2
d1681e
+setfattr -n trusted.afr.$V0-client-0 -v 0x000000010000000000000000 $B0/$V0"1"/file
d1681e
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000010000000000000000 $B0/$V0"1"/file
d1681e
+
d1681e
+# Launch heal
d1681e
+TEST $CLI volume start $V0 force
d1681e
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
d1681e
+TEST $CLI volume set $V0 self-heal-daemon on
d1681e
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
d1681e
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
d1681e
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
d1681e
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
d1681e
+TEST $CLI volume heal $V0
d1681e
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
d1681e
+B0_MD5=$(md5sum $B0/${V0}0/file | cut -d\  -f1)
d1681e
+B1_MD5=$(md5sum $B0/${V0}1/file | cut -d\  -f1)
d1681e
+B2_MD5=$(md5sum $B0/${V0}2/file | cut -d\  -f1)
d1681e
+TEST [ "$CLIENT_MD5" == "$B0_MD5" ]
d1681e
+TEST [ "$CLIENT_MD5" == "$B1_MD5" ]
d1681e
+TEST [ "$CLIENT_MD5" == "$B2_MD5" ]
d1681e
+
d1681e
+TEST rm $M0/file
d1681e
+
d1681e
+###############################################################################yy
d1681e
+# Case of each brick blaming the next one in a cyclic manner
d1681e
+
d1681e
+TEST `echo "hello" >> $M0/file`
d1681e
+# Mark cyclic xattrs and modify file content directly on the bricks.
d1681e
+TEST $CLI volume set $V0 self-heal-daemon off
d1681e
+setfattr -n trusted.afr.$V0-client-1 -v 0x000000010000000000000000 $B0/$V0"0"/file
d1681e
+setfattr -n trusted.afr.dirty -v 0x000000010000000000000000 $B0/$V0"0"/file
d1681e
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000010000000000000000 $B0/$V0"1"/file
d1681e
+setfattr -n trusted.afr.dirty -v 0x000000010000000000000000 $B0/$V0"1"/file
d1681e
+setfattr -n trusted.afr.$V0-client-0 -v 0x000000010000000000000000 $B0/$V0"2"/file
d1681e
+setfattr -n trusted.afr.dirty -v 0x000000010000000000000000 $B0/$V0"2"/file
d1681e
+
d1681e
+TEST `echo "ab" >> $B0/$V0"0"/file`
d1681e
+TEST `echo "cdef" >> $B0/$V0"1"/file`
d1681e
+TEST `echo "ghi" >> $B0/$V0"2"/file`
d1681e
+
d1681e
+# Add entry to xattrop dir to trigger index heal.
d1681e
+xattrop_dir0=$(afr_get_index_path $B0/$V0"0")
d1681e
+base_entry_b0=`ls $xattrop_dir0`
d1681e
+gfid_str=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/$V0"0"/file))
d1681e
+ln $xattrop_dir0/$base_entry_b0 $xattrop_dir0/$gfid_str
d1681e
+EXPECT_WITHIN $HEAL_TIMEOUT "^1$" get_pending_heal_count $V0
d1681e
+
d1681e
+# Launch heal
d1681e
+TEST $CLI volume set $V0 self-heal-daemon on
d1681e
+TEST $CLI volume heal $V0
d1681e
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
d1681e
+B0_MD5=$(md5sum $B0/${V0}0/file | cut -d\  -f1)
d1681e
+B1_MD5=$(md5sum $B0/${V0}1/file | cut -d\  -f1)
d1681e
+B2_MD5=$(md5sum $B0/${V0}2/file | cut -d\  -f1)
d1681e
+TEST [ "$B0_MD5" == "$B1_MD5" ]
d1681e
+TEST [ "$B0_MD5" == "$B2_MD5" ]
d1681e
+###############################################################################yy
d1681e
+cleanup
d1681e
diff --git a/tests/bugs/replicate/bug-802417.t b/tests/bugs/replicate/bug-802417.t
d1681e
index c5ba98b..f213439 100755
d1681e
--- a/tests/bugs/replicate/bug-802417.t
d1681e
+++ b/tests/bugs/replicate/bug-802417.t
d1681e
@@ -10,6 +10,18 @@ function write_file()
d1681e
 }
d1681e
 
d1681e
 cleanup;
d1681e
+
d1681e
+#####################################################
d1681e
+# We are currently not triggering data heal unless all bricks of the replica are
d1681e
+# up. We will need to modify this .t once the fix for preventing stale reads
d1681e
+# being served to clients for files in spurious split-brains is done. Spurious
d1681e
+# split-brains here means afr xattrs indicates sbrain but it is actually not.
d1681e
+# Self-heal will heal such files automatically but before the heal completes,
d1681e
+# reads can be served which needs fixing.
d1681e
+#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=000000
d1681e
+#G_TESTDEF_TEST_STATUS_CENTOS6=BAD_TEST,BUG=000000
d1681e
+######################################################
d1681e
+
d1681e
 TEST glusterd
d1681e
 TEST pidof glusterd
d1681e
 TEST $CLI volume info;
d1681e
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
d1681e
index 26d3860..f61b237 100644
d1681e
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
d1681e
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
d1681e
@@ -1455,6 +1455,36 @@ afr_does_witness_exist (xlator_t *this, uint64_t *witness)
d1681e
         return _gf_false;
d1681e
 }
d1681e
 
d1681e
+unsigned int
d1681e
+afr_get_quorum_count (afr_private_t *priv)
d1681e
+{
d1681e
+        if (priv->quorum_count == AFR_QUORUM_AUTO) {
d1681e
+                return priv->child_count/2 + 1;
d1681e
+        } else {
d1681e
+                return priv->quorum_count;
d1681e
+        }
d1681e
+}
d1681e
+
d1681e
+void
d1681e
+afr_selfheal_post_op_failure_accounting (afr_private_t *priv, char *accused,
d1681e
+                                         unsigned char *sources,
d1681e
+                                         unsigned char *locked_on)
d1681e
+{
d1681e
+        int i = 0;
d1681e
+        unsigned int  quorum_count = 0;
d1681e
+
d1681e
+        if (AFR_COUNT (sources, priv->child_count) != 0)
d1681e
+                return;
d1681e
+
d1681e
+        quorum_count = afr_get_quorum_count (priv);
d1681e
+        for (i = 0; i < priv->child_count; i++) {
d1681e
+                if ((accused[i] < quorum_count) && locked_on[i]) {
d1681e
+                        sources[i] = 1;
d1681e
+                }
d1681e
+        }
d1681e
+        return;
d1681e
+}
d1681e
+
d1681e
 /*
d1681e
  * This function determines if a self-heal is required for a given inode,
d1681e
  * and if needed, in what direction.
d1681e
@@ -1490,6 +1520,7 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,
d1681e
         char *accused = NULL;/* Accused others without any self-accusal */
d1681e
         char *pending = NULL;/* Have pending operations on others */
d1681e
         char *self_accused = NULL; /* Accused itself */
d1681e
+        int min_participants = -1;
d1681e
 
d1681e
 	priv = this->private;
d1681e
 
d1681e
@@ -1513,8 +1544,13 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,
d1681e
                 }
d1681e
         }
d1681e
 
d1681e
+        if (type == AFR_DATA_TRANSACTION) {
d1681e
+                min_participants = priv->child_count;
d1681e
+        } else {
d1681e
+                min_participants = AFR_SH_MIN_PARTICIPANTS;
d1681e
+        }
d1681e
         if (afr_success_count (replies,
d1681e
-                               priv->child_count) < AFR_SH_MIN_PARTICIPANTS) {
d1681e
+                               priv->child_count) < min_participants) {
d1681e
                 /* Treat this just like locks not being acquired */
d1681e
                 return -ENOTCONN;
d1681e
         }
d1681e
@@ -1530,11 +1566,10 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,
d1681e
 	for (i = 0; i < priv->child_count; i++) {
d1681e
 		for (j = 0; j < priv->child_count; j++) {
d1681e
                         if (matrix[i][j]) {
d1681e
-                                 if (!self_accused[i])
d1681e
-                                         accused[j] = 1;
d1681e
-
d1681e
-                                 if (i != j)
d1681e
-                                         pending[i] = 1;
d1681e
+                                if (!self_accused[i])
d1681e
+                                        accused[j] += 1;
d1681e
+                                if (i != j)
d1681e
+                                        pending[i] += 1;
d1681e
                          }
d1681e
 		}
d1681e
 	}
d1681e
@@ -1575,6 +1610,10 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,
d1681e
                 }
d1681e
         }
d1681e
 
d1681e
+        if (type == AFR_DATA_TRANSACTION)
d1681e
+                afr_selfheal_post_op_failure_accounting (priv, accused,
d1681e
+                                                         sources, locked_on);
d1681e
+
d1681e
          /* If no sources, all locked nodes are sinks - split brain */
d1681e
          if (AFR_COUNT (sources, priv->child_count) == 0) {
d1681e
                 for (i = 0; i < priv->child_count; i++) {
d1681e
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
d1681e
index 8cf43f2..bcd0dec 100644
d1681e
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
d1681e
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
d1681e
@@ -684,7 +684,7 @@ __afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd,
d1681e
 	ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name, 0, 0,
d1681e
 				    data_lock);
d1681e
 	{
d1681e
-		if (ret < AFR_SH_MIN_PARTICIPANTS) {
d1681e
+		if (ret < priv->child_count) {
d1681e
                         gf_msg_debug (this->name, 0, "%s: Skipping "
d1681e
                                       "self-heal as only %d number "
d1681e
                                       "of subvolumes "
d1681e
@@ -749,7 +749,7 @@ restore_time:
d1681e
         if (!is_arbiter_the_only_sink) {
d1681e
                 ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name,
d1681e
                                             0, 0, data_lock);
d1681e
-                if (ret < AFR_SH_MIN_PARTICIPANTS) {
d1681e
+                if (ret < priv->child_count) {
d1681e
                         ret = -ENOTCONN;
d1681e
                         did_sh = _gf_false;
d1681e
                         goto skip_undo_pending;
d1681e
@@ -878,7 +878,7 @@ afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode)
d1681e
 	                                        priv->sh_domain, 0, 0,
d1681e
 				                locked_on);
d1681e
 	{
d1681e
-		if (ret < AFR_SH_MIN_PARTICIPANTS) {
d1681e
+		if (ret < priv->child_count) {
d1681e
                         gf_msg_debug (this->name, 0, "%s: Skipping "
d1681e
                                       "self-heal as only %d number of "
d1681e
                                       "subvolumes could be locked",
d1681e
-- 
d1681e
1.8.3.1
d1681e