3604df
From 6a4c9ea70799464cdbe2eb0197698c9a5ebb7c5d Mon Sep 17 00:00:00 2001
3604df
From: Ravishankar N <ravishankar@redhat.com>
3604df
Date: Mon, 30 Jan 2017 09:54:16 +0530
3604df
Subject: [PATCH 286/294] afr: all children of AFR must be up to resolve
3604df
 split-brain
3604df
3604df
Backport of: https://review.gluster.org/16476
3604df
3604df
Problem:
3604df
The various split-brain resolution policies (favorite-child-policy based,
3604df
CLI based and mount (get/setfattr) based) attempt to resolve split-brain
3604df
even when not all bricks of the replica are up. This can be a problem when,
3604df
say, in a replica 3 volume the only good copy is down and the other 2 bricks
3604df
are up and blame each other (i.e. split-brain). We end up healing the
3604df
file in such a case and allow I/O on it.
3604df
3604df
Fix:
3604df
A decision on whether the file is in split-brain or not must be taken
3604df
only if we are able to examine the afr xattrs of *all* bricks of a given
3604df
replica.
3604df
3604df
Change-Id: Icddb1268b380005799990f5379ef957d84639ef9
3604df
BUG: 1417177
3604df
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
3604df
Reviewed-on: https://code.engineering.redhat.com/gerrit/97384
3604df
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
3604df
---
3604df
 .../bug-1417522-block-split-brain-resolution.t     | 66 ++++++++++++++++++++++
3604df
 xlators/cluster/afr/src/afr-common.c               | 32 +++++++----
3604df
 xlators/cluster/afr/src/afr-self-heal-common.c     | 38 +++++++++++--
3604df
 xlators/cluster/afr/src/afr-self-heal.h            |  6 +-
3604df
 4 files changed, 127 insertions(+), 15 deletions(-)
3604df
 create mode 100644 tests/bugs/replicate/bug-1417522-block-split-brain-resolution.t
3604df
3604df
diff --git a/tests/bugs/replicate/bug-1417522-block-split-brain-resolution.t b/tests/bugs/replicate/bug-1417522-block-split-brain-resolution.t
3604df
new file mode 100644
3604df
index 0000000..4592ebf
3604df
--- /dev/null
3604df
+++ b/tests/bugs/replicate/bug-1417522-block-split-brain-resolution.t
3604df
@@ -0,0 +1,66 @@
3604df
+#!/bin/bash
3604df
+. $(dirname $0)/../../include.rc
3604df
+. $(dirname $0)/../../volume.rc
3604df
+cleanup;
3604df
+
3604df
+TEST glusterd
3604df
+TEST pidof glusterd
3604df
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0..2}
3604df
+TEST $CLI volume set $V0 self-heal-daemon off
3604df
+TEST $CLI volume set $V0 data-self-heal off
3604df
+TEST $CLI volume set $V0 entry-self-heal off
3604df
+TEST $CLI volume set $V0 metadata-self-heal off
3604df
+TEST $CLI volume start $V0
3604df
+
3604df
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0;
3604df
+TEST touch $M0/file
3604df
+
3604df
+TEST kill_brick $V0 $H0 $B0/${V0}1
3604df
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=10
3604df
+TEST $CLI volume start $V0 force
3604df
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
3604df
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
3604df
+TEST kill_brick $V0 $H0 $B0/${V0}2
3604df
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=20
3604df
+TEST $CLI volume start $V0 force
3604df
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2
3604df
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2
3604df
+TEST kill_brick $V0 $H0 $B0/${V0}0
3604df
+TEST ! dd if=$M0/file of=/dev/null
3604df
+SOURCE_BRICK_MD5=$(md5sum $B0/${V0}0/file | cut -d\  -f1)
3604df
+
3604df
+# Various fav-child policies must not heal the file when some bricks are down.
3604df
+TEST $CLI volume set $V0 favorite-child-policy size
3604df
+TEST ! dd if=$M0/file of=/dev/null
3604df
+TEST $CLI volume set $V0 favorite-child-policy ctime
3604df
+TEST ! dd if=$M0/file of=/dev/null
3604df
+TEST $CLI volume set $V0 favorite-child-policy mtime
3604df
+TEST ! dd if=$M0/file of=/dev/null
3604df
+TEST $CLI volume set $V0 favorite-child-policy majority
3604df
+TEST ! dd if=$M0/file of=/dev/null
3604df
+
3604df
+# CLI/mount based split-brain resolution of /file must also fail while brick 0 is down.
3604df
+TEST ! $CLI volume heal $V0 split-brain bigger-file /file
3604df
+TEST ! $CLI volume heal $V0 split-brain mtime /file
3604df
+TEST ! $CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}2 /file
3604df
+
3604df
+TEST ! getfattr -n replica.split-brain-status $M0/file
3604df
+TEST ! setfattr -n replica.split-brain-choice -v $V0-client-1 $M0/file
3604df
+
3604df
+# Bring all bricks back up, launch heal and wait for it to finish.
3604df
+TEST $CLI volume set $V0 self-heal-daemon on
3604df
+TEST $CLI volume start $V0 force
3604df
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
3604df
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
3604df
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
3604df
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
3604df
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
3604df
+TEST $CLI volume heal $V0
3604df
+EXPECT_WITHIN $HEAL_TIMEOUT "0" get_pending_heal_count $V0
3604df
+B1_MD5=$(md5sum $B0/${V0}1/file | cut -d\  -f1)
3604df
+B2_MD5=$(md5sum $B0/${V0}2/file | cut -d\  -f1)
3604df
+TEST [ "$SOURCE_BRICK_MD5" == "$B1_MD5" ]
3604df
+TEST [ "$SOURCE_BRICK_MD5" == "$B2_MD5" ]
3604df
+
3604df
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
3604df
+cleanup;
3604df
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
3604df
index 696e909..03ae7f9 100644
3604df
--- a/xlators/cluster/afr/src/afr-common.c
3604df
+++ b/xlators/cluster/afr/src/afr-common.c
3604df
@@ -727,14 +727,17 @@ afr_set_split_brain_choice (int ret, call_frame_t *frame, void *opaque)
3604df
         gf_boolean_t        timer_reset      = _gf_false;
3604df
         int                 old_spb_choice   = -1;
3604df
 
3604df
-        if (ret)
3604df
-                goto out;
3604df
-
3604df
         frame = data->frame;
3604df
         loc = data->loc;
3604df
         this = frame->this;
3604df
         priv = this->private;
3604df
 
3604df
+        if (ret) {
3604df
+                op_errno = -ret;
3604df
+                ret = -1;
3604df
+                goto out;
3604df
+        }
3604df
+
3604df
         delta.tv_sec = priv->spb_choice_timeout;
3604df
         delta.tv_nsec = 0;
3604df
 
3604df
@@ -5551,6 +5554,12 @@ afr_is_split_brain (call_frame_t *frame, xlator_t *this, inode_t *inode,
3604df
         if (ret)
3604df
                 goto out;
3604df
 
3604df
+        if (!afr_can_decide_split_brain_source_sinks (replies,
3604df
+                                                      priv->child_count)) {
3604df
+                ret = -EAGAIN;
3604df
+                goto out;
3604df
+        }
3604df
+
3604df
         ret = _afr_is_split_brain (frame, this, replies,
3604df
                                     AFR_DATA_TRANSACTION, d_spb);
3604df
         if (ret)
3604df
@@ -5603,6 +5612,13 @@ afr_get_split_brain_status (void *opaque)
3604df
         if (!inode)
3604df
                 goto out;
3604df
 
3604df
+        dict = dict_new ();
3604df
+        if (!dict) {
3604df
+                op_errno = ENOMEM;
3604df
+                ret = -1;
3604df
+                goto out;
3604df
+        }
3604df
+
3604df
         /* Calculation for string length :
3604df
         * (child_count X length of child-name) + strlen ("    Choices :")
3604df
         * child-name consists of :
3604df
@@ -5616,13 +5632,9 @@ afr_get_split_brain_status (void *opaque)
3604df
                                   &m_spb);
3604df
         if (ret) {
3604df
                 op_errno = -ret;
3604df
-                ret = -1;
3604df
-                goto out;
3604df
-        }
3604df
-
3604df
-        dict = dict_new ();
3604df
-        if (!dict) {
3604df
-                op_errno = ENOMEM;
3604df
+                if (ret == -EAGAIN)
3604df
+                        ret = dict_set_str (dict, GF_AFR_SBRAIN_STATUS,
3604df
+                                            SBRAIN_HEAL_NO_GO_MSG);
3604df
                 ret = -1;
3604df
                 goto out;
3604df
         }
3604df
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
3604df
index 74696b5..f731d42 100644
3604df
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
3604df
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
3604df
@@ -473,6 +473,19 @@ afr_dict_contains_heal_op (call_frame_t *frame)
3604df
         return _gf_true;
3604df
 }
3604df
 
3604df
+gf_boolean_t
3604df
+afr_can_decide_split_brain_source_sinks (struct afr_reply *replies,
3604df
+                                         int child_count)
3604df
+{
3604df
+        int i = 0;
3604df
+
3604df
+        for (i = 0; i < child_count; i++)
3604df
+                if (replies[i].valid != 1 || replies[i].op_ret != 0)
3604df
+                        return _gf_false;
3604df
+
3604df
+        return _gf_true;
3604df
+}
3604df
+
3604df
 int
3604df
 afr_mark_split_brain_source_sinks_by_heal_op (call_frame_t *frame,
3604df
                                    xlator_t *this, unsigned char *sources,
3604df
@@ -511,6 +524,14 @@ afr_mark_split_brain_source_sinks_by_heal_op (call_frame_t *frame,
3604df
         }
3604df
         xdata_rsp = local->xdata_rsp;
3604df
 
3604df
+        if (!afr_can_decide_split_brain_source_sinks (replies,
3604df
+                                                      priv->child_count)) {
3604df
+                ret = dict_set_str (xdata_rsp, "sh-fail-msg",
3604df
+                                    SBRAIN_HEAL_NO_GO_MSG);
3604df
+                ret = -1;
3604df
+                goto out;
3604df
+        }
3604df
+
3604df
         for (i = 0 ; i < priv->child_count; i++)
3604df
                 if (locked_on[i])
3604df
                         sources[i] = 1;
3604df
@@ -749,26 +770,35 @@ afr_sh_get_fav_by_policy (xlator_t *this, struct afr_reply *replies,
3604df
         int fav_child = -1;
3604df
 
3604df
         priv = this->private;
3604df
+        if (!afr_can_decide_split_brain_source_sinks (replies,
3604df
+                                                      priv->child_count)) {
3604df
+                return -1;
3604df
+        }
3604df
+
3604df
         switch (priv->fav_child_policy) {
3604df
         case AFR_FAV_CHILD_BY_SIZE:
3604df
                 fav_child = afr_sh_fav_by_size (this, replies, inode);
3604df
-                if (policy_str && fav_child >= 0)
3604df
+                if (policy_str && fav_child >= 0) {
3604df
                         *policy_str = "SIZE";
3604df
+                }
3604df
                 break;
3604df
         case AFR_FAV_CHILD_BY_CTIME:
3604df
                 fav_child = afr_sh_fav_by_ctime (this, replies, inode);
3604df
-                if (policy_str && fav_child >= 0)
3604df
+                if (policy_str && fav_child >= 0) {
3604df
                         *policy_str = "CTIME";
3604df
+                }
3604df
                 break;
3604df
         case AFR_FAV_CHILD_BY_MTIME:
3604df
                 fav_child = afr_sh_fav_by_mtime (this, replies, inode);
3604df
-                if (policy_str && fav_child >= 0)
3604df
+                if (policy_str && fav_child >= 0) {
3604df
                         *policy_str = "MTIME";
3604df
+                }
3604df
                 break;
3604df
         case AFR_FAV_CHILD_BY_MAJORITY:
3604df
                 fav_child = afr_sh_fav_by_majority (this, replies, inode);
3604df
-                if (policy_str && fav_child >= 0)
3604df
+                if (policy_str && fav_child >= 0) {
3604df
                         *policy_str = "MAJORITY";
3604df
+                }
3604df
                 break;
3604df
         case AFR_FAV_CHILD_NONE:
3604df
         default:
3604df
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
3604df
index 500227a..a339050 100644
3604df
--- a/xlators/cluster/afr/src/afr-self-heal.h
3604df
+++ b/xlators/cluster/afr/src/afr-self-heal.h
3604df
@@ -81,7 +81,8 @@
3604df
 
3604df
 #define IA_EQUAL(f,s,field) (memcmp (&(f.ia_##field), &(s.ia_##field), sizeof (s.ia_##field)) == 0)
3604df
 
3604df
-
3604df
+#define SBRAIN_HEAL_NO_GO_MSG "Failed to obtain replies from all bricks of "\
3604df
+                      "the replica (are they up?). Cannot resolve split-brain."
3604df
 int
3604df
 afr_selfheal (xlator_t *this, uuid_t gfid);
3604df
 
3604df
@@ -220,6 +221,9 @@ afr_mark_active_sinks (xlator_t *this, unsigned char *sources,
3604df
 gf_boolean_t
3604df
 afr_dict_contains_heal_op (call_frame_t *frame);
3604df
 
3604df
+gf_boolean_t
3604df
+afr_can_decide_split_brain_source_sinks (struct afr_reply *replies,
3604df
+                                         int child_count);
3604df
 int
3604df
 afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
3604df
                                    inode_t *inode,
3604df
-- 
3604df
2.9.3
3604df