Blob Blame History Raw
From 6a4c9ea70799464cdbe2eb0197698c9a5ebb7c5d Mon Sep 17 00:00:00 2001
From: Ravishankar N <ravishankar@redhat.com>
Date: Mon, 30 Jan 2017 09:54:16 +0530
Subject: [PATCH 286/294] afr: all children of AFR must be up to resolve
 s-brain

Backport of: https://review.gluster.org/16476

Problem:
The various split-brain resolution policies (favorite-child-policy based,
CLI based and mount (get/setfattr) based) attempt to resolve split-brain
even when not all bricks of replica are up. This can be a problem when
say in a replica 3, the only good copy is down and the other 2 bricks
are up and blame each other (i.e. split-brain). We end up healing the
file in such a  case and allow I/O on it.

Fix:
A decision on whether the file is in split-brain or not must be taken
only if we are able to examine the afr xattrs of *all* bricks of a given
replica.

Change-Id: Icddb1268b380005799990f5379ef957d84639ef9
BUG: 1417177
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/97384
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
---
 .../bug-1417522-block-split-brain-resolution.t     | 66 ++++++++++++++++++++++
 xlators/cluster/afr/src/afr-common.c               | 32 +++++++----
 xlators/cluster/afr/src/afr-self-heal-common.c     | 38 +++++++++++--
 xlators/cluster/afr/src/afr-self-heal.h            |  6 +-
 4 files changed, 127 insertions(+), 15 deletions(-)
 create mode 100644 tests/bugs/replicate/bug-1417522-block-split-brain-resolution.t

diff --git a/tests/bugs/replicate/bug-1417522-block-split-brain-resolution.t b/tests/bugs/replicate/bug-1417522-block-split-brain-resolution.t
new file mode 100644
index 0000000..4592ebf
--- /dev/null
+++ b/tests/bugs/replicate/bug-1417522-block-split-brain-resolution.t
@@ -0,0 +1,66 @@
+#!/bin/bash
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0..2}
+TEST $CLI volume set $V0 self-heal-daemon off
+TEST $CLI volume set $V0 data-self-heal off
+TEST $CLI volume set $V0 entry-self-heal off
+TEST $CLI volume set $V0 metadata-self-heal off
+TEST $CLI volume start $V0
+
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0;
+TEST touch $M0/file
+
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=10
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=20
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST ! dd if=$M0/file of=/dev/null
+SOURCE_BRICK_MD5=$(md5sum $B0/${V0}0/file | cut -d\  -f1)
+
+# Various fav-child policies must not heal the file when some bricks are down.
+TEST $CLI volume set $V0 favorite-child-policy size
+TEST ! dd if=$M0/file of=/dev/null
+TEST $CLI volume set $V0 favorite-child-policy ctime
+TEST ! dd if=$M0/file of=/dev/null
+TEST $CLI volume set $V0 favorite-child-policy mtime
+TEST ! dd if=$M0/file of=/dev/null
+TEST $CLI volume set $V0 favorite-child-policy majority
+TEST ! dd if=$M0/file of=/dev/null
+
+# CLI/mount based split-brain resolution must also not work.
+TEST ! $CLI volume heal $V0 split-brain bigger-file /file
+TEST ! $CLI volume heal $V0 split-brain mtime /file
+TEST ! $CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}2 /file1
+
+TEST ! getfattr -n replica.split-brain-status $M0/file
+TEST ! setfattr -n replica.split-brain-choice -v $V0-client-1 $M0/file
+
+# Bring all bricks back up and launch heal.
+TEST $CLI volume set $V0 self-heal-daemon on
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
+TEST $CLI volume heal $V0
+EXPECT 0 get_pending_heal_count $V0
+B1_MD5=$(md5sum $B0/${V0}1/file | cut -d\  -f1)
+B2_MD5=$(md5sum $B0/${V0}2/file | cut -d\  -f1)
+TEST [ "$SOURCE_BRICK_MD5" == "$B1_MD5" ]
+TEST [ "$SOURCE_BRICK_MD5" == "$B2_MD5" ]
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+cleanup;
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 696e909..03ae7f9 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -727,14 +727,17 @@ afr_set_split_brain_choice (int ret, call_frame_t *frame, void *opaque)
         gf_boolean_t        timer_reset      = _gf_false;
         int                 old_spb_choice   = -1;
 
-        if (ret)
-                goto out;
-
         frame = data->frame;
         loc = data->loc;
         this = frame->this;
         priv = this->private;
 
+        if (ret) {
+                op_errno = -ret;
+                ret = -1;
+                goto out;
+        }
+
         delta.tv_sec = priv->spb_choice_timeout;
         delta.tv_nsec = 0;
 
@@ -5551,6 +5554,12 @@ afr_is_split_brain (call_frame_t *frame, xlator_t *this, inode_t *inode,
         if (ret)
                 goto out;
 
+        if (!afr_can_decide_split_brain_source_sinks (replies,
+                                                      priv->child_count)) {
+                ret = -EAGAIN;
+                goto out;
+        }
+
         ret = _afr_is_split_brain (frame, this, replies,
                                     AFR_DATA_TRANSACTION, d_spb);
         if (ret)
@@ -5603,6 +5612,13 @@ afr_get_split_brain_status (void *opaque)
         if (!inode)
                 goto out;
 
+        dict = dict_new ();
+        if (!dict) {
+                op_errno = ENOMEM;
+                ret = -1;
+                goto out;
+        }
+
         /* Calculation for string length :
         * (child_count X length of child-name) + strlen ("    Choices :")
         * child-name consists of :
@@ -5616,13 +5632,9 @@ afr_get_split_brain_status (void *opaque)
                                   &m_spb);
         if (ret) {
                 op_errno = -ret;
-                ret = -1;
-                goto out;
-        }
-
-        dict = dict_new ();
-        if (!dict) {
-                op_errno = ENOMEM;
+                if (ret == -EAGAIN)
+                        ret = dict_set_str (dict, GF_AFR_SBRAIN_STATUS,
+                                            SBRAIN_HEAL_NO_GO_MSG);
                 ret = -1;
                 goto out;
         }
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 74696b5..f731d42 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -473,6 +473,19 @@ afr_dict_contains_heal_op (call_frame_t *frame)
         return _gf_true;
 }
 
+gf_boolean_t
+afr_can_decide_split_brain_source_sinks (struct afr_reply *replies,
+                                         int child_count)
+{
+        int i = 0;
+
+        for (i = 0; i < child_count; i++)
+                if (replies[i].valid != 1 || replies[i].op_ret != 0)
+                        return _gf_false;
+
+        return _gf_true;
+}
+
 int
 afr_mark_split_brain_source_sinks_by_heal_op (call_frame_t *frame,
                                    xlator_t *this, unsigned char *sources,
@@ -511,6 +524,14 @@ afr_mark_split_brain_source_sinks_by_heal_op (call_frame_t *frame,
         }
         xdata_rsp = local->xdata_rsp;
 
+        if (!afr_can_decide_split_brain_source_sinks (replies,
+                                                      priv->child_count)) {
+                ret = dict_set_str (xdata_rsp, "sh-fail-msg",
+                                    SBRAIN_HEAL_NO_GO_MSG);
+                ret = -1;
+                goto out;
+        }
+
         for (i = 0 ; i < priv->child_count; i++)
                 if (locked_on[i])
                         sources[i] = 1;
@@ -749,26 +770,35 @@ afr_sh_get_fav_by_policy (xlator_t *this, struct afr_reply *replies,
         int fav_child = -1;
 
         priv = this->private;
+        if (!afr_can_decide_split_brain_source_sinks (replies,
+                                                      priv->child_count)) {
+                return -1;
+        }
+
         switch (priv->fav_child_policy) {
         case AFR_FAV_CHILD_BY_SIZE:
                 fav_child = afr_sh_fav_by_size (this, replies, inode);
-                if (policy_str && fav_child >= 0)
+                if (policy_str && fav_child >= 0) {
                         *policy_str = "SIZE";
+                }
                 break;
         case AFR_FAV_CHILD_BY_CTIME:
                 fav_child = afr_sh_fav_by_ctime (this, replies, inode);
-                if (policy_str && fav_child >= 0)
+                if (policy_str && fav_child >= 0) {
                         *policy_str = "CTIME";
+                }
                 break;
         case AFR_FAV_CHILD_BY_MTIME:
                 fav_child = afr_sh_fav_by_mtime (this, replies, inode);
-                if (policy_str && fav_child >= 0)
+                if (policy_str && fav_child >= 0) {
                         *policy_str = "MTIME";
+                }
                 break;
         case AFR_FAV_CHILD_BY_MAJORITY:
                 fav_child = afr_sh_fav_by_majority (this, replies, inode);
-                if (policy_str && fav_child >= 0)
+                if (policy_str && fav_child >= 0) {
                         *policy_str = "MAJORITY";
+                }
                 break;
         case AFR_FAV_CHILD_NONE:
         default:
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 500227a..a339050 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -81,7 +81,8 @@
 
 #define IA_EQUAL(f,s,field) (memcmp (&(f.ia_##field), &(s.ia_##field), sizeof (s.ia_##field)) == 0)
 
-
+#define SBRAIN_HEAL_NO_GO_MSG "Failed to obtain replies from all bricks of "\
+                      "the replica (are they up?). Cannot resolve split-brain."
 int
 afr_selfheal (xlator_t *this, uuid_t gfid);
 
@@ -220,6 +221,9 @@ afr_mark_active_sinks (xlator_t *this, unsigned char *sources,
 gf_boolean_t
 afr_dict_contains_heal_op (call_frame_t *frame);
 
+gf_boolean_t
+afr_can_decide_split_brain_source_sinks (struct afr_reply *replies,
+                                         int child_count);
 int
 afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
                                    inode_t *inode,
-- 
2.9.3