|
|
3604df |
From 6a4c9ea70799464cdbe2eb0197698c9a5ebb7c5d Mon Sep 17 00:00:00 2001
|
|
|
3604df |
From: Ravishankar N <ravishankar@redhat.com>
|
|
|
3604df |
Date: Mon, 30 Jan 2017 09:54:16 +0530
|
|
|
3604df |
Subject: [PATCH 286/294] afr: all children of AFR must be up to resolve
|
|
|
3604df |
s-brain
|
|
|
3604df |
|
|
|
3604df |
Backport of: https://review.gluster.org/16476
|
|
|
3604df |
|
|
|
3604df |
Problem:
|
|
|
3604df |
The various split-brain resolution policies (favorite-child-policy based,
|
|
|
3604df |
CLI based and mount (get/setfattr) based) attempt to resolve split-brain
|
|
|
3604df |
even when not all bricks of replica are up. This can be a problem when
|
|
|
3604df |
say in a replica 3, the only good copy is down and the other 2 bricks
|
|
|
3604df |
are up and blame each other (i.e. split-brain). We end up healing the
|
|
|
3604df |
file in such a case and allow I/O on it.
|
|
|
3604df |
|
|
|
3604df |
Fix:
|
|
|
3604df |
A decision on whether the file is in split-brain or not must be taken
|
|
|
3604df |
only if we are able to examine the afr xattrs of *all* bricks of a given
|
|
|
3604df |
replica.
|
|
|
3604df |
|
|
|
3604df |
Change-Id: Icddb1268b380005799990f5379ef957d84639ef9
|
|
|
3604df |
BUG: 1417177
|
|
|
3604df |
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
|
|
|
3604df |
Reviewed-on: https://code.engineering.redhat.com/gerrit/97384
|
|
|
3604df |
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
|
|
|
3604df |
---
|
|
|
3604df |
.../bug-1417522-block-split-brain-resolution.t | 66 ++++++++++++++++++++++
|
|
|
3604df |
xlators/cluster/afr/src/afr-common.c | 32 +++++++----
|
|
|
3604df |
xlators/cluster/afr/src/afr-self-heal-common.c | 38 +++++++++++--
|
|
|
3604df |
xlators/cluster/afr/src/afr-self-heal.h | 6 +-
|
|
|
3604df |
4 files changed, 127 insertions(+), 15 deletions(-)
|
|
|
3604df |
create mode 100644 tests/bugs/replicate/bug-1417522-block-split-brain-resolution.t
|
|
|
3604df |
|
|
|
3604df |
diff --git a/tests/bugs/replicate/bug-1417522-block-split-brain-resolution.t b/tests/bugs/replicate/bug-1417522-block-split-brain-resolution.t
|
|
|
3604df |
new file mode 100644
|
|
|
3604df |
index 0000000..4592ebf
|
|
|
3604df |
--- /dev/null
|
|
|
3604df |
+++ b/tests/bugs/replicate/bug-1417522-block-split-brain-resolution.t
|
|
|
3604df |
@@ -0,0 +1,66 @@
|
|
|
3604df |
+#!/bin/bash
|
|
|
3604df |
+. $(dirname $0)/../../include.rc
|
|
|
3604df |
+. $(dirname $0)/../../volume.rc
|
|
|
3604df |
+cleanup;
|
|
|
3604df |
+
|
|
|
3604df |
+TEST glusterd
|
|
|
3604df |
+TEST pidof glusterd
|
|
|
3604df |
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0..2}
|
|
|
3604df |
+TEST $CLI volume set $V0 self-heal-daemon off
|
|
|
3604df |
+TEST $CLI volume set $V0 data-self-heal off
|
|
|
3604df |
+TEST $CLI volume set $V0 entry-self-heal off
|
|
|
3604df |
+TEST $CLI volume set $V0 metadata-self-heal off
|
|
|
3604df |
+TEST $CLI volume start $V0
|
|
|
3604df |
+
|
|
|
3604df |
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0;
|
|
|
3604df |
+TEST touch $M0/file
|
|
|
3604df |
+
|
|
|
3604df |
+TEST kill_brick $V0 $H0 $B0/${V0}1
|
|
|
3604df |
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=10
|
|
|
3604df |
+TEST $CLI volume start $V0 force
|
|
|
3604df |
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
|
|
|
3604df |
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
|
|
|
3604df |
+TEST kill_brick $V0 $H0 $B0/${V0}2
|
|
|
3604df |
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=20
|
|
|
3604df |
+TEST $CLI volume start $V0 force
|
|
|
3604df |
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2
|
|
|
3604df |
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2
|
|
|
3604df |
+TEST kill_brick $V0 $H0 $B0/${V0}0
|
|
|
3604df |
+TEST ! dd if=$M0/file of=/dev/null
|
|
|
3604df |
+SOURCE_BRICK_MD5=$(md5sum $B0/${V0}0/file | cut -d\ -f1)
|
|
|
3604df |
+
|
|
|
3604df |
+# Various fav-child policies must not heal the file when some bricks are down.
|
|
|
3604df |
+TEST $CLI volume set $V0 favorite-child-policy size
|
|
|
3604df |
+TEST ! dd if=$M0/file of=/dev/null
|
|
|
3604df |
+TEST $CLI volume set $V0 favorite-child-policy ctime
|
|
|
3604df |
+TEST ! dd if=$M0/file of=/dev/null
|
|
|
3604df |
+TEST $CLI volume set $V0 favorite-child-policy mtime
|
|
|
3604df |
+TEST ! dd if=$M0/file of=/dev/null
|
|
|
3604df |
+TEST $CLI volume set $V0 favorite-child-policy majority
|
|
|
3604df |
+TEST ! dd if=$M0/file of=/dev/null
|
|
|
3604df |
+
|
|
|
3604df |
+# CLI/mount based split-brain resolution must also not work.
|
|
|
3604df |
+TEST ! $CLI volume heal $V0 split-brain bigger-file /file
|
|
|
3604df |
+TEST ! $CLI volume heal $V0 split-brain mtime /file
|
|
|
3604df |
+TEST ! $CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}2 /file
|
|
|
3604df |
+
|
|
|
3604df |
+TEST ! getfattr -n replica.split-brain-status $M0/file
|
|
|
3604df |
+TEST ! setfattr -n replica.split-brain-choice -v $V0-client-1 $M0/file
|
|
|
3604df |
+
|
|
|
3604df |
+# Bring all bricks back up and launch heal.
|
|
|
3604df |
+TEST $CLI volume set $V0 self-heal-daemon on
|
|
|
3604df |
+TEST $CLI volume start $V0 force
|
|
|
3604df |
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
|
|
|
3604df |
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
|
|
|
3604df |
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
|
|
|
3604df |
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
|
|
|
3604df |
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
|
|
|
3604df |
+TEST $CLI volume heal $V0
|
|
|
3604df |
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
|
|
|
3604df |
+B1_MD5=$(md5sum $B0/${V0}1/file | cut -d\ -f1)
|
|
|
3604df |
+B2_MD5=$(md5sum $B0/${V0}2/file | cut -d\ -f1)
|
|
|
3604df |
+TEST [ "$SOURCE_BRICK_MD5" == "$B1_MD5" ]
|
|
|
3604df |
+TEST [ "$SOURCE_BRICK_MD5" == "$B2_MD5" ]
|
|
|
3604df |
+
|
|
|
3604df |
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
|
|
|
3604df |
+cleanup;
|
|
|
3604df |
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
|
|
|
3604df |
index 696e909..03ae7f9 100644
|
|
|
3604df |
--- a/xlators/cluster/afr/src/afr-common.c
|
|
|
3604df |
+++ b/xlators/cluster/afr/src/afr-common.c
|
|
|
3604df |
@@ -727,14 +727,17 @@ afr_set_split_brain_choice (int ret, call_frame_t *frame, void *opaque)
|
|
|
3604df |
gf_boolean_t timer_reset = _gf_false;
|
|
|
3604df |
int old_spb_choice = -1;
|
|
|
3604df |
|
|
|
3604df |
- if (ret)
|
|
|
3604df |
- goto out;
|
|
|
3604df |
-
|
|
|
3604df |
frame = data->frame;
|
|
|
3604df |
loc = data->loc;
|
|
|
3604df |
this = frame->this;
|
|
|
3604df |
priv = this->private;
|
|
|
3604df |
|
|
|
3604df |
+ if (ret) {
|
|
|
3604df |
+ op_errno = -ret;
|
|
|
3604df |
+ ret = -1;
|
|
|
3604df |
+ goto out;
|
|
|
3604df |
+ }
|
|
|
3604df |
+
|
|
|
3604df |
delta.tv_sec = priv->spb_choice_timeout;
|
|
|
3604df |
delta.tv_nsec = 0;
|
|
|
3604df |
|
|
|
3604df |
@@ -5551,6 +5554,12 @@ afr_is_split_brain (call_frame_t *frame, xlator_t *this, inode_t *inode,
|
|
|
3604df |
if (ret)
|
|
|
3604df |
goto out;
|
|
|
3604df |
|
|
|
3604df |
+ if (!afr_can_decide_split_brain_source_sinks (replies,
|
|
|
3604df |
+ priv->child_count)) {
|
|
|
3604df |
+ ret = -EAGAIN;
|
|
|
3604df |
+ goto out;
|
|
|
3604df |
+ }
|
|
|
3604df |
+
|
|
|
3604df |
ret = _afr_is_split_brain (frame, this, replies,
|
|
|
3604df |
AFR_DATA_TRANSACTION, d_spb);
|
|
|
3604df |
if (ret)
|
|
|
3604df |
@@ -5603,6 +5612,13 @@ afr_get_split_brain_status (void *opaque)
|
|
|
3604df |
if (!inode)
|
|
|
3604df |
goto out;
|
|
|
3604df |
|
|
|
3604df |
+ dict = dict_new ();
|
|
|
3604df |
+ if (!dict) {
|
|
|
3604df |
+ op_errno = ENOMEM;
|
|
|
3604df |
+ ret = -1;
|
|
|
3604df |
+ goto out;
|
|
|
3604df |
+ }
|
|
|
3604df |
+
|
|
|
3604df |
/* Calculation for string length :
|
|
|
3604df |
* (child_count X length of child-name) + strlen (" Choices :")
|
|
|
3604df |
* child-name consists of :
|
|
|
3604df |
@@ -5616,13 +5632,9 @@ afr_get_split_brain_status (void *opaque)
|
|
|
3604df |
&m_spb);
|
|
|
3604df |
if (ret) {
|
|
|
3604df |
op_errno = -ret;
|
|
|
3604df |
- ret = -1;
|
|
|
3604df |
- goto out;
|
|
|
3604df |
- }
|
|
|
3604df |
-
|
|
|
3604df |
- dict = dict_new ();
|
|
|
3604df |
- if (!dict) {
|
|
|
3604df |
- op_errno = ENOMEM;
|
|
|
3604df |
+ if (ret == -EAGAIN)
|
|
|
3604df |
+ ret = dict_set_str (dict, GF_AFR_SBRAIN_STATUS,
|
|
|
3604df |
+ SBRAIN_HEAL_NO_GO_MSG);
|
|
|
3604df |
ret = -1;
|
|
|
3604df |
goto out;
|
|
|
3604df |
}
|
|
|
3604df |
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
|
|
|
3604df |
index 74696b5..f731d42 100644
|
|
|
3604df |
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
|
|
|
3604df |
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
|
|
|
3604df |
@@ -473,6 +473,19 @@ afr_dict_contains_heal_op (call_frame_t *frame)
|
|
|
3604df |
return _gf_true;
|
|
|
3604df |
}
|
|
|
3604df |
|
|
|
3604df |
+gf_boolean_t
|
|
|
3604df |
+afr_can_decide_split_brain_source_sinks (struct afr_reply *replies,
|
|
|
3604df |
+ int child_count)
|
|
|
3604df |
+{
|
|
|
3604df |
+ int i = 0;
|
|
|
3604df |
+
|
|
|
3604df |
+ for (i = 0; i < child_count; i++)
|
|
|
3604df |
+ if (replies[i].valid != 1 || replies[i].op_ret != 0)
|
|
|
3604df |
+ return _gf_false;
|
|
|
3604df |
+
|
|
|
3604df |
+ return _gf_true;
|
|
|
3604df |
+}
|
|
|
3604df |
+
|
|
|
3604df |
int
|
|
|
3604df |
afr_mark_split_brain_source_sinks_by_heal_op (call_frame_t *frame,
|
|
|
3604df |
xlator_t *this, unsigned char *sources,
|
|
|
3604df |
@@ -511,6 +524,14 @@ afr_mark_split_brain_source_sinks_by_heal_op (call_frame_t *frame,
|
|
|
3604df |
}
|
|
|
3604df |
xdata_rsp = local->xdata_rsp;
|
|
|
3604df |
|
|
|
3604df |
+ if (!afr_can_decide_split_brain_source_sinks (replies,
|
|
|
3604df |
+ priv->child_count)) {
|
|
|
3604df |
+ ret = dict_set_str (xdata_rsp, "sh-fail-msg",
|
|
|
3604df |
+ SBRAIN_HEAL_NO_GO_MSG);
|
|
|
3604df |
+ ret = -1;
|
|
|
3604df |
+ goto out;
|
|
|
3604df |
+ }
|
|
|
3604df |
+
|
|
|
3604df |
for (i = 0 ; i < priv->child_count; i++)
|
|
|
3604df |
if (locked_on[i])
|
|
|
3604df |
sources[i] = 1;
|
|
|
3604df |
@@ -749,26 +770,35 @@ afr_sh_get_fav_by_policy (xlator_t *this, struct afr_reply *replies,
|
|
|
3604df |
int fav_child = -1;
|
|
|
3604df |
|
|
|
3604df |
priv = this->private;
|
|
|
3604df |
+ if (!afr_can_decide_split_brain_source_sinks (replies,
|
|
|
3604df |
+ priv->child_count)) {
|
|
|
3604df |
+ return -1;
|
|
|
3604df |
+ }
|
|
|
3604df |
+
|
|
|
3604df |
switch (priv->fav_child_policy) {
|
|
|
3604df |
case AFR_FAV_CHILD_BY_SIZE:
|
|
|
3604df |
fav_child = afr_sh_fav_by_size (this, replies, inode);
|
|
|
3604df |
- if (policy_str && fav_child >= 0)
|
|
|
3604df |
+ if (policy_str && fav_child >= 0) {
|
|
|
3604df |
*policy_str = "SIZE";
|
|
|
3604df |
+ }
|
|
|
3604df |
break;
|
|
|
3604df |
case AFR_FAV_CHILD_BY_CTIME:
|
|
|
3604df |
fav_child = afr_sh_fav_by_ctime (this, replies, inode);
|
|
|
3604df |
- if (policy_str && fav_child >= 0)
|
|
|
3604df |
+ if (policy_str && fav_child >= 0) {
|
|
|
3604df |
*policy_str = "CTIME";
|
|
|
3604df |
+ }
|
|
|
3604df |
break;
|
|
|
3604df |
case AFR_FAV_CHILD_BY_MTIME:
|
|
|
3604df |
fav_child = afr_sh_fav_by_mtime (this, replies, inode);
|
|
|
3604df |
- if (policy_str && fav_child >= 0)
|
|
|
3604df |
+ if (policy_str && fav_child >= 0) {
|
|
|
3604df |
*policy_str = "MTIME";
|
|
|
3604df |
+ }
|
|
|
3604df |
break;
|
|
|
3604df |
case AFR_FAV_CHILD_BY_MAJORITY:
|
|
|
3604df |
fav_child = afr_sh_fav_by_majority (this, replies, inode);
|
|
|
3604df |
- if (policy_str && fav_child >= 0)
|
|
|
3604df |
+ if (policy_str && fav_child >= 0) {
|
|
|
3604df |
*policy_str = "MAJORITY";
|
|
|
3604df |
+ }
|
|
|
3604df |
break;
|
|
|
3604df |
case AFR_FAV_CHILD_NONE:
|
|
|
3604df |
default:
|
|
|
3604df |
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
|
|
|
3604df |
index 500227a..a339050 100644
|
|
|
3604df |
--- a/xlators/cluster/afr/src/afr-self-heal.h
|
|
|
3604df |
+++ b/xlators/cluster/afr/src/afr-self-heal.h
|
|
|
3604df |
@@ -81,7 +81,8 @@
|
|
|
3604df |
|
|
|
3604df |
#define IA_EQUAL(f,s,field) (memcmp (&(f.ia_##field), &(s.ia_##field), sizeof (s.ia_##field)) == 0)
|
|
|
3604df |
|
|
|
3604df |
-
|
|
|
3604df |
+#define SBRAIN_HEAL_NO_GO_MSG "Failed to obtain replies from all bricks of "\
|
|
|
3604df |
+ "the replica (are they up?). Cannot resolve split-brain."
|
|
|
3604df |
int
|
|
|
3604df |
afr_selfheal (xlator_t *this, uuid_t gfid);
|
|
|
3604df |
|
|
|
3604df |
@@ -220,6 +221,9 @@ afr_mark_active_sinks (xlator_t *this, unsigned char *sources,
|
|
|
3604df |
gf_boolean_t
|
|
|
3604df |
afr_dict_contains_heal_op (call_frame_t *frame);
|
|
|
3604df |
|
|
|
3604df |
+gf_boolean_t
|
|
|
3604df |
+afr_can_decide_split_brain_source_sinks (struct afr_reply *replies,
|
|
|
3604df |
+ int child_count);
|
|
|
3604df |
int
|
|
|
3604df |
afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
|
|
|
3604df |
inode_t *inode,
|
|
|
3604df |
--
|
|
|
3604df |
2.9.3
|
|
|
3604df |
|