From 6a4c9ea70799464cdbe2eb0197698c9a5ebb7c5d Mon Sep 17 00:00:00 2001 From: Ravishankar N Date: Mon, 30 Jan 2017 09:54:16 +0530 Subject: [PATCH 286/294] afr: all children of AFR must be up to resolve s-brain Backport of: https://review.gluster.org/16476 Problem: The various split-brain resolution policies (favorite-child-policy based, CLI based and mount (get/setfattr) based) attempt to resolve split-brain even when not all bricks of replica are up. This can be a problem when say in a replica 3, the only good copy is down and the other 2 bricks are up and blame each other (i.e. split-brain). We end up healing the file in such a case and allow I/O on it. Fix: A decision on whether the file is in split-brain or not must be taken only if we are able to examine the afr xattrs of *all* bricks of a given replica. Change-Id: Icddb1268b380005799990f5379ef957d84639ef9 BUG: 1417177 Signed-off-by: Ravishankar N Reviewed-on: https://code.engineering.redhat.com/gerrit/97384 Reviewed-by: Atin Mukherjee --- .../bug-1417522-block-split-brain-resolution.t | 66 ++++++++++++++++++++++ xlators/cluster/afr/src/afr-common.c | 32 +++++++---- xlators/cluster/afr/src/afr-self-heal-common.c | 38 +++++++++++-- xlators/cluster/afr/src/afr-self-heal.h | 6 +- 4 files changed, 127 insertions(+), 15 deletions(-) create mode 100644 tests/bugs/replicate/bug-1417522-block-split-brain-resolution.t diff --git a/tests/bugs/replicate/bug-1417522-block-split-brain-resolution.t b/tests/bugs/replicate/bug-1417522-block-split-brain-resolution.t new file mode 100644 index 0000000..4592ebf --- /dev/null +++ b/tests/bugs/replicate/bug-1417522-block-split-brain-resolution.t @@ -0,0 +1,66 @@ +#!/bin/bash +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0..2} +TEST $CLI volume set $V0 self-heal-daemon off +TEST $CLI volume set $V0 data-self-heal off +TEST $CLI volume set $V0 entry-self-heal off +TEST $CLI volume set $V0 metadata-self-heal off +TEST $CLI volume start $V0 + +TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0; +TEST touch $M0/file + +TEST kill_brick $V0 $H0 $B0/${V0}1 +TEST dd if=/dev/urandom of=$M0/file bs=1024 count=10 +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 +TEST kill_brick $V0 $H0 $B0/${V0}2 +TEST dd if=/dev/urandom of=$M0/file bs=1024 count=20 +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2 +TEST kill_brick $V0 $H0 $B0/${V0}0 +TEST ! dd if=$M0/file of=/dev/null +SOURCE_BRICK_MD5=$(md5sum $B0/${V0}0/file | cut -d\ -f1) + +# Various fav-child policies must not heal the file when some bricks are down. +TEST $CLI volume set $V0 favorite-child-policy size +TEST ! dd if=$M0/file of=/dev/null +TEST $CLI volume set $V0 favorite-child-policy ctime +TEST ! dd if=$M0/file of=/dev/null +TEST $CLI volume set $V0 favorite-child-policy mtime +TEST ! dd if=$M0/file of=/dev/null +TEST $CLI volume set $V0 favorite-child-policy majority +TEST ! dd if=$M0/file of=/dev/null + +# CLI/mount based split-brain resolution must also not work. +TEST ! $CLI volume heal $V0 split-brain bigger-file /file +TEST ! $CLI volume heal $V0 split-brain mtime /file +TEST ! $CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}2 /file1 + +TEST ! getfattr -n replica.split-brain-status $M0/file +TEST ! setfattr -n replica.split-brain-choice -v $V0-client-1 $M0/file + +# Bring all bricks back up and launch heal. +TEST $CLI volume set $V0 self-heal-daemon on +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2 +TEST $CLI volume heal $V0 +EXPECT 0 get_pending_heal_count $V0 +B1_MD5=$(md5sum $B0/${V0}1/file | cut -d\ -f1) +B2_MD5=$(md5sum $B0/${V0}2/file | cut -d\ -f1) +TEST [ "$SOURCE_BRICK_MD5" == "$B1_MD5" ] +TEST [ "$SOURCE_BRICK_MD5" == "$B2_MD5" ] + +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 +cleanup; diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 696e909..03ae7f9 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -727,14 +727,17 @@ afr_set_split_brain_choice (int ret, call_frame_t *frame, void *opaque) gf_boolean_t timer_reset = _gf_false; int old_spb_choice = -1; - if (ret) - goto out; - frame = data->frame; loc = data->loc; this = frame->this; priv = this->private; + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + delta.tv_sec = priv->spb_choice_timeout; delta.tv_nsec = 0; @@ -5551,6 +5554,12 @@ afr_is_split_brain (call_frame_t *frame, xlator_t *this, inode_t *inode, if (ret) goto out; + if (!afr_can_decide_split_brain_source_sinks (replies, + priv->child_count)) { + ret = -EAGAIN; + goto out; + } + ret = _afr_is_split_brain (frame, this, replies, AFR_DATA_TRANSACTION, d_spb); if (ret) @@ -5603,6 +5612,13 @@ afr_get_split_brain_status (void *opaque) if (!inode) goto out; + dict = dict_new (); + if (!dict) { + op_errno = ENOMEM; + ret = -1; + goto out; + } + /* Calculation for string length : * (child_count X length of child-name) + strlen (" Choices :") * child-name consists of : @@ -5616,13 +5632,9 @@ afr_get_split_brain_status (void *opaque) &m_spb); if (ret) { op_errno = -ret; - ret = -1; - goto out; - } - - dict = dict_new (); - if (!dict) { - op_errno = ENOMEM; + if (ret == -EAGAIN) + ret = dict_set_str (dict, GF_AFR_SBRAIN_STATUS, + SBRAIN_HEAL_NO_GO_MSG); ret = -1; goto out; } diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 74696b5..f731d42 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -473,6 +473,19 @@ afr_dict_contains_heal_op (call_frame_t *frame) return _gf_true; } +gf_boolean_t +afr_can_decide_split_brain_source_sinks (struct afr_reply *replies, + int child_count) +{ + int i = 0; + + for (i = 0; i < child_count; i++) + if (replies[i].valid != 1 || replies[i].op_ret != 0) + return _gf_false; + + return _gf_true; +} + int afr_mark_split_brain_source_sinks_by_heal_op (call_frame_t *frame, xlator_t *this, unsigned char *sources, @@ -511,6 +524,14 @@ afr_mark_split_brain_source_sinks_by_heal_op (call_frame_t *frame, } xdata_rsp = local->xdata_rsp; + if (!afr_can_decide_split_brain_source_sinks (replies, + priv->child_count)) { + ret = dict_set_str (xdata_rsp, "sh-fail-msg", + SBRAIN_HEAL_NO_GO_MSG); + ret = -1; + goto out; + } + for (i = 0 ; i < priv->child_count; i++) if (locked_on[i]) sources[i] = 1; @@ -749,26 +770,35 @@ afr_sh_get_fav_by_policy (xlator_t *this, struct afr_reply *replies, int fav_child = -1; priv = this->private; + if (!afr_can_decide_split_brain_source_sinks (replies, + priv->child_count)) { + return -1; + } + switch (priv->fav_child_policy) { case AFR_FAV_CHILD_BY_SIZE: fav_child = afr_sh_fav_by_size (this, replies, inode); - if (policy_str && fav_child >= 0) + if (policy_str && fav_child >= 0) { *policy_str = "SIZE"; + } break; case AFR_FAV_CHILD_BY_CTIME: fav_child = afr_sh_fav_by_ctime (this, replies, inode); - if (policy_str && fav_child >= 0) + if (policy_str && fav_child >= 0) { *policy_str = "CTIME"; + } break; case AFR_FAV_CHILD_BY_MTIME: fav_child = afr_sh_fav_by_mtime (this, replies, inode); - if (policy_str && fav_child >= 0) + if (policy_str && fav_child >= 0) { *policy_str = "MTIME"; + } break; case AFR_FAV_CHILD_BY_MAJORITY: fav_child = afr_sh_fav_by_majority (this, replies, inode); - if (policy_str && fav_child >= 0) + if (policy_str && fav_child >= 0) { *policy_str = "MAJORITY"; + } break; case AFR_FAV_CHILD_NONE: default: diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index 500227a..a339050 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -81,7 +81,8 @@ #define IA_EQUAL(f,s,field) (memcmp (&(f.ia_##field), &(s.ia_##field), sizeof (s.ia_##field)) == 0) - +#define SBRAIN_HEAL_NO_GO_MSG "Failed to obtain replies from all bricks of "\ + "the replica (are they up?). Cannot resolve split-brain." int afr_selfheal (xlator_t *this, uuid_t gfid); @@ -220,6 +221,9 @@ afr_mark_active_sinks (xlator_t *this, unsigned char *sources, gf_boolean_t afr_dict_contains_heal_op (call_frame_t *frame); +gf_boolean_t +afr_can_decide_split_brain_source_sinks (struct afr_reply *replies, + int child_count); int afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this, inode_t *inode, -- 2.9.3