From 2d168099b85ef5f5d8b6f6a1ae55902b1360b441 Mon Sep 17 00:00:00 2001 From: Anuradha Talur Date: Mon, 3 Aug 2015 17:09:13 +0530 Subject: [PATCH 296/304] cluster/afr : Examine data/metadata readable for read-subvol During lookup and discover, currently read_subvol is based only on data_readable. read_subvol should be decided based on both data_readable and metadata_readable. Credits to Ravishankar N for the logic of afr_first_up_child from http://review.gluster.org/10905/ . > Change-Id: I98580b23c278172ee2902be08eeaafb6722e830c > BUG: 1240244 > Signed-off-by: Anuradha Talur > Reviewed-on: http://review.gluster.org/11551 > Reviewed-by: Ravishankar N > Tested-by: Gluster Build System > Reviewed-by: Krutika Dhananjay > Reviewed-by: Pranith Kumar Karampuri BUG: 1238398 Signed-off-by: Anuradha Talur Change-Id: Ib4f1ee57dc6103dd608fc983d7cf7efab9dadf59 Reviewed-on: https://code.engineering.redhat.com/gerrit/56259 Reviewed-by: Pranith Kumar Karampuri Tested-by: Pranith Kumar Karampuri --- .../replicate/bug-1238398-split-brain-resolution.t | 48 ++++++++++++ xlators/cluster/afr/src/afr-common.c | 77 +++++++++++++++----- xlators/cluster/afr/src/afr-read-txn.c | 16 ++++- 3 files changed, 118 insertions(+), 23 deletions(-) create mode 100644 tests/bugs/replicate/bug-1238398-split-brain-resolution.t diff --git a/tests/bugs/replicate/bug-1238398-split-brain-resolution.t b/tests/bugs/replicate/bug-1238398-split-brain-resolution.t new file mode 100644 index 0000000..7ba09f0 --- /dev/null +++ b/tests/bugs/replicate/bug-1238398-split-brain-resolution.t @@ -0,0 +1,48 @@ +#!/bin/bash +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc +cleanup; + +function get_split_brain_status { + local path=$1 + echo `getfattr -n replica.split-brain-status $path` | cut -f2 -d"=" | sed -e 's/^"//' -e 's/"$//' +} + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1} +TEST $CLI volume start $V0 + +#Disable self-heal-daemon +TEST $CLI volume set $V0 cluster.self-heal-daemon off + +TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0; + +TEST `echo "some-data" > $M0/metadata-split-brain.txt` + +#Create metadata split-brain +TEST kill_brick $V0 $H0 $B0/${V0}0 +TEST chmod 666 $M0/metadata-split-brain.txt + +TEST $CLI volume start $V0 force +TEST kill_brick $V0 $H0 $B0/${V0}1 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 + +TEST chmod 757 $M0/metadata-split-brain.txt + +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 + +EXPECT 2 get_pending_heal_count $V0 + +#Inspect the file in metadata-split-brain +EXPECT "data-split-brain:no metadata-split-brain:yes Choices:patchy-client-0,patchy-client-1" get_split_brain_status $M0/metadata-split-brain.txt +TEST setfattr -n replica.split-brain-choice -v $V0-client-0 $M0/metadata-split-brain.txt + +EXPECT "757" stat -c %a $M0/metadata-split-brain.txt + +TEST setfattr -n replica.split-brain-choice -v $V0-client-1 $M0/metadata-split-brain.txt +EXPECT "666" stat -c %a $M0/metadata-split-brain.txt + +cleanup; diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 2401cfd..7255179 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -368,7 +368,8 @@ afr_inode_get_readable (call_frame_t *frame, inode_t *inode, xlator_t *this, if (inode->ia_type == IA_IFDIR) { /* For directories, allow even if it is in data split-brain. */ - if (type == AFR_METADATA_TRANSACTION) { + if (type == AFR_METADATA_TRANSACTION || + local->op == GF_FOP_STAT || local->op == GF_FOP_FSTAT) { if (!metadata_count) return -EIO; } @@ -1508,6 +1509,40 @@ afr_get_parent_read_subvol (xlator_t *this, inode_t *parent, } +int +afr_read_subvol_decide (inode_t *inode, xlator_t *this, + afr_read_subvol_args_t *args) +{ + int data_subvol = -1; + int mdata_subvol = -1; + + data_subvol = afr_data_subvol_get (inode, this, + 0, 0, args); + mdata_subvol = afr_metadata_subvol_get (inode, this, + 0, 0, args); + if (data_subvol == -1 || mdata_subvol == -1) + return -1; + + return data_subvol; +} + +static inline int +afr_first_up_child (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + + local = frame->local; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) + if (local->replies[i].valid && + local->replies[i].op_ret == 0) + return i; + return 0; +} + static void afr_lookup_done (call_frame_t *frame, xlator_t *this) { @@ -1623,13 +1658,13 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this) gf_uuid_copy (args.gfid, read_gfid); args.ia_type = ia_type; if (afr_replies_interpret (frame, this, local->inode)) { - read_subvol = afr_data_subvol_get (local->inode, this, - 0, 0, &args); + read_subvol = afr_read_subvol_decide (local->inode, + this, &args); afr_inode_read_subvol_reset (local->inode, this); goto cant_interpret; } else { - read_subvol = afr_data_subvol_get (local->inode, this, - 0, 0, &args); + read_subvol = afr_data_subvol_get (local->inode, this, + 0, 0, &args); } } else { cant_interpret: @@ -1637,7 +1672,7 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this) if (spb_choice >= 0) read_subvol = spb_choice; else - read_subvol = 0; + read_subvol = afr_first_up_child (frame, this); } dict_del (replies[read_subvol].xdata, GF_CONTENT_KEY); } @@ -1649,7 +1684,7 @@ unwind: if (spb_choice >= 0) read_subvol = spb_choice; else - read_subvol = 0; + read_subvol = afr_first_up_child (frame, this); } par_read_subvol = afr_get_parent_read_subvol (this, parent, replies, readable); @@ -2030,11 +2065,15 @@ afr_discover_done (call_frame_t *frame, xlator_t *this) afr_local_t *local = NULL; int i = -1; int op_errno = 0; - int read_subvol = 0; + int spb_choice = -1; + int read_subvol = -1; priv = this->private; local = frame->local; + afr_inode_split_brain_choice_get (local->inode, this, + &spb_choice); + for (i = 0; i < priv->child_count; i++) { if (!local->replies[i].valid) continue; @@ -2052,27 +2091,25 @@ afr_discover_done (call_frame_t *frame, xlator_t *this) afr_replies_interpret (frame, this, local->inode); - read_subvol = afr_data_subvol_get (local->inode, this, 0, 0, NULL); + read_subvol = afr_read_subvol_decide (local->inode, this, NULL); if (read_subvol == -1) { gf_msg (this->name, GF_LOG_WARNING, 0, AFR_MSG_READ_SUBVOL_ERROR, "no read subvols for %s", local->loc.path); - for (i = 0; i < priv->child_count; i++) { - if (!local->replies[i].valid || - local->replies[i].op_ret == -1) - continue; - read_subvol = i; - break; - } + if (spb_choice >= 0) { + read_subvol = spb_choice; + } else { + read_subvol = afr_first_up_child (frame, this); + } } unwind: if (read_subvol == -1) { - afr_inode_split_brain_choice_get (local->inode, this, - &read_subvol); - if (read_subvol == -1) - read_subvol = 0; + if (spb_choice >= 0) + read_subvol = spb_choice; + else + read_subvol = afr_first_up_child (frame, this); } AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c index 1b2faf3..a70565c 100644 --- a/xlators/cluster/afr/src/afr-read-txn.c +++ b/xlators/cluster/afr/src/afr-read-txn.c @@ -193,12 +193,16 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode, { afr_local_t *local = NULL; afr_private_t *priv = NULL; + unsigned char *data = NULL; + unsigned char *metadata = NULL; int read_subvol = -1; int event_generation = 0; int ret = -1; priv = this->private; local = frame->local; + data = alloca0 (priv->child_count); + metadata = alloca0 (priv->child_count); afr_read_txn_wipe (frame, this); @@ -213,10 +217,16 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode, goto read; } - local->transaction.type = type; - ret = afr_inode_read_subvol_type_get (inode, this, local->readable, - &event_generation, type); + if (local->op == GF_FOP_FSTAT || local->op == GF_FOP_STAT) { + ret = afr_inode_read_subvol_get (inode, this, data, metadata, + &event_generation); + AFR_INTERSECT (local->readable, data, metadata, + priv->child_count); + } else { + ret = afr_inode_read_subvol_type_get (inode, this, local->readable, + &event_generation, type); + } if (ret == -1) /* very first transaction on this inode */ goto refresh; -- 1.7.1