From 2d168099b85ef5f5d8b6f6a1ae55902b1360b441 Mon Sep 17 00:00:00 2001
From: Anuradha Talur <atalur@redhat.com>
Date: Mon, 3 Aug 2015 17:09:13 +0530
Subject: [PATCH 296/304] cluster/afr: Examine data/metadata readable for read-subvol

During lookup and discover, read_subvol is currently based only on
data_readable. It should be decided based on both data_readable and
metadata_readable.

Credits to Ravishankar N for the logic of afr_first_up_child
from http://review.gluster.org/10905/ .
> Change-Id: I98580b23c278172ee2902be08eeaafb6722e830c
> BUG: 1240244
> Signed-off-by: Anuradha Talur <atalur@redhat.com>
> Reviewed-on: http://review.gluster.org/11551
> Reviewed-by: Ravishankar N <ravishankar@redhat.com>
> Tested-by: Gluster Build System <jenkins@build.gluster.com>
> Reviewed-by: Krutika Dhananjay <kdhananj@redhat.com>
> Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
BUG: 1238398
Signed-off-by: Anuradha Talur <atalur@redhat.com>
Change-Id: Ib4f1ee57dc6103dd608fc983d7cf7efab9dadf59
Reviewed-on: https://code.engineering.redhat.com/gerrit/56259
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
Tested-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
---
.../replicate/bug-1238398-split-brain-resolution.t | 48 ++++++++++++
xlators/cluster/afr/src/afr-common.c | 77 +++++++++++++++-----
xlators/cluster/afr/src/afr-read-txn.c | 16 ++++-
3 files changed, 118 insertions(+), 23 deletions(-)
create mode 100644 tests/bugs/replicate/bug-1238398-split-brain-resolution.t
diff --git a/tests/bugs/replicate/bug-1238398-split-brain-resolution.t b/tests/bugs/replicate/bug-1238398-split-brain-resolution.t
new file mode 100644
index 0000000..7ba09f0
--- /dev/null
+++ b/tests/bugs/replicate/bug-1238398-split-brain-resolution.t
@@ -0,0 +1,48 @@
+#!/bin/bash
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+cleanup;
+
+# Print the replica.split-brain-status xattr value of file $1, outer quotes removed.
+get_split_brain_status () {
+        echo $(getfattr -n replica.split-brain-status $1) | cut -f2 -d"=" | sed -e 's/^"//' -e 's/"$//'
+}
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1}
+TEST $CLI volume start $V0
+
+# Turn the self-heal daemon off for this volume.
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0;
+# NOTE(review): the backticks run the echo first, so TEST itself gets no arguments.
+TEST `echo "some-data" > $M0/metadata-split-brain.txt`
+
+# Create metadata split-brain: chmod the file while each brick in turn is down.
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST chmod 666 $M0/metadata-split-brain.txt
+
+TEST $CLI volume start $V0 force
+TEST kill_brick $V0 $H0 $B0/${V0}1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+
+TEST chmod 757 $M0/metadata-split-brain.txt
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+
+EXPECT 2 get_pending_heal_count $V0
+
+# Inspect the file in metadata split-brain; client names are derived from $V0.
+EXPECT "data-split-brain:no metadata-split-brain:yes Choices:$V0-client-0,$V0-client-1" get_split_brain_status $M0/metadata-split-brain.txt
+TEST setfattr -n replica.split-brain-choice -v $V0-client-0 $M0/metadata-split-brain.txt
+# With client-0 chosen, expect the mode set while brick 1 was down (757).
+EXPECT "757" stat -c %a $M0/metadata-split-brain.txt
+# With client-1 chosen, expect the mode set while brick 0 was down (666).
+TEST setfattr -n replica.split-brain-choice -v $V0-client-1 $M0/metadata-split-brain.txt
+EXPECT "666" stat -c %a $M0/metadata-split-brain.txt
+
+cleanup;
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 2401cfd..7255179 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -368,7 +368,8 @@ afr_inode_get_readable (call_frame_t *frame, inode_t *inode, xlator_t *this,
if (inode->ia_type == IA_IFDIR) {
/* For directories, allow even if it is in data split-brain. */
- if (type == AFR_METADATA_TRANSACTION) {
+ if (type == AFR_METADATA_TRANSACTION ||
+ local->op == GF_FOP_STAT || local->op == GF_FOP_FSTAT) {
if (!metadata_count)
return -EIO;
}
@@ -1508,6 +1509,40 @@ afr_get_parent_read_subvol (xlator_t *this, inode_t *parent,
}
+/*
+ * Choose the read subvolume from both data and metadata readability.
+ *
+ * Both helpers are always called, data first. Returns the index reported
+ * by afr_data_subvol_get(), or -1 when either that call or
+ * afr_metadata_subvol_get() reports -1.
+ */
+int
+afr_read_subvol_decide (inode_t *inode, xlator_t *this,
+                        afr_read_subvol_args_t *args)
+{
+        int data_idx  = afr_data_subvol_get (inode, this, 0, 0, args);
+        int mdata_idx = afr_metadata_subvol_get (inode, this, 0, 0, args);
+
+        return (data_idx != -1 && mdata_idx != -1) ? data_idx : -1;
+}
+
+/* First child whose reply is valid with op_ret == 0; 0 when there is none. */
+static inline int
+afr_first_up_child (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local = frame->local;
+        afr_private_t *priv  = this->private;
+        int            idx   = 0;
+
+        while (idx < priv->child_count) {
+                if (local->replies[idx].valid &&
+                    local->replies[idx].op_ret == 0)
+                        return idx;
+                idx++;
+        }
+        return 0;
+}
+
static void
afr_lookup_done (call_frame_t *frame, xlator_t *this)
{
@@ -1623,13 +1658,13 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
gf_uuid_copy (args.gfid, read_gfid);
args.ia_type = ia_type;
if (afr_replies_interpret (frame, this, local->inode)) {
- read_subvol = afr_data_subvol_get (local->inode, this,
- 0, 0, &args);
+ read_subvol = afr_read_subvol_decide (local->inode,
+ this, &args);
afr_inode_read_subvol_reset (local->inode, this);
goto cant_interpret;
} else {
- read_subvol = afr_data_subvol_get (local->inode, this,
- 0, 0, &args);
+ read_subvol = afr_data_subvol_get (local->inode, this,
+ 0, 0, &args);
}
} else {
cant_interpret:
@@ -1637,7 +1672,7 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
if (spb_choice >= 0)
read_subvol = spb_choice;
else
- read_subvol = 0;
+ read_subvol = afr_first_up_child (frame, this);
}
dict_del (replies[read_subvol].xdata, GF_CONTENT_KEY);
}
@@ -1649,7 +1684,7 @@ unwind:
if (spb_choice >= 0)
read_subvol = spb_choice;
else
- read_subvol = 0;
+ read_subvol = afr_first_up_child (frame, this);
}
par_read_subvol = afr_get_parent_read_subvol (this, parent, replies,
readable);
@@ -2030,11 +2065,15 @@ afr_discover_done (call_frame_t *frame, xlator_t *this)
afr_local_t *local = NULL;
int i = -1;
int op_errno = 0;
- int read_subvol = 0;
+ int spb_choice = -1;
+ int read_subvol = -1;
priv = this->private;
local = frame->local;
+ afr_inode_split_brain_choice_get (local->inode, this,
+ &spb_choice);
+
for (i = 0; i < priv->child_count; i++) {
if (!local->replies[i].valid)
continue;
@@ -2052,27 +2091,25 @@ afr_discover_done (call_frame_t *frame, xlator_t *this)
afr_replies_interpret (frame, this, local->inode);
- read_subvol = afr_data_subvol_get (local->inode, this, 0, 0, NULL);
+ read_subvol = afr_read_subvol_decide (local->inode, this, NULL);
if (read_subvol == -1) {
gf_msg (this->name, GF_LOG_WARNING, 0,
AFR_MSG_READ_SUBVOL_ERROR, "no read subvols for %s",
local->loc.path);
- for (i = 0; i < priv->child_count; i++) {
- if (!local->replies[i].valid ||
- local->replies[i].op_ret == -1)
- continue;
- read_subvol = i;
- break;
- }
+ if (spb_choice >= 0) {
+ read_subvol = spb_choice;
+ } else {
+ read_subvol = afr_first_up_child (frame, this);
+ }
}
unwind:
if (read_subvol == -1) {
- afr_inode_split_brain_choice_get (local->inode, this,
- &read_subvol);
- if (read_subvol == -1)
- read_subvol = 0;
+ if (spb_choice >= 0)
+ read_subvol = spb_choice;
+ else
+ read_subvol = afr_first_up_child (frame, this);
}
AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
index 1b2faf3..a70565c 100644
--- a/xlators/cluster/afr/src/afr-read-txn.c
+++ b/xlators/cluster/afr/src/afr-read-txn.c
@@ -193,12 +193,16 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
+ unsigned char *data = NULL;
+ unsigned char *metadata = NULL;
int read_subvol = -1;
int event_generation = 0;
int ret = -1;
priv = this->private;
local = frame->local;
+ data = alloca0 (priv->child_count);
+ metadata = alloca0 (priv->child_count);
afr_read_txn_wipe (frame, this);
@@ -213,10 +217,16 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
goto read;
}
-
local->transaction.type = type;
- ret = afr_inode_read_subvol_type_get (inode, this, local->readable,
- &event_generation, type);
+ if (local->op == GF_FOP_FSTAT || local->op == GF_FOP_STAT) {
+ ret = afr_inode_read_subvol_get (inode, this, data, metadata,
+ &event_generation);
+ AFR_INTERSECT (local->readable, data, metadata,
+ priv->child_count);
+ } else {
+ ret = afr_inode_read_subvol_type_get (inode, this, local->readable,
+ &event_generation, type);
+ }
if (ret == -1)
/* very first transaction on this inode */
goto refresh;
--
1.7.1