From 2d168099b85ef5f5d8b6f6a1ae55902b1360b441 Mon Sep 17 00:00:00 2001
From: Anuradha Talur <atalur@redhat.com>
Date: Mon, 3 Aug 2015 17:09:13 +0530
Subject: [PATCH 296/304] cluster/afr : Examine data/metadata readable for read-subvol

During lookup and discover, read_subvol is currently based only on
data_readable. read_subvol should be decided based on both data_readable
and metadata_readable.

Credits to Ravishankar N for the logic of afr_first_up_child
from http://review.gluster.org/10905/ .

> Change-Id: I98580b23c278172ee2902be08eeaafb6722e830c
> BUG: 1240244
> Signed-off-by: Anuradha Talur <atalur@redhat.com>
> Reviewed-on: http://review.gluster.org/11551
> Reviewed-by: Ravishankar N <ravishankar@redhat.com>
> Tested-by: Gluster Build System <jenkins@build.gluster.com>
> Reviewed-by: Krutika Dhananjay <kdhananj@redhat.com>
> Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>

BUG: 1238398
Signed-off-by: Anuradha Talur <atalur@redhat.com>
Change-Id: Ib4f1ee57dc6103dd608fc983d7cf7efab9dadf59
Reviewed-on: https://code.engineering.redhat.com/gerrit/56259
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
Tested-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
---
 .../replicate/bug-1238398-split-brain-resolution.t |   48 ++++++++++++
 xlators/cluster/afr/src/afr-common.c               |   77 +++++++++++++++-----
 xlators/cluster/afr/src/afr-read-txn.c             |   16 ++++-
 3 files changed, 118 insertions(+), 23 deletions(-)
 create mode 100644 tests/bugs/replicate/bug-1238398-split-brain-resolution.t

diff --git a/tests/bugs/replicate/bug-1238398-split-brain-resolution.t b/tests/bugs/replicate/bug-1238398-split-brain-resolution.t
new file mode 100644
index 0000000..7ba09f0
--- /dev/null
+++ b/tests/bugs/replicate/bug-1238398-split-brain-resolution.t
@@ -0,0 +1,48 @@
+#!/bin/bash
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+cleanup;
+
+function get_split_brain_status {
+        # $1: file to query. $(...) stays unquoted so echo joins multi-line getfattr output into one line.
+        echo $(getfattr -n replica.split-brain-status "$1") | cut -f2 -d"=" | sed -e 's/^"//'  -e 's/"$//'
+}
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1}
+TEST $CLI volume start $V0
+
+# Keep the self-heal daemon off for the whole run.
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0;
+
+TEST `echo "some-data" > $M0/metadata-split-brain.txt`
+
+# Create metadata split-brain: chmod 666 is issued while brick 0 is killed.
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST chmod 666 $M0/metadata-split-brain.txt
+
+TEST $CLI volume start $V0 force
+TEST kill_brick $V0 $H0 $B0/${V0}1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+# chmod 757 is issued while brick 1 is killed.
+TEST chmod 757 $M0/metadata-split-brain.txt
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+
+EXPECT 2 get_pending_heal_count $V0
+
+# Inspect the file in metadata split-brain; choices are named after the volume ($V0).
+EXPECT "data-split-brain:no metadata-split-brain:yes Choices:$V0-client-0,$V0-client-1" get_split_brain_status $M0/metadata-split-brain.txt
+TEST setfattr -n replica.split-brain-choice -v $V0-client-0 $M0/metadata-split-brain.txt
+# With client-0 chosen, stat must report the mode set while brick 1 was killed.
+EXPECT "757" stat -c %a $M0/metadata-split-brain.txt
+# With client-1 chosen, stat must report the mode set while brick 0 was killed.
+TEST setfattr -n replica.split-brain-choice -v $V0-client-1 $M0/metadata-split-brain.txt
+EXPECT "666" stat -c %a $M0/metadata-split-brain.txt
+
+cleanup;
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 2401cfd..7255179 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -368,7 +368,8 @@ afr_inode_get_readable (call_frame_t *frame, inode_t *inode, xlator_t *this,
 
         if (inode->ia_type == IA_IFDIR) {
                 /* For directories, allow even if it is in data split-brain. */
-                if (type == AFR_METADATA_TRANSACTION) {
+                if (type == AFR_METADATA_TRANSACTION ||
+                    local->op == GF_FOP_STAT || local->op == GF_FOP_FSTAT) {
                         if (!metadata_count)
                                 return -EIO;
                 }
@@ -1508,6 +1509,40 @@ afr_get_parent_read_subvol (xlator_t *this, inode_t *parent,
 
 }
 
+int
+afr_read_subvol_decide (inode_t *inode, xlator_t *this,
+                        afr_read_subvol_args_t *args)
+{
+        /* Query the data subvol first, then the metadata subvol. */
+        int data_idx = afr_data_subvol_get (inode, this, 0, 0, args);
+        int meta_idx = afr_metadata_subvol_get (inode, this, 0, 0, args);
+
+        /* Hand back the data subvol only when neither lookup gave -1;
+         * the metadata result serves purely as a gate and is never
+         * returned itself. */
+        if (data_idx != -1 && meta_idx != -1)
+                return data_idx;
+
+        return -1;
+}
+
+/* First child with a valid reply whose op_ret is 0; 0 if there is none. */
+static inline int
+afr_first_up_child (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local = frame->local;
+        afr_private_t *priv  = this->private;
+        int            child = 0;
+
+        while (child < priv->child_count) {
+                if (local->replies[child].valid &&
+                    local->replies[child].op_ret == 0)
+                        return child;
+                child++;
+        }
+        return 0;
+}
+
 static void
 afr_lookup_done (call_frame_t *frame, xlator_t *this)
 {
@@ -1623,13 +1658,13 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
                 gf_uuid_copy (args.gfid, read_gfid);
                 args.ia_type = ia_type;
 		if (afr_replies_interpret (frame, this, local->inode)) {
-			read_subvol = afr_data_subvol_get (local->inode, this,
-							   0, 0, &args);
+                        read_subvol = afr_read_subvol_decide (local->inode,
+                                                              this, &args);
 			afr_inode_read_subvol_reset (local->inode, this);
 			goto cant_interpret;
 		} else {
-			read_subvol = afr_data_subvol_get (local->inode, this,
-							   0, 0, &args);
+                        read_subvol = afr_data_subvol_get (local->inode, this,
+                                                           0, 0, &args);
 		}
 	} else {
 	cant_interpret:
@@ -1637,7 +1672,7 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
                         if (spb_choice >= 0)
                                 read_subvol = spb_choice;
                         else
-                                read_subvol = 0;
+                                read_subvol = afr_first_up_child (frame, this);
                 }
 		dict_del (replies[read_subvol].xdata, GF_CONTENT_KEY);
 	}
@@ -1649,7 +1684,7 @@ unwind:
                 if (spb_choice >= 0)
                         read_subvol = spb_choice;
                 else
-                        read_subvol = 0;
+                        read_subvol = afr_first_up_child (frame, this);
         }
         par_read_subvol = afr_get_parent_read_subvol (this, parent, replies,
                                                       readable);
@@ -2030,11 +2065,15 @@ afr_discover_done (call_frame_t *frame, xlator_t *this)
         afr_local_t         *local = NULL;
 	int                 i = -1;
 	int                 op_errno = 0;
-	int                 read_subvol = 0;
+	int                 spb_choice = -1;
+	int                 read_subvol = -1;
 
         priv  = this->private;
         local = frame->local;
 
+        afr_inode_split_brain_choice_get (local->inode, this,
+                                          &spb_choice);
+
 	for (i = 0; i < priv->child_count; i++) {
 		if (!local->replies[i].valid)
 			continue;
@@ -2052,27 +2091,25 @@ afr_discover_done (call_frame_t *frame, xlator_t *this)
 
 	afr_replies_interpret (frame, this, local->inode);
 
-	read_subvol = afr_data_subvol_get (local->inode, this, 0, 0, NULL);
+	read_subvol = afr_read_subvol_decide (local->inode, this, NULL);
 	if (read_subvol == -1) {
 	        gf_msg (this->name, GF_LOG_WARNING, 0,
                         AFR_MSG_READ_SUBVOL_ERROR, "no read subvols for %s",
 			local->loc.path);
 
-		for (i = 0; i < priv->child_count; i++) {
-			if (!local->replies[i].valid ||
-			    local->replies[i].op_ret == -1)
-				continue;
-			read_subvol = i;
-			break;
-		}
+                if (spb_choice >= 0) {
+                        read_subvol = spb_choice;
+                } else {
+                        read_subvol = afr_first_up_child (frame, this);
+                }
 	}
 
 unwind:
 	if (read_subvol == -1) {
-                afr_inode_split_brain_choice_get (local->inode, this,
-                                                        &read_subvol);
-                if (read_subvol == -1)
-                        read_subvol = 0;
+                if (spb_choice >= 0)
+                        read_subvol = spb_choice;
+                else
+                        read_subvol = afr_first_up_child (frame, this);
         }
 
 	AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
index 1b2faf3..a70565c 100644
--- a/xlators/cluster/afr/src/afr-read-txn.c
+++ b/xlators/cluster/afr/src/afr-read-txn.c
@@ -193,12 +193,16 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
 {
 	afr_local_t *local = NULL;
 	afr_private_t *priv = NULL;
+        unsigned char *data = NULL;
+        unsigned char *metadata = NULL;
 	int read_subvol = -1;
 	int event_generation = 0;
 	int ret = -1;
 
 	priv = this->private;
 	local = frame->local;
+        data = alloca0 (priv->child_count);
+        metadata = alloca0 (priv->child_count);
 
 	afr_read_txn_wipe (frame, this);
 
@@ -213,10 +217,16 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
                 goto read;
         }
 
-
 	local->transaction.type = type;
-	ret = afr_inode_read_subvol_type_get (inode, this, local->readable,
-					      &event_generation, type);
+        if (local->op == GF_FOP_FSTAT || local->op == GF_FOP_STAT) {
+                ret = afr_inode_read_subvol_get (inode, this, data, metadata,
+                                                 &event_generation);
+                AFR_INTERSECT (local->readable, data, metadata,
+                               priv->child_count);
+        } else {
+                ret = afr_inode_read_subvol_type_get (inode, this, local->readable,
+                                                      &event_generation, type);
+        }
 	if (ret == -1)
 		/* very first transaction on this inode */
 		goto refresh;
-- 
1.7.1