e7a346
From e3f6fc1ccff95145b94aea9405cd136ada9000bc Mon Sep 17 00:00:00 2001
e7a346
From: Ravishankar N <ravishankar@redhat.com>
e7a346
Date: Wed, 16 Aug 2017 18:01:17 +0530
e7a346
Subject: [PATCH 476/493] afr: add checks for allowing lookups
e7a346
e7a346
Patch in upstream master: https://review.gluster.org/#/c/glusterfs/+/17673/
e7a346
e7a346
Problem:
e7a346
In an arbiter volume, lookup was being served from one of the sink
e7a346
bricks (source brick was down). shard uses the iatt values from lookup cbk
e7a346
to calculate the size and block count, which in this case were incorrect
e7a346
values. shard_local_t->last_block was thus initialised to -1, resulting
e7a346
in an infinite while loop in shard_common_resolve_shards().
e7a346
e7a346
Fix:
e7a346
Use client quorum logic to allow or fail the lookups from afr if there
e7a346
are no readable subvolumes. So in replica-3 or arbiter vols, if there is
e7a346
no good copy or if quorum is not met, fail lookup with ENOTCONN.
e7a346
e7a346
With this fix, we are also removing support for quorum-reads xlator
e7a346
option. So if quorum is not met, neither read nor write txns are allowed
e7a346
and we fail the fop with ENOTCONN.
e7a346
e7a346
Change-Id: Ic65c00c24f77ece007328b421494eee62a505fa0
e7a346
BUG: 1362129
e7a346
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
e7a346
Reviewed-on: https://code.engineering.redhat.com/gerrit/158650
e7a346
Tested-by: RHGS Build Bot <nigelb@redhat.com>
e7a346
Reviewed-by: Karthik Subrahmanya <ksubrahm@redhat.com>
e7a346
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
e7a346
---
e7a346
 tests/basic/afr/quorum.t               |  23 ----
e7a346
 tests/bugs/replicate/bug-977797.t      |   2 +
e7a346
 xlators/cluster/afr/src/afr-common.c   | 244 +++++++++++++++++++++------------
e7a346
 xlators/cluster/afr/src/afr-read-txn.c |   3 +-
e7a346
 xlators/cluster/afr/src/afr.c          |   7 +-
e7a346
 xlators/cluster/afr/src/afr.h          |   1 -
e7a346
 6 files changed, 164 insertions(+), 116 deletions(-)
e7a346
e7a346
diff --git a/tests/basic/afr/quorum.t b/tests/basic/afr/quorum.t
e7a346
index 252e254..58116ba 100644
e7a346
--- a/tests/basic/afr/quorum.t
e7a346
+++ b/tests/basic/afr/quorum.t
e7a346
@@ -31,11 +31,7 @@ TEST $CLI volume set $V0 cluster.quorum-count 2
e7a346
 TEST test_write
e7a346
 TEST kill_brick $V0 $H0 $B0/${V0}1
e7a346
 TEST ! test_write
e7a346
-EXPECT "abc" cat $M0/b
e7a346
-TEST $CLI volume set $V0 cluster.quorum-reads on
e7a346
-EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "1" mount_get_option_value $M0 $V0-replicate-0 quorum-reads
e7a346
 TEST ! cat $M0/b
e7a346
-TEST $CLI volume reset $V0 cluster.quorum-reads
e7a346
 
e7a346
 TEST $CLI volume set $V0 cluster.quorum-type auto
e7a346
 EXPECT auto volume_option $V0 cluster.quorum-type
e7a346
@@ -44,11 +40,7 @@ EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
e7a346
 TEST test_write
e7a346
 TEST kill_brick $V0 $H0 $B0/${V0}1
e7a346
 TEST ! test_write
e7a346
-EXPECT "abc" cat $M0/b
e7a346
-TEST $CLI volume set $V0 cluster.quorum-reads on
e7a346
-EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "1" mount_get_option_value $M0 $V0-replicate-0 quorum-reads
e7a346
 TEST ! cat $M0/b
e7a346
-TEST $CLI volume reset $V0 cluster.quorum-reads
e7a346
 
e7a346
 TEST $CLI volume set $V0 cluster.quorum-type none
e7a346
 EXPECT none volume_option $V0 cluster.quorum-type
e7a346
@@ -57,11 +49,6 @@ TEST test_write
e7a346
 TEST $CLI volume reset $V0 cluster.quorum-type
e7a346
 TEST test_write
e7a346
 EXPECT "abc" cat $M0/b
e7a346
-TEST $CLI volume set $V0 cluster.quorum-reads on
e7a346
-EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "1" mount_get_option_value $M0 $V0-replicate-0 quorum-reads
e7a346
-EXPECT "abc" cat $M0/b
e7a346
-TEST $CLI volume reset $V0 cluster.quorum-reads
e7a346
-
e7a346
 
e7a346
 cleanup;
e7a346
 TEST glusterd;
e7a346
@@ -86,24 +73,14 @@ TEST $CLI volume set $V0 cluster.quorum-count 3
e7a346
 TEST test_write
e7a346
 TEST kill_brick $V0 $H0 $B0/${V0}1
e7a346
 TEST ! test_write
e7a346
-EXPECT "abc" cat $M0/b
e7a346
-TEST $CLI volume set $V0 cluster.quorum-reads on
e7a346
-EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "1" mount_get_option_value $M0 $V0-replicate-0 quorum-reads
e7a346
 TEST ! cat $M0/b
e7a346
-TEST $CLI volume reset $V0 cluster.quorum-reads
e7a346
-
e7a346
 
e7a346
 TEST $CLI volume set $V0 cluster.quorum-type auto
e7a346
 EXPECT auto volume_option $V0 cluster.quorum-type
e7a346
 TEST test_write
e7a346
 TEST kill_brick $V0 $H0 $B0/${V0}3
e7a346
 TEST ! test_write
e7a346
-EXPECT "abc" cat $M0/b
e7a346
-TEST $CLI volume set $V0 cluster.quorum-reads on
e7a346
-EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "1" mount_get_option_value $M0 $V0-replicate-0 quorum-reads
e7a346
 TEST ! cat $M0/b
e7a346
-TEST $CLI volume reset $V0 cluster.quorum-reads
e7a346
-
e7a346
 
e7a346
 TEST $CLI volume set $V0 cluster.quorum-type none
e7a346
 EXPECT none volume_option $V0 cluster.quorum-type
e7a346
diff --git a/tests/bugs/replicate/bug-977797.t b/tests/bugs/replicate/bug-977797.t
e7a346
index ea9a98a..fee8205 100755
e7a346
--- a/tests/bugs/replicate/bug-977797.t
e7a346
+++ b/tests/bugs/replicate/bug-977797.t
e7a346
@@ -53,6 +53,8 @@ TEST chmod 757 $M0/a/file
e7a346
 TEST $CLI volume start $V0 force
e7a346
 EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1;
e7a346
 
e7a346
+#Trigger entry heal of $M0/a
e7a346
+getfattr -n user.nosuchattr $M0/a
e7a346
 dd if=$M0/a/file of=/dev/null bs=1024k
e7a346
 #read fails, but heal is triggered.
e7a346
 TEST [ $? -ne 0 ]
e7a346
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
e7a346
index 10d9620..231de9d 100644
e7a346
--- a/xlators/cluster/afr/src/afr-common.c
e7a346
+++ b/xlators/cluster/afr/src/afr-common.c
e7a346
@@ -47,9 +47,7 @@
e7a346
 int32_t
e7a346
 afr_quorum_errno (afr_private_t *priv)
e7a346
 {
e7a346
-        if (priv->quorum_reads)
e7a346
-                return ENOTCONN;
e7a346
-        return EROFS;
e7a346
+        return ENOTCONN;
e7a346
 }
e7a346
 
e7a346
 int
e7a346
@@ -1154,8 +1152,6 @@ afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode,
e7a346
 	return ret;
e7a346
 }
e7a346
 
e7a346
-
e7a346
-
e7a346
 int
e7a346
 afr_refresh_selfheal_done (int ret, call_frame_t *heal, void *opaque)
e7a346
 {
e7a346
@@ -1726,6 +1722,29 @@ afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this,
e7a346
 	return ret;
e7a346
 }
e7a346
 
e7a346
+void
e7a346
+afr_readables_intersect_get (inode_t *inode, xlator_t *this, int *event,
e7a346
+                             unsigned char *intersection)
e7a346
+{
e7a346
+	afr_private_t *priv = NULL;
e7a346
+	unsigned char *data_readable = NULL;
e7a346
+	unsigned char *metadata_readable = NULL;
e7a346
+	unsigned char *intersect = NULL;
e7a346
+
e7a346
+	priv = this->private;
e7a346
+	data_readable = alloca0 (priv->child_count);
e7a346
+	metadata_readable = alloca0 (priv->child_count);
e7a346
+	intersect = alloca0 (priv->child_count);
e7a346
+
e7a346
+	afr_inode_read_subvol_get (inode, this, data_readable,
e7a346
+                               metadata_readable, event);
e7a346
+
e7a346
+	AFR_INTERSECT (intersect, data_readable, metadata_readable,
e7a346
+		       priv->child_count);
e7a346
+        if (intersection)
e7a346
+                memcpy (intersection, intersect,
e7a346
+                        sizeof (*intersection) * priv->child_count);
e7a346
+}
e7a346
 
e7a346
 int
e7a346
 afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p,
e7a346
@@ -1734,8 +1753,6 @@ afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p,
e7a346
                      afr_read_subvol_args_t *args)
e7a346
 {
e7a346
 	afr_private_t *priv = NULL;
e7a346
-	unsigned char *data_readable = NULL;
e7a346
-	unsigned char *metadata_readable = NULL;
e7a346
 	unsigned char *readable = NULL;
e7a346
 	unsigned char *intersection = NULL;
e7a346
 	int subvol = -1;
e7a346
@@ -1744,17 +1761,11 @@ afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p,
e7a346
 	priv = this->private;
e7a346
 
e7a346
 	readable = alloca0 (priv->child_count);
e7a346
-	data_readable = alloca0 (priv->child_count);
e7a346
-	metadata_readable = alloca0 (priv->child_count);
e7a346
 	intersection = alloca0 (priv->child_count);
e7a346
 
e7a346
 	afr_inode_read_subvol_type_get (inode, this, readable, &event, type);
e7a346
 
e7a346
-	afr_inode_read_subvol_get (inode, this, data_readable, metadata_readable,
e7a346
-				   &event);
e7a346
-
e7a346
-	AFR_INTERSECT (intersection, data_readable, metadata_readable,
e7a346
-		       priv->child_count);
e7a346
+        afr_readables_intersect_get (inode, this, &event, intersection);
e7a346
 
e7a346
 	if (AFR_COUNT (intersection, priv->child_count) > 0)
e7a346
 		subvol = afr_read_subvol_select_by_policy (inode, this,
e7a346
@@ -2188,18 +2199,28 @@ afr_get_parent_read_subvol (xlator_t *this, inode_t *parent,
e7a346
 
e7a346
 int
e7a346
 afr_read_subvol_decide (inode_t *inode, xlator_t *this,
e7a346
-                        afr_read_subvol_args_t *args)
e7a346
+                        afr_read_subvol_args_t *args, unsigned char *readable)
e7a346
 {
e7a346
-        int data_subvol  = -1;
e7a346
-        int mdata_subvol = -1;
e7a346
+        int event = 0;
e7a346
+        afr_private_t *priv = NULL;
e7a346
+        unsigned char *intersection = NULL;
e7a346
+
e7a346
+        priv = this->private;
e7a346
+        intersection = alloca0 (priv->child_count);
e7a346
+
e7a346
+        afr_readables_intersect_get (inode, this, &event, intersection);
e7a346
 
e7a346
-        data_subvol = afr_data_subvol_get (inode, this, NULL, NULL, NULL, args);
e7a346
-        mdata_subvol = afr_metadata_subvol_get (inode, this,
e7a346
-                                                NULL, NULL, NULL, args);
e7a346
-        if (data_subvol == -1 || mdata_subvol == -1)
e7a346
+        if (AFR_COUNT (intersection, priv->child_count) <= 0) {
e7a346
+                /* TODO: If we have one brick with valid data_readable and
e7a346
+                 * another with metadata_readable, try to send an iatt with
e7a346
+                 * valid bits from both.*/
e7a346
                 return -1;
e7a346
+        }
e7a346
 
e7a346
-        return data_subvol;
e7a346
+        memcpy (readable, intersection, sizeof (*readable) * priv->child_count);
e7a346
+
e7a346
+        return afr_read_subvol_select_by_policy (inode, this, intersection,
e7a346
+                                                 args);
e7a346
 }
e7a346
 
e7a346
 static inline int
e7a346
@@ -2216,7 +2237,49 @@ afr_first_up_child (call_frame_t *frame, xlator_t *this)
e7a346
                 if (local->replies[i].valid &&
e7a346
                     local->replies[i].op_ret == 0)
e7a346
                         return i;
e7a346
-        return 0;
e7a346
+        return -1;
e7a346
+}
e7a346
+
e7a346
+static void
e7a346
+afr_attempt_readsubvol_set (call_frame_t *frame, xlator_t *this,
e7a346
+                            unsigned char *success_replies,
e7a346
+                            unsigned char *data_readable, int *read_subvol)
e7a346
+{
e7a346
+        afr_private_t *priv = NULL;
e7a346
+        afr_local_t *local = NULL;
e7a346
+        int spb_choice = -1;
e7a346
+        int child_count = -1;
e7a346
+
e7a346
+        if (*read_subvol != -1)
e7a346
+                return;
e7a346
+
e7a346
+        priv = this->private;
e7a346
+        local = frame->local;
e7a346
+        child_count = priv->child_count;
e7a346
+
e7a346
+        afr_inode_split_brain_choice_get (local->inode, this,
e7a346
+                                          &spb_choice);
e7a346
+        if ((spb_choice >= 0) &&
e7a346
+            (AFR_COUNT(success_replies, child_count) == child_count)) {
e7a346
+                *read_subvol = spb_choice;
e7a346
+        } else if (!priv->quorum_count) {
e7a346
+                *read_subvol = afr_first_up_child (frame, this);
e7a346
+        } else if (priv->quorum_count &&
e7a346
+                   afr_has_quorum (data_readable, this)) {
e7a346
+                /* read_subvol is guaranteed to be valid if we hit this path. */
e7a346
+                *read_subvol = afr_first_up_child (frame, this);
e7a346
+        } else {
e7a346
+               /* If quorum is enabled and we do not have a
e7a346
+                  readable yet, it means all good copies are down.
e7a346
+               */
e7a346
+               local->op_ret = -1;
e7a346
+               local->op_errno = ENOTCONN;
e7a346
+               gf_msg (this->name, GF_LOG_WARNING, 0,
e7a346
+                       AFR_MSG_READ_SUBVOL_ERROR, "no read "
e7a346
+                       "subvols for %s", local->loc.path);
e7a346
+        }
e7a346
+        if (*read_subvol >= 0)
e7a346
+                dict_del (local->replies[*read_subvol].xdata, GF_CONTENT_KEY);
e7a346
 }
e7a346
 
e7a346
 static void
e7a346
@@ -2230,13 +2293,13 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
e7a346
         int                 par_read_subvol = 0;
e7a346
         int                 ret         = -1;
e7a346
 	unsigned char      *readable = NULL;
e7a346
+        unsigned char      *success_replies = NULL;
e7a346
 	int                 event = 0;
e7a346
 	struct afr_reply   *replies = NULL;
e7a346
 	uuid_t              read_gfid = {0, };
e7a346
 	gf_boolean_t        locked_entry = _gf_false;
e7a346
 	gf_boolean_t        can_interpret = _gf_true;
e7a346
         inode_t            *parent = NULL;
e7a346
-        int                 spb_choice = -1;
e7a346
         ia_type_t           ia_type = IA_INVAL;
e7a346
         afr_read_subvol_args_t args = {0,};
e7a346
         char               *gfid_heal_msg = NULL;
e7a346
@@ -2250,11 +2313,12 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
e7a346
                                                   this);
e7a346
 
e7a346
 	readable = alloca0 (priv->child_count);
e7a346
+	success_replies = alloca0 (priv->child_count);
e7a346
 
e7a346
 	afr_inode_read_subvol_get (parent, this, readable, NULL, &event);
e7a346
+        par_read_subvol = afr_get_parent_read_subvol (this, parent, replies,
e7a346
+                                                      readable);
e7a346
 
e7a346
-        afr_inode_split_brain_choice_get (local->inode, this,
e7a346
-                                                &spb_choice);
e7a346
 	/* First, check if we have a gfid-change from somewhere,
e7a346
 	   If so, propagate that so that a fresh lookup can be
e7a346
 	   issued
e7a346
@@ -2262,13 +2326,17 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
e7a346
         if (local->cont.lookup.needs_fresh_lookup) {
e7a346
                 local->op_ret = -1;
e7a346
                 local->op_errno = ESTALE;
e7a346
-                goto unwind;
e7a346
+                goto error;
e7a346
         }
e7a346
 
e7a346
 	op_errno = afr_final_errno (frame->local, this->private);
e7a346
 	local->op_errno = op_errno;
e7a346
 
e7a346
 	read_subvol = -1;
e7a346
+        for (i = 0; i < priv->child_count; i++)
e7a346
+                if (replies[i].valid && replies[i].op_ret == 0)
e7a346
+                        success_replies[i] = 1;
e7a346
+
e7a346
 	for (i = 0; i < priv->child_count; i++) {
e7a346
 		if (!replies[i].valid)
e7a346
 			continue;
e7a346
@@ -2277,9 +2345,9 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
e7a346
 		    replies[i].op_errno == ENOENT) {
e7a346
 			/* Second, check entry is still
e7a346
 			   "underway" in creation */
e7a346
-			local->op_ret = -1;
e7a346
-			local->op_errno = ENOENT;
e7a346
-			goto unwind;
e7a346
+                        local->op_ret = -1;
e7a346
+                        local->op_errno = ENOENT;
e7a346
+                        goto error;
e7a346
 		}
e7a346
 
e7a346
 		if (replies[i].op_ret == -1)
e7a346
@@ -2293,8 +2361,8 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
e7a346
 		}
e7a346
 	}
e7a346
 
e7a346
-	if (read_subvol == -1)
e7a346
-		goto unwind;
e7a346
+        if (read_subvol == -1)
e7a346
+                goto error;
e7a346
 	/* We now have a read_subvol, which is readable[] (if there
e7a346
 	   were any). Next we look for GFID mismatches. We don't
e7a346
 	   consider a GFID mismatch as an error if read_subvol is
e7a346
@@ -2318,58 +2386,61 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
e7a346
 		if (readable[read_subvol] && !readable[i])
e7a346
 			continue;
e7a346
 
e7a346
+                /* If we were called from glfsheal and there is still a gfid
e7a346
+                 * mismatch, succeed the lookup and let glfsheal print the
e7a346
+                 * response via gfid-heal-msg.*/
e7a346
+                if (!dict_get_str (local->xattr_req, "gfid-heal-msg",
e7a346
+                                   &gfid_heal_msg))
e7a346
+                        goto cant_interpret;
e7a346
+
e7a346
 		/* LOG ERROR */
e7a346
 		local->op_ret = -1;
e7a346
 		local->op_errno = EIO;
e7a346
-		goto unwind;
e7a346
+		goto error;
e7a346
 	}
e7a346
 
e7a346
 	/* Forth, for the finalized GFID, pick the best subvolume
e7a346
 	   to return stats from.
e7a346
 	*/
e7a346
+        read_subvol = -1;
e7a346
+        memset (readable, 0, sizeof (*readable) * priv->child_count);
e7a346
 	if (can_interpret) {
e7a346
+                if (!afr_has_quorum (success_replies, this))
e7a346
+                        goto cant_interpret;
e7a346
 		/* It is safe to call afr_replies_interpret() because we have
e7a346
 		   a response from all the UP subvolumes and all of them resolved
e7a346
 		   to the same GFID
e7a346
 		*/
e7a346
                 gf_uuid_copy (args.gfid, read_gfid);
e7a346
                 args.ia_type = ia_type;
e7a346
-		if (afr_replies_interpret (frame, this, local->inode, NULL)) {
e7a346
-                        read_subvol = afr_read_subvol_decide (local->inode,
e7a346
-                                                              this, &args);
e7a346
+		ret = afr_replies_interpret (frame, this, local->inode, NULL);
e7a346
+                read_subvol = afr_read_subvol_decide (local->inode, this, &args,
e7a346
+                                                      readable);
e7a346
+                if (read_subvol == -1)
e7a346
+                        goto cant_interpret;
e7a346
+                if (ret) {
e7a346
 			afr_inode_event_gen_reset (local->inode, this);
e7a346
-			goto cant_interpret;
e7a346
-		} else {
e7a346
-                        read_subvol = afr_data_subvol_get (local->inode, this,
e7a346
-                                                       NULL, NULL, NULL, &args);
e7a346
-		}
e7a346
+                        dict_del (local->replies[read_subvol].xdata,
e7a346
+                                  GF_CONTENT_KEY);
e7a346
+                }
e7a346
 	} else {
e7a346
 	cant_interpret:
e7a346
+                afr_attempt_readsubvol_set (frame, this, success_replies,
e7a346
+                                            readable, &read_subvol);
e7a346
                 if (read_subvol == -1) {
e7a346
-                        if (spb_choice >= 0)
e7a346
-                                read_subvol = spb_choice;
e7a346
-                        else
e7a346
-                                read_subvol = afr_first_up_child (frame, this);
e7a346
+                        goto error;
e7a346
                 }
e7a346
-		dict_del (replies[read_subvol].xdata, GF_CONTENT_KEY);
e7a346
 	}
e7a346
 
e7a346
 	afr_handle_quota_size (frame, this);
e7a346
 
e7a346
-unwind:
e7a346
         afr_set_need_heal (this, local);
e7a346
-        if (read_subvol == -1) {
e7a346
-                if (spb_choice >= 0)
e7a346
-                        read_subvol = spb_choice;
e7a346
-                else
e7a346
-                        read_subvol = afr_first_up_child (frame, this);
e7a346
-
e7a346
-        }
e7a346
-        par_read_subvol = afr_get_parent_read_subvol (this, parent, replies,
e7a346
-                                                      readable);
e7a346
         if (AFR_IS_ARBITER_BRICK (priv, read_subvol) && local->op_ret == 0) {
e7a346
-                        local->op_ret = -1;
e7a346
-                        local->op_errno = ENOTCONN;
e7a346
+                local->op_ret = -1;
e7a346
+                local->op_errno = ENOTCONN;
e7a346
+                gf_msg_debug(this->name, 0, "Arbiter cannot be a read subvol "
e7a346
+                             "for %s", local->loc.path);
e7a346
+                goto error;
e7a346
         }
e7a346
 
e7a346
         ret = dict_get_str (local->xattr_req, "gfid-heal-msg", &gfid_heal_msg);
e7a346
@@ -2389,6 +2460,11 @@ unwind:
e7a346
 			  local->inode, &local->replies[read_subvol].poststat,
e7a346
 			  local->replies[read_subvol].xdata,
e7a346
 			  &local->replies[par_read_subvol].postparent);
e7a346
+        return;
e7a346
+
e7a346
+error:
e7a346
+        AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, NULL,
e7a346
+                          NULL, NULL, NULL);
e7a346
 }
e7a346
 
e7a346
 /*
e7a346
@@ -2904,55 +2980,54 @@ afr_discover_done (call_frame_t *frame, xlator_t *this)
e7a346
         afr_local_t         *local = NULL;
e7a346
 	int                 i = -1;
e7a346
 	int                 op_errno = 0;
e7a346
-	int                 spb_choice = -1;
e7a346
 	int                 read_subvol = -1;
e7a346
+        unsigned char *data_readable = NULL;
e7a346
+        unsigned char *success_replies = NULL;
e7a346
 
e7a346
         priv  = this->private;
e7a346
         local = frame->local;
e7a346
-
e7a346
-        afr_inode_split_brain_choice_get (local->inode, this,
e7a346
-                                          &spb_choice);
e7a346
+        data_readable = alloca0 (priv->child_count);
e7a346
+        success_replies = alloca0 (priv->child_count);
e7a346
 
e7a346
 	for (i = 0; i < priv->child_count; i++) {
e7a346
 		if (!local->replies[i].valid)
e7a346
 			continue;
e7a346
-		if (local->replies[i].op_ret == 0)
e7a346
+		if (local->replies[i].op_ret == 0) {
e7a346
+                        success_replies[i] = 1;
e7a346
 			local->op_ret = 0;
e7a346
+                }
e7a346
 	}
e7a346
 
e7a346
 	op_errno = afr_final_errno (frame->local, this->private);
e7a346
 
e7a346
         if (local->op_ret < 0) {
e7a346
-		local->op_errno = op_errno;
e7a346
-		local->op_ret = -1;
e7a346
-                goto unwind;
e7a346
+                AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL,
e7a346
+                                  NULL, NULL);
e7a346
+                return;
e7a346
 	}
e7a346
 
e7a346
-	afr_replies_interpret (frame, this, local->inode, NULL);
e7a346
+        if (!afr_has_quorum (success_replies, this))
e7a346
+                goto unwind;
e7a346
 
e7a346
-	read_subvol = afr_read_subvol_decide (local->inode, this, NULL);
e7a346
-	if (read_subvol == -1) {
e7a346
-	        gf_msg (this->name, GF_LOG_WARNING, 0,
e7a346
-                        AFR_MSG_READ_SUBVOL_ERROR, "no read subvols for %s",
e7a346
-			local->loc.path);
e7a346
+	afr_replies_interpret (frame, this, local->inode, NULL);
e7a346
 
e7a346
-                if (spb_choice >= 0) {
e7a346
-                        read_subvol = spb_choice;
e7a346
-                } else {
e7a346
-                        read_subvol = afr_first_up_child (frame, this);
e7a346
-                }
e7a346
-	}
e7a346
+	read_subvol = afr_read_subvol_decide (local->inode, this, NULL,
e7a346
+                                          data_readable);
e7a346
 
e7a346
 unwind:
e7a346
+        afr_attempt_readsubvol_set (frame, this, success_replies, data_readable,
e7a346
+                                    &read_subvol);
e7a346
 	if (read_subvol == -1) {
e7a346
-                if (spb_choice >= 0)
e7a346
-                        read_subvol = spb_choice;
e7a346
-                else
e7a346
-                        read_subvol = afr_first_up_child (frame, this);
e7a346
+                AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
e7a346
+                                  NULL, NULL, NULL, NULL);
e7a346
+                return;
e7a346
         }
e7a346
+
e7a346
         if (AFR_IS_ARBITER_BRICK (priv, read_subvol) && local->op_ret == 0) {
e7a346
-                        local->op_ret = -1;
e7a346
-                        local->op_errno = ENOTCONN;
e7a346
+                local->op_ret = -1;
e7a346
+                local->op_errno = ENOTCONN;
e7a346
+	        gf_msg_debug (this->name, 0, "Arbiter cannot be a read subvol "
e7a346
+                              "for %s", local->loc.path);
e7a346
         }
e7a346
 
e7a346
 	AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
e7a346
@@ -4646,7 +4721,6 @@ afr_priv_dump (xlator_t *this)
e7a346
         gf_proc_dump_write("read_child", "%d", priv->read_child);
e7a346
         gf_proc_dump_write("favorite_child", "%d", priv->favorite_child);
e7a346
         gf_proc_dump_write("wait_count", "%u", priv->wait_count);
e7a346
-        gf_proc_dump_write("quorum-reads", "%d", priv->quorum_reads);
e7a346
         gf_proc_dump_write("heal-wait-queue-length", "%d",
e7a346
                            priv->heal_wait_qlen);
e7a346
         gf_proc_dump_write("heal-waiters", "%d", priv->heal_waiters);
e7a346
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
e7a346
index 50e8040..f6c491b 100644
e7a346
--- a/xlators/cluster/afr/src/afr-read-txn.c
e7a346
+++ b/xlators/cluster/afr/src/afr-read-txn.c
e7a346
@@ -193,8 +193,7 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
e7a346
 	local->inode = inode_ref (inode);
e7a346
         local->is_read_txn = _gf_true;
e7a346
 
e7a346
-        if (priv->quorum_reads &&
e7a346
-            priv->quorum_count && !afr_has_quorum (priv->child_up, this)) {
e7a346
+        if (priv->quorum_count && !afr_has_quorum (local->child_up, this)) {
e7a346
                 local->op_ret = -1;
e7a346
                 local->op_errno = ENOTCONN;
e7a346
                 read_subvol = -1;
e7a346
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
e7a346
index 0122b7f..1b738c0 100644
e7a346
--- a/xlators/cluster/afr/src/afr.c
e7a346
+++ b/xlators/cluster/afr/src/afr.c
e7a346
@@ -267,8 +267,6 @@ reconfigure (xlator_t *this, dict_t *options)
e7a346
         GF_OPTION_RECONF ("heal-timeout", priv->shd.timeout, options,
e7a346
                           int32, out);
e7a346
 
e7a346
-        GF_OPTION_RECONF ("quorum-reads", priv->quorum_reads, options,
e7a346
-                          bool, out);
e7a346
         GF_OPTION_RECONF ("consistent-metadata", priv->consistent_metadata,
e7a346
                           options, bool, out);
e7a346
 
e7a346
@@ -531,7 +529,6 @@ init (xlator_t *this)
e7a346
 	GF_OPTION_INIT ("iam-self-heal-daemon", priv->shd.iamshd, bool, out);
e7a346
         GF_OPTION_INIT ("heal-timeout", priv->shd.timeout, int32, out);
e7a346
 
e7a346
-        GF_OPTION_INIT ("quorum-reads", priv->quorum_reads, bool, out);
e7a346
         GF_OPTION_INIT ("consistent-metadata", priv->consistent_metadata, bool,
e7a346
                         out);
e7a346
         GF_OPTION_INIT ("consistent-io", priv->consistent_io, bool, out);
e7a346
@@ -965,8 +962,8 @@ struct volume_options options[] = {
e7a346
         { .key = {"quorum-reads"},
e7a346
           .type = GF_OPTION_TYPE_BOOL,
e7a346
           .default_value = "no",
e7a346
-          .description = "If quorum-reads is \"true\" only allow reads if "
e7a346
-                         "quorum is met when quorum is enabled.",
e7a346
+          .description = "This option has been removed. Reads are not allowed "
e7a346
+                          "if quorum is not met.",
e7a346
         },
e7a346
         { .key  = {"node-uuid"},
e7a346
           .type = GF_OPTION_TYPE_STR,
e7a346
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
e7a346
index af9dbc8..7010e9b 100644
e7a346
--- a/xlators/cluster/afr/src/afr.h
e7a346
+++ b/xlators/cluster/afr/src/afr.h
e7a346
@@ -131,7 +131,6 @@ typedef struct _afr_private {
e7a346
         gf_boolean_t      pre_op_compat;      /* on/off */
e7a346
 	uint32_t          post_op_delay_secs;
e7a346
         unsigned int      quorum_count;
e7a346
-        gf_boolean_t      quorum_reads;
e7a346
 
e7a346
         char                   vol_uuid[UUID_SIZE + 1];
e7a346
         int32_t                *last_event;
e7a346
-- 
e7a346
1.8.3.1
e7a346