d1681e
From 46a4c05ce998a72a006f79ddac4e1ad2384e66bb Mon Sep 17 00:00:00 2001
d1681e
From: Pranith Kumar K <pkarampu@redhat.com>
d1681e
Date: Mon, 4 Sep 2017 16:57:25 +0530
d1681e
Subject: [PATCH 094/128] cluster/afr: Fail open on split-brain
d1681e
d1681e
Problem:
d1681e
Append on a file with split-brain succeeds. Open is intercepted by open-behind;
d1681e
when a write comes on the file, open-behind does open+write. Open succeeds
d1681e
because afr doesn't fail it. Then write succeeds because write-behind
d1681e
intercepts it. Flush is also intercepted by write-behind, so the application
d1681e
never gets to know that the write failed.
d1681e
d1681e
Fix:
d1681e
Fail open on split-brain, so that when open-behind does open+write, open fails,
d1681e
which leads to write failure. The application will know about this failure.
d1681e
d1681e
 > Change-Id: I4bff1c747c97bb2925d6987f4ced5f1ce75dbc15
d1681e
 > BUG: 1294051
d1681e
 > Upstream-patch: https://review.gluster.org/13075
d1681e
 > Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
d1681e
d1681e
Change-Id: I4bff1c747c97bb2925d6987f4ced5f1ce75dbc15
d1681e
BUG: 1277924
d1681e
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
d1681e
Reviewed-on: https://code.engineering.redhat.com/gerrit/124882
d1681e
Tested-by: RHGS Build Bot <nigelb@redhat.com>
d1681e
Reviewed-by: Ravishankar Narayanankutty <ravishankar@redhat.com>
d1681e
---
d1681e
 tests/basic/afr/split-brain-open.t               | 38 ++++++++++
d1681e
 tests/bugs/nfs/bug-974972.t                      |  1 +
d1681e
 xlators/cluster/afr/src/afr-common.c             | 77 ++++++++++++++++++--
d1681e
 xlators/cluster/afr/src/afr-inode-write.c        |  2 +-
d1681e
 xlators/cluster/afr/src/afr-open.c               | 93 +++++++++++++++++-------
d1681e
 xlators/cluster/afr/src/afr-self-heal-common.c   | 11 ++-
d1681e
 xlators/cluster/afr/src/afr-self-heal-data.c     | 58 ++++++++++++++-
d1681e
 xlators/cluster/afr/src/afr-self-heal-metadata.c |  4 +-
d1681e
 xlators/cluster/afr/src/afr-self-heal-name.c     |  2 +-
d1681e
 xlators/cluster/afr/src/afr-self-heal.h          |  2 +-
d1681e
 xlators/cluster/afr/src/afr-self-heald.c         |  6 +-
d1681e
 xlators/cluster/afr/src/afr-transaction.c        | 43 +----------
d1681e
 xlators/cluster/afr/src/afr.h                    |  6 +-
d1681e
 13 files changed, 248 insertions(+), 95 deletions(-)
d1681e
 create mode 100644 tests/basic/afr/split-brain-open.t
d1681e
d1681e
diff --git a/tests/basic/afr/split-brain-open.t b/tests/basic/afr/split-brain-open.t
d1681e
new file mode 100644
d1681e
index 0000000..9b2f285
d1681e
--- /dev/null
d1681e
+++ b/tests/basic/afr/split-brain-open.t
d1681e
@@ -0,0 +1,38 @@
d1681e
+#!/bin/bash
d1681e
+. $(dirname $0)/../../include.rc
d1681e
+. $(dirname $0)/../../volume.rc
d1681e
+cleanup;
d1681e
+
d1681e
+TEST glusterd
d1681e
+TEST pidof glusterd
d1681e
+TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1}
d1681e
+TEST $CLI volume start $V0
d1681e
+
d1681e
+#Disable self-heal-daemon
d1681e
+TEST $CLI volume heal $V0 disable
d1681e
+
d1681e
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0;
d1681e
+
d1681e
+TEST touch $M0/data-split-brain.txt
d1681e
+
d1681e
+#Create data split-brain
d1681e
+TEST kill_brick $V0 $H0 $B0/${V0}0
d1681e
+
d1681e
+`echo "brick1_alive" > $M0/data-split-brain.txt`
d1681e
+TEST [ $? == 0 ];
d1681e
+
d1681e
+TEST $CLI volume start $V0 force
d1681e
+TEST kill_brick $V0 $H0 $B0/${V0}1
d1681e
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
d1681e
+
d1681e
+`echo "brick0_alive" > $M0/data-split-brain.txt`
d1681e
+TEST [ $? == 0 ];
d1681e
+
d1681e
+TEST $CLI volume start $V0 force
d1681e
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
d1681e
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
d1681e
+
d1681e
+echo "all-alive" >> $M0/data-split-brain.txt
d1681e
+TEST [ $? != 0 ];
d1681e
+
d1681e
+cleanup;
d1681e
diff --git a/tests/bugs/nfs/bug-974972.t b/tests/bugs/nfs/bug-974972.t
d1681e
index d05e7df..7047825 100755
d1681e
--- a/tests/bugs/nfs/bug-974972.t
d1681e
+++ b/tests/bugs/nfs/bug-974972.t
d1681e
@@ -11,6 +11,7 @@ TEST glusterd
d1681e
 TEST pidof glusterd
d1681e
 TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1}
d1681e
 TEST $CLI volume set $V0 self-heal-daemon off
d1681e
+TEST $CLI volume set $V0 cluster.eager-lock off
d1681e
 TEST $CLI volume set $V0 nfs.disable false
d1681e
 TEST $CLI volume start $V0
d1681e
 EXPECT_WITHIN $NFS_EXPORT_TIMEOUT "1" is_nfs_export_available;
d1681e
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
d1681e
index a8ba5a0..692f198 100644
d1681e
--- a/xlators/cluster/afr/src/afr-common.c
d1681e
+++ b/xlators/cluster/afr/src/afr-common.c
d1681e
@@ -254,8 +254,9 @@ __afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local,
d1681e
                         local->transaction.in_flight_sb = _gf_true;
d1681e
                         metadatamap |= (1 << index);
d1681e
                 }
d1681e
-                if (metadatamap_old != metadatamap)
d1681e
+                if (metadatamap_old != metadatamap) {
d1681e
                         event = 0;
d1681e
+                }
d1681e
                 break;
d1681e
 
d1681e
         case AFR_DATA_TRANSACTION:
d1681e
@@ -283,19 +284,71 @@ __afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local,
d1681e
         return ret;
d1681e
 }
d1681e
 
d1681e
-int
d1681e
-afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local, inode_t *inode)
d1681e
+gf_boolean_t
d1681e
+afr_is_symmetric_error (call_frame_t *frame, xlator_t *this)
d1681e
 {
d1681e
-        int            ret  = -1;
d1681e
+        afr_local_t *local = NULL;
d1681e
         afr_private_t *priv = NULL;
d1681e
+        int op_errno = 0;
d1681e
+        int i_errno = 0;
d1681e
+        gf_boolean_t matching_errors = _gf_true;
d1681e
+        int i = 0;
d1681e
+
d1681e
+        priv = this->private;
d1681e
+        local = frame->local;
d1681e
+
d1681e
+        for (i = 0; i < priv->child_count; i++) {
d1681e
+                if (!local->replies[i].valid)
d1681e
+                        continue;
d1681e
+                if (local->replies[i].op_ret != -1) {
d1681e
+                        /* Operation succeeded on at least one subvol,
d1681e
+                           so it is not a failed-everywhere situation.
d1681e
+                        */
d1681e
+                        matching_errors = _gf_false;
d1681e
+                        break;
d1681e
+                }
d1681e
+                i_errno = local->replies[i].op_errno;
d1681e
+
d1681e
+                if (i_errno == ENOTCONN) {
d1681e
+                        /* ENOTCONN is not a symmetric error. We do not
d1681e
+                           know if the operation was performed on the
d1681e
+                           backend or not.
d1681e
+                        */
d1681e
+                        matching_errors = _gf_false;
d1681e
+                        break;
d1681e
+                }
d1681e
+
d1681e
+                if (!op_errno) {
d1681e
+                        op_errno = i_errno;
d1681e
+                } else if (op_errno != i_errno) {
d1681e
+                        /* Mismatching op_errno's */
d1681e
+                        matching_errors = _gf_false;
d1681e
+                        break;
d1681e
+                }
d1681e
+        }
d1681e
+
d1681e
+        return matching_errors;
d1681e
+}
d1681e
+
d1681e
+int
d1681e
+afr_set_in_flight_sb_status (xlator_t *this, call_frame_t *frame,
d1681e
+                             inode_t *inode)
d1681e
+{
d1681e
+        int           ret    = -1;
d1681e
+        afr_private_t *priv  = NULL;
d1681e
+        afr_local_t   *local = NULL;
d1681e
 
d1681e
         priv = this->private;
d1681e
+        local = frame->local;
d1681e
 
d1681e
         /* If this transaction saw no failures, then exit. */
d1681e
         if (AFR_COUNT (local->transaction.failed_subvols,
d1681e
                        priv->child_count) == 0)
d1681e
                 return 0;
d1681e
 
d1681e
+        if (afr_is_symmetric_error (frame, this))
d1681e
+                return 0;
d1681e
+
d1681e
         LOCK (&inode->lock);
d1681e
         {
d1681e
                 ret = __afr_set_in_flight_sb_status (this, local, inode);
d1681e
@@ -548,8 +601,9 @@ afr_inode_get_readable (call_frame_t *frame, inode_t *inode, xlator_t *this,
d1681e
                 }
d1681e
         } else {
d1681e
                 /* For files, abort in case of data/metadata split-brain. */
d1681e
-                if (!data_count || !metadata_count)
d1681e
+                if (!data_count || !metadata_count) {
d1681e
                         return -EIO;
d1681e
+                }
d1681e
         }
d1681e
 
d1681e
         if (type == AFR_METADATA_TRANSACTION && readable)
d1681e
@@ -1958,6 +2012,11 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this)
d1681e
                 GF_FREE (local->cont.opendir.checksum);
d1681e
         }
d1681e
 
d1681e
+        { /* open */
d1681e
+                if (local->cont.open.fd)
d1681e
+                        fd_unref (local->cont.open.fd);
d1681e
+        }
d1681e
+
d1681e
         { /* readdirp */
d1681e
                 if (local->cont.readdir.dict)
d1681e
                         dict_unref (local->cont.readdir.dict);
d1681e
@@ -2535,9 +2594,11 @@ afr_lookup_metadata_heal_check (call_frame_t *frame, xlator_t *this)
d1681e
         if (!afr_can_start_metadata_self_heal (frame, this))
d1681e
                 goto out;
d1681e
 
d1681e
-        heal = afr_frame_create (this);
d1681e
-        if (!heal)
d1681e
+        heal = afr_frame_create (this, &ret);
d1681e
+        if (!heal) {
d1681e
+                ret = -ret;
d1681e
                 goto out;
d1681e
+        }
d1681e
 
d1681e
         ret = synctask_new (this->ctx->env, afr_lookup_sh_metadata_wrap,
d1681e
                             afr_refresh_selfheal_done, heal, frame);
d1681e
@@ -2630,7 +2691,7 @@ afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this)
d1681e
 	}
d1681e
 
d1681e
 	if (need_heal) {
d1681e
-		heal = afr_frame_create (this);
d1681e
+		heal = afr_frame_create (this, NULL);
d1681e
 		if (!heal)
d1681e
                         goto metadata_heal;
d1681e
 
d1681e
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
d1681e
index 6651e92..97397f9 100644
d1681e
--- a/xlators/cluster/afr/src/afr-inode-write.c
d1681e
+++ b/xlators/cluster/afr/src/afr-inode-write.c
d1681e
@@ -131,7 +131,7 @@ __afr_inode_write_finalize (call_frame_t *frame, xlator_t *this)
d1681e
 		}
d1681e
 	}
d1681e
 
d1681e
-        afr_set_in_flight_sb_status (this, local, local->inode);
d1681e
+        afr_set_in_flight_sb_status (this, frame, local->inode);
d1681e
 }
d1681e
 
d1681e
 
d1681e
diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c
d1681e
index 7a62835..6c625cc 100644
d1681e
--- a/xlators/cluster/afr/src/afr-open.c
d1681e
+++ b/xlators/cluster/afr/src/afr-open.c
d1681e
@@ -66,16 +66,15 @@ afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
d1681e
         return 0;
d1681e
 }
d1681e
 
d1681e
-
d1681e
 int
d1681e
 afr_open_cbk (call_frame_t *frame, void *cookie,
d1681e
               xlator_t *this, int32_t op_ret, int32_t op_errno,
d1681e
               fd_t *fd, dict_t *xdata)
d1681e
 {
d1681e
-        afr_local_t *  local       = NULL;
d1681e
-        int            call_count  = -1;
d1681e
-        int            child_index = (long) cookie;
d1681e
-	afr_fd_ctx_t  *fd_ctx = NULL;
d1681e
+        afr_local_t   *local           = NULL;
d1681e
+        int           call_count       = -1;
d1681e
+        int           child_index      = (long) cookie;
d1681e
+        afr_fd_ctx_t  *fd_ctx          = NULL;
d1681e
 
d1681e
         local = frame->local;
d1681e
 	fd_ctx = local->fd_ctx;
d1681e
@@ -103,24 +102,62 @@ afr_open_cbk (call_frame_t *frame, void *cookie,
d1681e
                                     fd, 0, NULL);
d1681e
                 } else {
d1681e
                         AFR_STACK_UNWIND (open, frame, local->op_ret,
d1681e
-                                          local->op_errno, local->fd,
d1681e
-					  local->xdata_rsp);
d1681e
+                                          local->op_errno, local->cont.open.fd,
d1681e
+                                          local->xdata_rsp);
d1681e
                 }
d1681e
         }
d1681e
 
d1681e
         return 0;
d1681e
 }
d1681e
 
d1681e
+
d1681e
+int
d1681e
+afr_open_continue (call_frame_t *frame, xlator_t *this, int err)
d1681e
+{
d1681e
+        afr_local_t   *local     = NULL;
d1681e
+        afr_private_t *priv      = NULL;
d1681e
+        int           call_count = 0;
d1681e
+        int           i          = 0;
d1681e
+
d1681e
+        local  = frame->local;
d1681e
+        priv   = this->private;
d1681e
+
d1681e
+        if (err) {
d1681e
+                AFR_STACK_UNWIND (open, frame, -1, -err, NULL, NULL);
d1681e
+        } else {
d1681e
+                local->call_count = AFR_COUNT (local->child_up,
d1681e
+                                               priv->child_count);
d1681e
+                call_count = local->call_count;
d1681e
+
d1681e
+                for (i = 0; i < priv->child_count; i++) {
d1681e
+                        if (local->child_up[i]) {
d1681e
+                                STACK_WIND_COOKIE (frame, afr_open_cbk,
d1681e
+                                                   (void *)(long)i,
d1681e
+                                                   priv->children[i],
d1681e
+                                                  priv->children[i]->fops->open,
d1681e
+                                                   &local->loc,
d1681e
+                                            (local->cont.open.flags & ~O_TRUNC),
d1681e
+                                                   local->cont.open.fd,
d1681e
+                                                   local->xdata_req);
d1681e
+                                if (!--call_count)
d1681e
+                                        break;
d1681e
+                        }
d1681e
+                }
d1681e
+        }
d1681e
+        return 0;
d1681e
+}
d1681e
+
d1681e
 int
d1681e
 afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
d1681e
           fd_t *fd, dict_t *xdata)
d1681e
 {
d1681e
-        afr_private_t * priv       = NULL;
d1681e
-        afr_local_t *   local      = NULL;
d1681e
-        int             i          = 0;
d1681e
-        int32_t         call_count = 0;
d1681e
-        int32_t         op_errno   = 0;
d1681e
-	afr_fd_ctx_t   *fd_ctx = NULL;
d1681e
+        afr_private_t *priv            = NULL;
d1681e
+        afr_local_t   *local           = NULL;
d1681e
+        int           spb_choice       = 0;
d1681e
+        int           event_generation = 0;
d1681e
+        int           ret              = 0;
d1681e
+        int32_t       op_errno         = 0;
d1681e
+        afr_fd_ctx_t  *fd_ctx          = NULL;
d1681e
 
d1681e
         //We can't let truncation to happen outside transaction.
d1681e
 
d1681e
@@ -140,23 +177,27 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
d1681e
         if (!afr_is_consistent_io_possible (local, priv, &op_errno))
d1681e
 		goto out;
d1681e
 
d1681e
-        local->fd = fd_ref (fd);
d1681e
+        local->inode = inode_ref (loc->inode);
d1681e
+        loc_copy (&local->loc, loc);
d1681e
 	local->fd_ctx = fd_ctx;
d1681e
 	fd_ctx->flags = flags;
d1681e
-
d1681e
-        call_count = local->call_count;
d1681e
+        if (xdata)
d1681e
+                local->xdata_req = dict_ref (xdata);
d1681e
 
d1681e
         local->cont.open.flags = flags;
d1681e
-
d1681e
-        for (i = 0; i < priv->child_count; i++) {
d1681e
-                if (local->child_up[i]) {
d1681e
-                        STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i,
d1681e
-                                           priv->children[i],
d1681e
-                                           priv->children[i]->fops->open,
d1681e
-                                           loc, (flags & ~O_TRUNC), fd, xdata);
d1681e
-                        if (!--call_count)
d1681e
-                                break;
d1681e
-                }
d1681e
+        local->cont.open.fd = fd_ref (fd);
d1681e
+
d1681e
+        ret = afr_inode_get_readable (frame, local->inode, this,
d1681e
+                                      NULL, &event_generation,
d1681e
+                                      AFR_DATA_TRANSACTION);
d1681e
+        if ((ret < 0) &&
d1681e
+            (afr_inode_split_brain_choice_get (local->inode,
d1681e
+                                             this, &spb_choice) == 0) &&
d1681e
+            spb_choice < 0) {
d1681e
+                afr_inode_refresh (frame, this, local->inode,
d1681e
+                                   local->inode->gfid, afr_open_continue);
d1681e
+        } else {
d1681e
+                afr_open_continue (frame, this, 0);
d1681e
         }
d1681e
 
d1681e
 	return 0;
d1681e
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
d1681e
index 20e81dd..26d3860 100644
d1681e
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
d1681e
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
d1681e
@@ -66,9 +66,9 @@ afr_lookup_and_heal_gfid (xlator_t *this, inode_t *parent, const char *name,
d1681e
                 goto out;
d1681e
         }
d1681e
 
d1681e
-        frame = afr_frame_create (this);
d1681e
+        frame = afr_frame_create (this, &ret);
d1681e
         if (!frame) {
d1681e
-                ret = -ENOMEM;
d1681e
+                ret = -ret;
d1681e
                 goto out;
d1681e
         }
d1681e
 
d1681e
@@ -2349,18 +2349,17 @@ afr_inode_find (xlator_t *this, uuid_t gfid)
d1681e
 
d1681e
 
d1681e
 call_frame_t *
d1681e
-afr_frame_create (xlator_t *this)
d1681e
+afr_frame_create (xlator_t *this, int32_t *op_errno)
d1681e
 {
d1681e
 	call_frame_t *frame    = NULL;
d1681e
 	afr_local_t  *local    = NULL;
d1681e
-	int           op_errno = 0;
d1681e
 	pid_t         pid      = GF_CLIENT_PID_SELF_HEALD;
d1681e
 
d1681e
 	frame = create_frame (this, this->ctx->pool);
d1681e
 	if (!frame)
d1681e
 		return NULL;
d1681e
 
d1681e
-	local = AFR_FRAME_INIT (frame, op_errno);
d1681e
+	local = AFR_FRAME_INIT (frame, (*op_errno));
d1681e
 	if (!local) {
d1681e
 		STACK_DESTROY (frame->root);
d1681e
 		return NULL;
d1681e
@@ -2490,7 +2489,7 @@ afr_selfheal (xlator_t *this, uuid_t gfid)
d1681e
 	call_frame_t *frame = NULL;
d1681e
         afr_local_t *local = NULL;
d1681e
 
d1681e
-	frame = afr_frame_create (this);
d1681e
+	frame = afr_frame_create (this, NULL);
d1681e
 	if (!frame)
d1681e
 		return ret;
d1681e
 
d1681e
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
d1681e
index 2c254e8..8cf43f2 100644
d1681e
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
d1681e
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
d1681e
@@ -776,13 +776,37 @@ out:
d1681e
 	return ret;
d1681e
 }
d1681e
 
d1681e
+int
d1681e
+afr_selfheal_data_open_cbk (call_frame_t *frame, void *cookie,
d1681e
+                            xlator_t *this, int32_t op_ret, int32_t op_errno,
d1681e
+                            fd_t *fd, dict_t *xdata)
d1681e
+{
d1681e
+        afr_local_t *local = NULL;
d1681e
+        int         i      = (long) cookie;
d1681e
+
d1681e
+        local = frame->local;
d1681e
+
d1681e
+        local->replies[i].valid = 1;
d1681e
+        local->replies[i].op_ret = op_ret;
d1681e
+        local->replies[i].op_errno = op_errno;
d1681e
+
d1681e
+        syncbarrier_wake (&local->barrier);
d1681e
+
d1681e
+        return 0;
d1681e
+}
d1681e
 
d1681e
 int
d1681e
 afr_selfheal_data_open (xlator_t *this, inode_t *inode, fd_t **fd)
d1681e
 {
d1681e
-	int         ret    = 0;
d1681e
-        fd_t       *fd_tmp = NULL;
d1681e
-	loc_t       loc    = {0,};
d1681e
+        int           ret      = 0;
d1681e
+        fd_t          *fd_tmp  = NULL;
d1681e
+        loc_t         loc      = {0,};
d1681e
+        call_frame_t  *frame   = NULL;
d1681e
+        afr_local_t   *local   = NULL;
d1681e
+        afr_private_t *priv    = NULL;
d1681e
+        int           i        = 0;
d1681e
+
d1681e
+        priv = this->private;
d1681e
 
d1681e
 	fd_tmp = fd_create (inode, 0);
d1681e
 	if (!fd_tmp)
d1681e
@@ -791,7 +815,31 @@ afr_selfheal_data_open (xlator_t *this, inode_t *inode, fd_t **fd)
d1681e
 	loc.inode = inode_ref (inode);
d1681e
 	gf_uuid_copy (loc.gfid, inode->gfid);
d1681e
 
d1681e
-	ret = syncop_open (this, &loc, O_RDWR|O_LARGEFILE, fd_tmp, NULL, NULL);
d1681e
+        frame = afr_frame_create (this, &ret);
d1681e
+        if (!frame) {
d1681e
+                ret = -ret;
d1681e
+                fd_unref (fd_tmp);
d1681e
+                goto out;
d1681e
+        }
d1681e
+        local = frame->local;
d1681e
+
d1681e
+        AFR_ONLIST (local->child_up, frame, afr_selfheal_data_open_cbk, open,
d1681e
+                    &loc, O_RDWR|O_LARGEFILE, fd_tmp, NULL);
d1681e
+
d1681e
+        ret = -ENOTCONN;
d1681e
+        for (i = 0; i < priv->child_count; i++) {
d1681e
+                if (!local->replies[i].valid)
d1681e
+                        continue;
d1681e
+
d1681e
+                if (local->replies[i].op_ret < 0) {
d1681e
+                        ret = -local->replies[i].op_errno;
d1681e
+                        continue;
d1681e
+                }
d1681e
+
d1681e
+                ret = 0;
d1681e
+                break;
d1681e
+        }
d1681e
+
d1681e
 	if (ret < 0) {
d1681e
 		fd_unref (fd_tmp);
d1681e
                 goto out;
d1681e
@@ -802,6 +850,8 @@ afr_selfheal_data_open (xlator_t *this, inode_t *inode, fd_t **fd)
d1681e
         *fd = fd_tmp;
d1681e
 out:
d1681e
         loc_wipe (&loc);
d1681e
+        if (frame)
d1681e
+                AFR_STACK_DESTROY (frame);
d1681e
 	return ret;
d1681e
 }
d1681e
 
d1681e
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
d1681e
index f23cf8e..199f896 100644
d1681e
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
d1681e
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
d1681e
@@ -486,9 +486,9 @@ afr_selfheal_metadata_by_stbuf (xlator_t *this, struct iatt *stbuf)
d1681e
                 goto out;
d1681e
         }
d1681e
 
d1681e
-        frame = afr_frame_create (this);
d1681e
+        frame = afr_frame_create (this, &ret);
d1681e
         if (!frame) {
d1681e
-                ret = -ENOMEM;
d1681e
+                ret = -ret;
d1681e
                 goto out;
d1681e
         }
d1681e
 
d1681e
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
d1681e
index 352d151..556d14b 100644
d1681e
--- a/xlators/cluster/afr/src/afr-self-heal-name.c
d1681e
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
d1681e
@@ -670,7 +670,7 @@ afr_selfheal_name (xlator_t *this, uuid_t pargfid, const char *bname,
d1681e
 	if (!parent)
d1681e
 		goto out;
d1681e
 
d1681e
-	frame = afr_frame_create (this);
d1681e
+	frame = afr_frame_create (this, NULL);
d1681e
 	if (!frame)
d1681e
 		goto out;
d1681e
 
d1681e
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
d1681e
index a1da433..188a334 100644
d1681e
--- a/xlators/cluster/afr/src/afr-self-heal.h
d1681e
+++ b/xlators/cluster/afr/src/afr-self-heal.h
d1681e
@@ -209,7 +209,7 @@ afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode,
d1681e
 		      int subvol, dict_t *xattr, dict_t *xdata);
d1681e
 
d1681e
 call_frame_t *
d1681e
-afr_frame_create (xlator_t *this);
d1681e
+afr_frame_create (xlator_t *this, int32_t *op_errno);
d1681e
 
d1681e
 inode_t *
d1681e
 afr_inode_find (xlator_t *this, uuid_t gfid);
d1681e
diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c
d1681e
index 74c9bb6..19cde88 100644
d1681e
--- a/xlators/cluster/afr/src/afr-self-heald.c
d1681e
+++ b/xlators/cluster/afr/src/afr-self-heald.c
d1681e
@@ -260,7 +260,7 @@ afr_shd_zero_xattrop (xlator_t *this, uuid_t gfid)
d1681e
         int raw[AFR_NUM_CHANGE_LOGS] = {0};
d1681e
 
d1681e
         priv = this->private;
d1681e
-        frame = afr_frame_create (this);
d1681e
+        frame = afr_frame_create (this, NULL);
d1681e
         if (!frame)
d1681e
                 goto out;
d1681e
         inode = afr_inode_find (this, gfid);
d1681e
@@ -457,9 +457,9 @@ afr_shd_index_sweep (struct subvol_healer *healer, char *vgfid)
d1681e
 	priv = healer->this->private;
d1681e
 	subvol = priv->children[healer->subvol];
d1681e
 
d1681e
-        frame = afr_frame_create (healer->this);
d1681e
+        frame = afr_frame_create (healer->this, &ret);
d1681e
         if (!frame) {
d1681e
-                ret = -ENOMEM;
d1681e
+                ret = -ret;
d1681e
                 goto out;
d1681e
         }
d1681e
 
d1681e
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
d1681e
index 91c4f78..a04636f 100644
d1681e
--- a/xlators/cluster/afr/src/afr-transaction.c
d1681e
+++ b/xlators/cluster/afr/src/afr-transaction.c
d1681e
@@ -626,51 +626,10 @@ afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this)
d1681e
         return _gf_true;
d1681e
 }
d1681e
 
d1681e
-
d1681e
 void
d1681e
 afr_handle_symmetric_errors (call_frame_t *frame, xlator_t *this)
d1681e
 {
d1681e
-	afr_local_t *local = NULL;
d1681e
-	afr_private_t *priv = NULL;
d1681e
-	int op_errno = 0;
d1681e
-	int i_errno = 0;
d1681e
-	gf_boolean_t matching_errors = _gf_true;
d1681e
-	int i = 0;
d1681e
-
d1681e
-	priv = this->private;
d1681e
-	local = frame->local;
d1681e
-
d1681e
-	for (i = 0; i < priv->child_count; i++) {
d1681e
-		if (!local->replies[i].valid)
d1681e
-			continue;
d1681e
-		if (local->replies[i].op_ret != -1) {
d1681e
-			/* Operation succeeded on at least on subvol,
d1681e
-			   so it is not a failed-everywhere situation.
d1681e
-			*/
d1681e
-			matching_errors = _gf_false;
d1681e
-			break;
d1681e
-		}
d1681e
-		i_errno = local->replies[i].op_errno;
d1681e
-
d1681e
-		if (i_errno == ENOTCONN) {
d1681e
-			/* ENOTCONN is not a symmetric error. We do not
d1681e
-			   know if the operation was performed on the
d1681e
-			   backend or not.
d1681e
-			*/
d1681e
-			matching_errors = _gf_false;
d1681e
-			break;
d1681e
-		}
d1681e
-
d1681e
-		if (!op_errno) {
d1681e
-			op_errno = i_errno;
d1681e
-		} else if (op_errno != i_errno) {
d1681e
-			/* Mismatching op_errno's */
d1681e
-			matching_errors = _gf_false;
d1681e
-			break;
d1681e
-		}
d1681e
-	}
d1681e
-
d1681e
-	if (matching_errors)
d1681e
+	if (afr_is_symmetric_error (frame, this))
d1681e
 		__mark_all_success (frame, this);
d1681e
 }
d1681e
 
d1681e
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
d1681e
index 672d053..0a06eb6 100644
d1681e
--- a/xlators/cluster/afr/src/afr.h
d1681e
+++ b/xlators/cluster/afr/src/afr.h
d1681e
@@ -519,6 +519,7 @@ typedef struct _afr_local {
d1681e
 
d1681e
                 struct {
d1681e
                         int32_t flags;
d1681e
+                        fd_t *fd;
d1681e
                 } open;
d1681e
 
d1681e
                 struct {
d1681e
@@ -1214,7 +1215,7 @@ int
d1681e
 afr_get_msg_id (char *op_type);
d1681e
 
d1681e
 int
d1681e
-afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local,
d1681e
+afr_set_in_flight_sb_status (xlator_t *this, call_frame_t *frame,
d1681e
                              inode_t *inode);
d1681e
 
d1681e
 int32_t
d1681e
@@ -1272,4 +1273,7 @@ afr_write_subvol_set (call_frame_t *frame, xlator_t *this);
d1681e
 
d1681e
 int
d1681e
 afr_write_subvol_reset (call_frame_t *frame, xlator_t *this);
d1681e
+
d1681e
+gf_boolean_t
d1681e
+afr_is_symmetric_error (call_frame_t *frame, xlator_t *this);
d1681e
 #endif /* __AFR_H__ */
d1681e
-- 
d1681e
1.8.3.1
d1681e