a3470f
From 3a4682ccd935744a0c5346bae23658ff08d65343 Mon Sep 17 00:00:00 2001
a3470f
From: karthik-us <ksubrahm@redhat.com>
a3470f
Date: Mon, 15 Jan 2018 12:48:54 +0530
a3470f
Subject: [PATCH 125/128] cluster/afr: Fixing the flaws in arbiter becoming
a3470f
 source patch
a3470f
a3470f
Problem:
a3470f
Setting the write_subvol value to read_subvol in case of metadata
a3470f
transaction during pre-op (commit 19f9bcff4aada589d4321356c2670ed283f02c03)
a3470f
might lead to the original problem of arbiter becoming source.
a3470f
a3470f
Scenario:
a3470f
1) All bricks are up and good
a3470f
2) 2 writes w1 and w2 are in progress in parallel
a3470f
3) ctx->read_subvol is good for all the subvolumes
a3470f
4) w1 succeeds on brick0 and fails on brick1, yet to do post-op on
a3470f
   the disk
a3470f
5) read/lookup comes on the same file and refreshes read_subvols back
a3470f
   to all good
a3470f
6) metadata transaction happens which makes ctx->write_subvol to be
a3470f
   assigned with ctx->read_subvol which is all good
a3470f
7) w2 succeeds on brick1 and fails on brick0 and this will update the
a3470f
   brick in reverse order leading to arbiter becoming source
a3470f
a3470f
Fix:
a3470f
Instead of setting the ctx->write_subvol to ctx->read_subvol in the
a3470f
pre-op stage, if there is a metadata transaction, check in the
a3470f
function __afr_set_in_flight_sb_status() if it is a data/metadata
a3470f
transaction. Use the value of ctx->write_subvol if it is a data
a3470f
transaction and ctx->read_subvol value for other transactions.
a3470f
a3470f
With this patch we assign the value of ctx->write_subvol in the
a3470f
afr_transaction_perform_fop() with the on disk value, instead of
a3470f
assigning it in the afr_changelog_pre_op() with the in memory value.
a3470f
a3470f
Upstream Patch: https://review.gluster.org/#/c/19045/
a3470f
a3470f
> Change-Id: Id2025a7e965f0578af35b1abaac793b019c43cc4
a3470f
> BUG: 1482064
a3470f
> Signed-off-by: karthik-us <ksubrahm@redhat.com>
a3470f
a3470f
Change-Id: Ie5d6745703fa5024d27e413093f7dfd08992e1df
a3470f
BUG: 1401969
a3470f
Signed-off-by: karthik-us <ksubrahm@redhat.com>
a3470f
Reviewed-on: https://code.engineering.redhat.com/gerrit/127644
a3470f
Tested-by: RHGS Build Bot <nigelb@redhat.com>
a3470f
Reviewed-by: Ravishankar Narayanankutty <ravishankar@redhat.com>
a3470f
Tested-by: Ravishankar Narayanankutty <ravishankar@redhat.com>
a3470f
---
a3470f
 xlators/cluster/afr/src/afr-common.c      | 266 +++++++++++++++++-------------
a3470f
 xlators/cluster/afr/src/afr-dir-write.c   |  16 +-
a3470f
 xlators/cluster/afr/src/afr-inode-write.c |  57 +++++--
a3470f
 xlators/cluster/afr/src/afr-lk-common.c   |  42 +++--
a3470f
 xlators/cluster/afr/src/afr-messages.h    |   9 +-
a3470f
 xlators/cluster/afr/src/afr-transaction.c |  45 ++---
a3470f
 xlators/cluster/afr/src/afr.h             |  22 ++-
a3470f
 7 files changed, 277 insertions(+), 180 deletions(-)
a3470f
a3470f
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
a3470f
index 692f198..6e6f5fa 100644
a3470f
--- a/xlators/cluster/afr/src/afr-common.c
a3470f
+++ b/xlators/cluster/afr/src/afr-common.c
a3470f
@@ -150,6 +150,7 @@ __afr_inode_ctx_get (xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx)
a3470f
                 tmp_ctx->spb_choice = -1;
a3470f
                 tmp_ctx->read_subvol = 0;
a3470f
                 tmp_ctx->write_subvol = 0;
a3470f
+                tmp_ctx->lock_count = 0;
a3470f
         } else {
a3470f
                 tmp_ctx = (afr_inode_ctx_t *) ctx_int;
a3470f
         }
a3470f
@@ -195,7 +196,6 @@ __afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local,
a3470f
                                inode_t *inode)
a3470f
 {
a3470f
         int                 i               = 0;
a3470f
-        int                 ret             = -1;
a3470f
         int                 txn_type        = 0;
a3470f
         int                 count           = 0;
a3470f
         int                 index           = -1;
a3470f
@@ -208,16 +208,14 @@ __afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local,
a3470f
         uint32_t            event           = 0;
a3470f
         uint64_t            val             = 0;
a3470f
         afr_private_t      *priv            = NULL;
a3470f
-        afr_inode_ctx_t    *ctx             = NULL;
a3470f
 
a3470f
         priv = this->private;
a3470f
         txn_type = local->transaction.type;
a3470f
 
a3470f
-        ret = __afr_inode_ctx_get (this, inode, &ctx);
a3470f
-        if (ret < 0)
a3470f
-                return ret;
a3470f
-
a3470f
-        val = ctx->write_subvol;
a3470f
+        if (txn_type == AFR_DATA_TRANSACTION)
a3470f
+                val = local->inode_ctx->write_subvol;
a3470f
+        else
a3470f
+                val = local->inode_ctx->read_subvol;
a3470f
 
a3470f
         metadatamap_old = metadatamap = (val & 0x000000000000ffff);
a3470f
         datamap_old = datamap = (val & 0x00000000ffff0000) >> 16;
a3470f
@@ -278,10 +276,11 @@ __afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local,
a3470f
                 (((uint64_t) datamap) << 16) |
a3470f
                 (((uint64_t) event) << 32);
a3470f
 
a3470f
-        ctx->write_subvol = val;
a3470f
-        ctx->read_subvol = val;
a3470f
+        if (txn_type == AFR_DATA_TRANSACTION)
a3470f
+                local->inode_ctx->write_subvol = val;
a3470f
+        local->inode_ctx->read_subvol = val;
a3470f
 
a3470f
-        return ret;
a3470f
+        return 0;
a3470f
 }
a3470f
 
a3470f
 gf_boolean_t
a3470f
@@ -1001,6 +1000,81 @@ afr_accuse_smallfiles (xlator_t *this, struct afr_reply *replies,
a3470f
 }
a3470f
 
a3470f
 int
a3470f
+afr_readables_fill (call_frame_t *frame, xlator_t *this, inode_t *inode,
a3470f
+                    unsigned char *data_accused,
a3470f
+                    unsigned char *metadata_accused,
a3470f
+                    unsigned char *data_readable,
a3470f
+                    unsigned char *metadata_readable,
a3470f
+                    struct afr_reply *replies)
a3470f
+{
a3470f
+        afr_local_t *local = NULL;
a3470f
+        afr_private_t *priv = NULL;
a3470f
+        dict_t *xdata = NULL;
a3470f
+        int i = 0;
a3470f
+        int ret = 0;
a3470f
+        ia_type_t ia_type = IA_INVAL;
a3470f
+
a3470f
+        local = frame->local;
a3470f
+        priv = this->private;
a3470f
+
a3470f
+        for (i = 0; i < priv->child_count; i++) {
a3470f
+                data_readable[i] = 1;
a3470f
+                metadata_readable[i] = 1;
a3470f
+        }
a3470f
+        if (AFR_IS_ARBITER_BRICK (priv, ARBITER_BRICK_INDEX)) {
a3470f
+                data_readable[ARBITER_BRICK_INDEX] =  0;
a3470f
+                metadata_readable[ARBITER_BRICK_INDEX] = 0;
a3470f
+        }
a3470f
+
a3470f
+        for (i = 0; i < priv->child_count; i++) {
a3470f
+                if (replies) {/* Lookup */
a3470f
+                        if (!replies[i].valid || replies[i].op_ret == -1 ||
a3470f
+                            (replies[i].xdata && dict_get (replies[i].xdata,
a3470f
+                                                        GLUSTERFS_BAD_INODE))) {
a3470f
+                                data_readable[i] = 0;
a3470f
+                                metadata_readable[i] = 0;
a3470f
+                                continue;
a3470f
+                        }
a3470f
+
a3470f
+                        xdata = replies[i].xdata;
a3470f
+                        ia_type = replies[i].poststat.ia_type;
a3470f
+                } else {/* pre-op xattrop */
a3470f
+                        xdata = local->transaction.pre_op_xdata[i];
a3470f
+                        ia_type = inode->ia_type;
a3470f
+                }
a3470f
+
a3470f
+                afr_accused_fill (this, xdata, data_accused,
a3470f
+                                  (ia_type == IA_IFDIR) ?
a3470f
+                                  AFR_ENTRY_TRANSACTION : AFR_DATA_TRANSACTION);
a3470f
+
a3470f
+                afr_accused_fill (this, xdata,
a3470f
+                                  metadata_accused, AFR_METADATA_TRANSACTION);
a3470f
+        }
a3470f
+
a3470f
+        if (replies && ia_type != IA_INVAL && ia_type != IA_IFDIR &&
a3470f
+            /* We want to accuse small files only when we know for
a3470f
+             * sure that there is no IO happening. Otherwise, the
a3470f
+             * ia_sizes obtained in post-refresh replies may
a3470f
+             * mismatch due to a race between inode-refresh and
a3470f
+             * ongoing writes, causing spurious heal launches*/
a3470f
+            !afr_is_possibly_under_txn (AFR_DATA_TRANSACTION, local, this)) {
a3470f
+                afr_accuse_smallfiles (this, replies, data_accused);
a3470f
+        }
a3470f
+
a3470f
+        for (i = 0; i < priv->child_count; i++) {
a3470f
+                if (data_accused[i]) {
a3470f
+                        data_readable[i] = 0;
a3470f
+                        ret = 1;
a3470f
+                }
a3470f
+                if (metadata_accused[i]) {
a3470f
+                        metadata_readable[i] = 0;
a3470f
+                        ret = 1;
a3470f
+                }
a3470f
+        }
a3470f
+        return ret;
a3470f
+}
a3470f
+
a3470f
+int
a3470f
 afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode,
a3470f
                        gf_boolean_t *start_heal)
a3470f
 {
a3470f
@@ -1025,62 +1099,9 @@ afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode,
a3470f
 	metadata_accused = alloca0 (priv->child_count);
a3470f
 	metadata_readable = alloca0 (priv->child_count);
a3470f
 
a3470f
-	for (i = 0; i < priv->child_count; i++) {
a3470f
-		data_readable[i] = 1;
a3470f
-		metadata_readable[i] = 1;
a3470f
-	}
a3470f
-        if (AFR_IS_ARBITER_BRICK (priv, ARBITER_BRICK_INDEX)) {
a3470f
-                data_readable[ARBITER_BRICK_INDEX] =  0;
a3470f
-                metadata_readable[ARBITER_BRICK_INDEX] = 0;
a3470f
-        }
a3470f
-
a3470f
-	for (i = 0; i < priv->child_count; i++) {
a3470f
-		if (!replies[i].valid) {
a3470f
-			data_readable[i] = 0;
a3470f
-			metadata_readable[i] = 0;
a3470f
-			continue;
a3470f
-		}
a3470f
-
a3470f
-		if (replies[i].op_ret == -1) {
a3470f
-			data_readable[i] = 0;
a3470f
-			metadata_readable[i] = 0;
a3470f
-			continue;
a3470f
-		}
a3470f
-
a3470f
-                if (replies[i].xdata &&
a3470f
-                    dict_get (replies[i].xdata, GLUSTERFS_BAD_INODE)) {
a3470f
-			data_readable[i] = 0;
a3470f
-			metadata_readable[i] = 0;
a3470f
-			continue;
a3470f
-                }
a3470f
-
a3470f
-		afr_accused_fill (this, replies[i].xdata, data_accused,
a3470f
-				  (replies[i].poststat.ia_type == IA_IFDIR) ?
a3470f
-				   AFR_ENTRY_TRANSACTION : AFR_DATA_TRANSACTION);
a3470f
-
a3470f
-		afr_accused_fill (this, replies[i].xdata,
a3470f
-				  metadata_accused, AFR_METADATA_TRANSACTION);
a3470f
-
a3470f
-	}
a3470f
-
a3470f
-	if ((inode->ia_type != IA_IFDIR) &&
a3470f
-            /* We want to accuse small files only when we know for sure that
a3470f
-             * there is no IO happening. Otherwise, the ia_sizes obtained in
a3470f
-             * post-refresh replies may  mismatch due to a race between inode-
a3470f
-             * refresh and ongoing writes, causing spurious heal launches*/
a3470f
-            !afr_is_possibly_under_txn (AFR_DATA_TRANSACTION, local, this))
a3470f
-		afr_accuse_smallfiles (this, replies, data_accused);
a3470f
-
a3470f
-	for (i = 0; i < priv->child_count; i++) {
a3470f
-		if (data_accused[i]) {
a3470f
-			data_readable[i] = 0;
a3470f
-			ret = 1;
a3470f
-		}
a3470f
-		if (metadata_accused[i]) {
a3470f
-			metadata_readable[i] = 0;
a3470f
-			ret = 1;
a3470f
-		}
a3470f
-	}
a3470f
+        ret = afr_readables_fill (frame, this, inode, data_accused,
a3470f
+                                  metadata_accused, data_readable,
a3470f
+                                  metadata_readable, replies);
a3470f
 
a3470f
 	for (i = 0; i < priv->child_count; i++) {
a3470f
                 if (start_heal && priv->child_up[i] &&
a3470f
@@ -5510,13 +5531,13 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)
a3470f
         if (!local->transaction.pre_op)
a3470f
                 goto out;
a3470f
 
a3470f
-        if (priv->arbiter_count == 1) {
a3470f
-                local->transaction.pre_op_xdata =
a3470f
-                        GF_CALLOC (sizeof (*local->transaction.pre_op_xdata),
a3470f
-                                   priv->child_count, gf_afr_mt_dict_t);
a3470f
-                if (!local->transaction.pre_op_xdata)
a3470f
-                        goto out;
a3470f
+        local->transaction.pre_op_xdata =
a3470f
+                GF_CALLOC (sizeof (*local->transaction.pre_op_xdata),
a3470f
+                           priv->child_count, gf_afr_mt_dict_t);
a3470f
+        if (!local->transaction.pre_op_xdata)
a3470f
+                goto out;
a3470f
 
a3470f
+        if (priv->arbiter_count == 1) {
a3470f
                 local->transaction.pre_op_sources =
a3470f
                         GF_CALLOC (sizeof (*local->transaction.pre_op_sources),
a3470f
                                    priv->child_count, gf_afr_mt_char);
a3470f
@@ -6489,42 +6510,45 @@ int
a3470f
 afr_write_subvol_set (call_frame_t *frame, xlator_t *this)
a3470f
 {
a3470f
         afr_local_t      *local = NULL;
a3470f
-        afr_inode_ctx_t  *ctx   = NULL;
a3470f
+        afr_private_t    *priv  = NULL;
a3470f
+        unsigned char    *data_accused = NULL;
a3470f
+        unsigned char    *metadata_accused = NULL;
a3470f
+        unsigned char    *data_readable = NULL;
a3470f
+        unsigned char    *metadata_readable = NULL;
a3470f
+        uint16_t          datamap = 0;
a3470f
+        uint16_t          metadatamap = 0;
a3470f
         uint64_t          val   = 0;
a3470f
-        uint64_t          val1  = 0;
a3470f
-        int               ret   = -1;
a3470f
+        int               event = 0;
a3470f
+        int               i     = 0;
a3470f
 
a3470f
         local = frame->local;
a3470f
+        priv = this->private;
a3470f
+        data_accused = alloca0 (priv->child_count);
a3470f
+        metadata_accused = alloca0 (priv->child_count);
a3470f
+        data_readable = alloca0 (priv->child_count);
a3470f
+        metadata_readable = alloca0 (priv->child_count);
a3470f
+        event = local->event_generation;
a3470f
+
a3470f
+        afr_readables_fill (frame, this, local->inode, data_accused,
a3470f
+                            metadata_accused, data_readable, metadata_readable,
a3470f
+                            NULL);
a3470f
+
a3470f
+        for (i = 0; i < priv->child_count; i++) {
a3470f
+                if (data_readable[i])
a3470f
+                        datamap |= (1 << i);
a3470f
+                if (metadata_readable[i])
a3470f
+                        metadatamap |= (1 << i);
a3470f
+        }
a3470f
+
a3470f
+        val = ((uint64_t) metadatamap) |
a3470f
+              (((uint64_t) datamap) << 16) |
a3470f
+              (((uint64_t) event) << 32);
a3470f
+
a3470f
         LOCK(&local->inode->lock);
a3470f
         {
a3470f
-                ret = __afr_inode_ctx_get (this, local->inode, &ctx);
a3470f
-                if (ret < 0) {
a3470f
-                        gf_msg (this->name, GF_LOG_ERROR, 0,
a3470f
-                                AFR_MSG_DICT_GET_FAILED,
a3470f
-                                "ERROR GETTING INODE CTX");
a3470f
-                        UNLOCK(&local->inode->lock);
a3470f
-                        return ret;
a3470f
-                }
a3470f
-
a3470f
-                val = ctx->write_subvol;
a3470f
-                /*
a3470f
-                 * We need to set the value of write_subvol to read_subvol in 2
a3470f
-                 * cases:
a3470f
-                 * 1. Initially when the value is 0. i.e., it's the first lock
a3470f
-                 * request.
a3470f
-                 * 2. If it's a metadata transaction. If metadata transactions
a3470f
-                 * comes in between data transactions and we have a brick
a3470f
-                 * disconnect, the next metadata transaction won't get the
a3470f
-                 * latest value of readables, since we do resetting of
a3470f
-                 * write_subvol in unlock code path only if it's a data
a3470f
-                 * transaction. To handle those scenarios we need to set the
a3470f
-                 * value of write_subvol to read_subvol in case of metadata
a3470f
-                 * transactions.
a3470f
-                */
a3470f
-                if (val == 0 ||
a3470f
-                    local->transaction.type == AFR_METADATA_TRANSACTION) {
a3470f
-                        val1 = ctx->read_subvol;
a3470f
-                        ctx->write_subvol = val1;
a3470f
+                if (local->inode_ctx->write_subvol == 0 &&
a3470f
+                    local->transaction.type == AFR_DATA_TRANSACTION) {
a3470f
+                        local->inode_ctx->write_subvol = val;
a3470f
                 }
a3470f
         }
a3470f
         UNLOCK (&local->inode->lock);
a3470f
@@ -6536,23 +6560,37 @@ int
a3470f
 afr_write_subvol_reset (call_frame_t *frame, xlator_t *this)
a3470f
 {
a3470f
         afr_local_t      *local = NULL;
a3470f
-        afr_inode_ctx_t  *ctx   = NULL;
a3470f
-        int               ret   = -1;
a3470f
 
a3470f
         local = frame->local;
a3470f
         LOCK(&local->inode->lock);
a3470f
         {
a3470f
-                ret = __afr_inode_ctx_get (this, local->inode, &ctx);
a3470f
-                if (ret < 0) {
a3470f
-                        gf_msg (this->name, GF_LOG_ERROR, 0,
a3470f
-                                AFR_MSG_DICT_GET_FAILED,
a3470f
-                                "ERROR GETTING INODE CTX");
a3470f
-                        UNLOCK(&local->inode->lock);
a3470f
-                        return ret;
a3470f
-                }
a3470f
-                ctx->write_subvol = 0;
a3470f
+                local->inode_ctx->lock_count--;
a3470f
+
a3470f
+                if (!local->inode_ctx->lock_count)
a3470f
+                        local->inode_ctx->write_subvol = 0;
a3470f
         }
a3470f
         UNLOCK(&local->inode->lock);
a3470f
 
a3470f
         return 0;
a3470f
 }
a3470f
+
a3470f
+int
a3470f
+afr_set_inode_local (xlator_t *this, afr_local_t *local, inode_t *inode)
a3470f
+{
a3470f
+        int ret = 0;
a3470f
+
a3470f
+        local->inode = inode_ref (inode);
a3470f
+        LOCK(&local->inode->lock);
a3470f
+        {
a3470f
+                ret = __afr_inode_ctx_get (this, local->inode,
a3470f
+                                           &local->inode_ctx);
a3470f
+        }
a3470f
+        UNLOCK (&local->inode->lock);
a3470f
+        if (ret < 0) {
a3470f
+                gf_msg_callingfn (this->name, GF_LOG_ERROR, ENOMEM,
a3470f
+                                  AFR_MSG_INODE_CTX_GET_FAILED,
a3470f
+                                  "Error getting inode ctx %s",
a3470f
+                                  uuid_utoa (local->inode->gfid));
a3470f
+        }
a3470f
+        return ret;
a3470f
+}
a3470f
diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c
a3470f
index 9099b8c..e088ed6 100644
a3470f
--- a/xlators/cluster/afr/src/afr-dir-write.c
a3470f
+++ b/xlators/cluster/afr/src/afr-dir-write.c
a3470f
@@ -477,7 +477,7 @@ afr_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
a3470f
 	if (!local->fd_ctx)
a3470f
 		goto out;
a3470f
 
a3470f
-	local->inode = inode_ref (loc->inode);
a3470f
+        local->inode = inode_ref (loc->inode);
a3470f
 	local->parent = inode_ref (loc->parent);
a3470f
 
a3470f
         local->op                = GF_FOP_CREATE;
a3470f
@@ -609,7 +609,7 @@ afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
a3470f
 		goto out;
a3470f
 
a3470f
         loc_copy (&local->loc, loc);
a3470f
-	local->inode = inode_ref (loc->inode);
a3470f
+        local->inode = inode_ref (loc->inode);
a3470f
 	local->parent = inode_ref (loc->parent);
a3470f
 
a3470f
         local->op               = GF_FOP_MKNOD;
a3470f
@@ -740,7 +740,7 @@ afr_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
a3470f
 		goto out;
a3470f
 
a3470f
         loc_copy (&local->loc, loc);
a3470f
-	local->inode = inode_ref (loc->inode);
a3470f
+        local->inode = inode_ref (loc->inode);
a3470f
 	local->parent = inode_ref (loc->parent);
a3470f
 
a3470f
         local->cont.mkdir.mode  = mode;
a3470f
@@ -877,7 +877,7 @@ afr_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
a3470f
         loc_copy (&local->loc,    oldloc);
a3470f
         loc_copy (&local->newloc, newloc);
a3470f
 
a3470f
-	local->inode = inode_ref (oldloc->inode);
a3470f
+        local->inode = inode_ref (oldloc->inode);
a3470f
 	local->parent = inode_ref (newloc->parent);
a3470f
 
a3470f
         if (xdata)
a3470f
@@ -1005,7 +1005,7 @@ afr_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath,
a3470f
 		goto out;
a3470f
 
a3470f
         loc_copy (&local->loc, loc);
a3470f
-	local->inode = inode_ref (loc->inode);
a3470f
+        local->inode = inode_ref (loc->inode);
a3470f
 	local->parent = inode_ref (loc->parent);
a3470f
 
a3470f
         local->cont.symlink.linkpath = gf_strdup (linkpath);
a3470f
@@ -1142,7 +1142,7 @@ afr_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
a3470f
         loc_copy (&local->loc,    oldloc);
a3470f
         loc_copy (&local->newloc, newloc);
a3470f
 
a3470f
-	local->inode = inode_ref (oldloc->inode);
a3470f
+        local->inode = inode_ref (oldloc->inode);
a3470f
 	local->parent = inode_ref (oldloc->parent);
a3470f
 	local->parent2 = inode_ref (newloc->parent);
a3470f
 
a3470f
@@ -1295,7 +1295,7 @@ afr_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
a3470f
         loc_copy (&local->loc, loc);
a3470f
         local->xflag = xflag;
a3470f
 
a3470f
-	local->inode = inode_ref (loc->inode);
a3470f
+        local->inode = inode_ref (loc->inode);
a3470f
 	local->parent = inode_ref (loc->parent);
a3470f
 
a3470f
         if (xdata)
a3470f
@@ -1421,7 +1421,7 @@ afr_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
a3470f
 
a3470f
 
a3470f
         loc_copy (&local->loc, loc);
a3470f
-	local->inode = inode_ref (loc->inode);
a3470f
+        local->inode = inode_ref (loc->inode);
a3470f
 	local->parent = inode_ref (loc->parent);
a3470f
 
a3470f
         local->cont.rmdir.flags = flags;
a3470f
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
a3470f
index 97397f9..f0231b7 100644
a3470f
--- a/xlators/cluster/afr/src/afr-inode-write.c
a3470f
+++ b/xlators/cluster/afr/src/afr-inode-write.c
a3470f
@@ -507,6 +507,7 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
a3470f
 {
a3470f
         afr_local_t *local = NULL;
a3470f
         int op_errno = ENOMEM;
a3470f
+        int ret = -1;
a3470f
 
a3470f
 	local = AFR_FRAME_INIT (frame, op_errno);
a3470f
 	if (!local)
a3470f
@@ -529,7 +530,9 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
a3470f
 		goto out;
a3470f
 
a3470f
         local->fd = fd_ref (fd);
a3470f
-	local->inode = inode_ref (fd->inode);
a3470f
+        ret = afr_set_inode_local (this, local, fd->inode);
a3470f
+        if (ret)
a3470f
+                goto out;
a3470f
 
a3470f
 	if (dict_set_uint32 (local->xdata_req, GLUSTERFS_OPEN_FD_COUNT, 4)) {
a3470f
 		op_errno = ENOMEM;
a3470f
@@ -654,7 +657,9 @@ afr_truncate (call_frame_t *frame, xlator_t *this,
a3470f
         local->transaction.unwind = afr_truncate_unwind;
a3470f
 
a3470f
         loc_copy (&local->loc, loc);
a3470f
-	local->inode = inode_ref (loc->inode);
a3470f
+        ret = afr_set_inode_local (this, local, loc->inode);
a3470f
+        if (ret)
a3470f
+                goto out;
a3470f
 
a3470f
         local->op = GF_FOP_TRUNCATE;
a3470f
 
a3470f
@@ -768,7 +773,9 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
a3470f
 		goto out;
a3470f
 
a3470f
         local->fd = fd_ref (fd);
a3470f
-	local->inode = inode_ref (fd->inode);
a3470f
+        ret = afr_set_inode_local (this, local, fd->inode);
a3470f
+        if (ret)
a3470f
+                goto out;
a3470f
 
a3470f
         local->op = GF_FOP_FTRUNCATE;
a3470f
 
a3470f
@@ -886,7 +893,9 @@ afr_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf,
a3470f
         local->transaction.unwind = afr_setattr_unwind;
a3470f
 
a3470f
         loc_copy (&local->loc, loc);
a3470f
-	local->inode = inode_ref (loc->inode);
a3470f
+        ret = afr_set_inode_local (this, local, loc->inode);
a3470f
+        if (ret)
a3470f
+                goto out;
a3470f
 
a3470f
 	local->op = GF_FOP_SETATTR;
a3470f
 
a3470f
@@ -991,7 +1000,9 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this,
a3470f
         local->transaction.unwind = afr_fsetattr_unwind;
a3470f
 
a3470f
         local->fd                 = fd_ref (fd);
a3470f
-	local->inode = inode_ref (fd->inode);
a3470f
+        ret = afr_set_inode_local (this, local, fd->inode);
a3470f
+        if (ret)
a3470f
+                goto out;
a3470f
 
a3470f
 	local->op = GF_FOP_FSETATTR;
a3470f
 
a3470f
@@ -1633,7 +1644,9 @@ afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
a3470f
         local->transaction.unwind = afr_setxattr_unwind;
a3470f
 
a3470f
         loc_copy (&local->loc, loc);
a3470f
-	local->inode = inode_ref (loc->inode);
a3470f
+        ret = afr_set_inode_local (this, local, loc->inode);
a3470f
+        if (ret)
a3470f
+                goto out;
a3470f
 
a3470f
         local->transaction.main_frame = frame;
a3470f
         local->transaction.start   = LLONG_MAX - 1;
a3470f
@@ -1745,7 +1758,9 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this,
a3470f
         local->transaction.unwind = afr_fsetxattr_unwind;
a3470f
 
a3470f
         local->fd                 = fd_ref (fd);
a3470f
-	local->inode = inode_ref (fd->inode);
a3470f
+        ret = afr_set_inode_local (this, local, fd->inode);
a3470f
+        if (ret)
a3470f
+                goto out;
a3470f
 
a3470f
 	local->op = GF_FOP_FSETXATTR;
a3470f
 
a3470f
@@ -1858,7 +1873,9 @@ afr_removexattr (call_frame_t *frame, xlator_t *this,
a3470f
         local->transaction.unwind = afr_removexattr_unwind;
a3470f
 
a3470f
         loc_copy (&local->loc, loc);
a3470f
-	local->inode = inode_ref (loc->inode);
a3470f
+        ret = afr_set_inode_local (this, local, loc->inode);
a3470f
+        if (ret)
a3470f
+                goto out;
a3470f
 
a3470f
 	local->op = GF_FOP_REMOVEXATTR;
a3470f
 
a3470f
@@ -1965,7 +1982,9 @@ afr_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
a3470f
         local->transaction.unwind = afr_fremovexattr_unwind;
a3470f
 
a3470f
         local->fd = fd_ref (fd);
a3470f
-	local->inode = inode_ref (fd->inode);
a3470f
+        ret = afr_set_inode_local (this, local, fd->inode);
a3470f
+        if (ret)
a3470f
+                goto out;
a3470f
 
a3470f
 	local->op = GF_FOP_FREMOVEXATTR;
a3470f
 
a3470f
@@ -2060,7 +2079,9 @@ afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
a3470f
         local->cont.fallocate.len = len;
a3470f
 
a3470f
         local->fd = fd_ref (fd);
a3470f
-	local->inode = inode_ref (fd->inode);
a3470f
+        ret = afr_set_inode_local (this, local, fd->inode);
a3470f
+        if (ret)
a3470f
+                goto out;
a3470f
 
a3470f
 	if (xdata)
a3470f
 		local->xdata_req = dict_copy_with_ref (xdata, NULL);
a3470f
@@ -2172,7 +2193,9 @@ afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
a3470f
         local->cont.discard.len = len;
a3470f
 
a3470f
         local->fd = fd_ref (fd);
a3470f
-	local->inode = inode_ref (fd->inode);
a3470f
+        ret = afr_set_inode_local (this, local, fd->inode);
a3470f
+        if (ret)
a3470f
+                goto out;
a3470f
 
a3470f
 	if (xdata)
a3470f
 		local->xdata_req = dict_copy_with_ref (xdata, NULL);
a3470f
@@ -2281,7 +2304,9 @@ afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
a3470f
         local->cont.zerofill.len = len;
a3470f
 
a3470f
         local->fd = fd_ref (fd);
a3470f
-	local->inode = inode_ref (fd->inode);
a3470f
+        ret = afr_set_inode_local (this, local, fd->inode);
a3470f
+        if (ret)
a3470f
+                goto out;
a3470f
 
a3470f
 	if (xdata)
a3470f
 		local->xdata_req = dict_copy_with_ref (xdata, NULL);
a3470f
@@ -2393,7 +2418,9 @@ afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
a3470f
         local->transaction.unwind = afr_xattrop_unwind;
a3470f
 
a3470f
         loc_copy (&local->loc, loc);
a3470f
-	local->inode = inode_ref (loc->inode);
a3470f
+        ret = afr_set_inode_local (this, local, loc->inode);
a3470f
+        if (ret)
a3470f
+                goto out;
a3470f
 
a3470f
 	local->op = GF_FOP_XATTROP;
a3470f
 
a3470f
@@ -2487,7 +2514,9 @@ afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
a3470f
         local->transaction.unwind = afr_fxattrop_unwind;
a3470f
 
a3470f
 	local->fd = fd_ref (fd);
a3470f
-	local->inode = inode_ref (fd->inode);
a3470f
+        ret = afr_set_inode_local (this, local, fd->inode);
a3470f
+        if (ret)
a3470f
+                goto out;
a3470f
 
a3470f
 	local->op = GF_FOP_FXATTROP;
a3470f
 
a3470f
diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c
a3470f
index c17f60f..f50c7b6 100644
a3470f
--- a/xlators/cluster/afr/src/afr-lk-common.c
a3470f
+++ b/xlators/cluster/afr/src/afr-lk-common.c
a3470f
@@ -615,14 +615,14 @@ afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
a3470f
 {
a3470f
         afr_local_t             *local          = NULL;
a3470f
         afr_internal_lock_t     *int_lock       = NULL;
a3470f
-        afr_fd_ctx_t            *fd_ctx         = NULL;
a3470f
-        afr_private_t           *priv           = NULL;
a3470f
         int                      call_count     = 0;
a3470f
         int                      ret            = 0;
a3470f
 
a3470f
         local    = frame->local;
a3470f
         int_lock = &local->internal_lock;
a3470f
-        priv = this->private;
a3470f
+
a3470f
+        if (local->transaction.type == AFR_DATA_TRANSACTION && op_ret != 1)
a3470f
+                ret = afr_write_subvol_reset (frame, this);
a3470f
 
a3470f
         LOCK (&frame->lock);
a3470f
         {
a3470f
@@ -633,11 +633,6 @@ afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
a3470f
         if (call_count == 0) {
a3470f
                 gf_msg_trace (this->name, 0,
a3470f
                               "All internal locks unlocked");
a3470f
-                if (local->fd) {
a3470f
-                        fd_ctx = afr_fd_ctx_get (local->fd, this);
a3470f
-                        if (0 == AFR_COUNT (fd_ctx->lock_acquired, priv->child_count))
a3470f
-                                ret = afr_write_subvol_reset (frame, this);
a3470f
-                }
a3470f
                 int_lock->lock_cbk (frame, this);
a3470f
         }
a3470f
 
a3470f
@@ -947,6 +942,15 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
a3470f
                         } else {
a3470f
                                 int_lock->locked_nodes[child_index] |= LOCKED_YES;
a3470f
                                 int_lock->lock_count++;
a3470f
+
a3470f
+                                if (local->transaction.type ==
a3470f
+                                    AFR_DATA_TRANSACTION) {
a3470f
+                                        LOCK(&local->inode->lock);
a3470f
+                                        {
a3470f
+                                                local->inode_ctx->lock_count++;
a3470f
+                                        }
a3470f
+                                        UNLOCK (&local->inode->lock);
a3470f
+                                }
a3470f
                         }
a3470f
                 }
a3470f
                 afr_lock_blocking (frame, this, cky + 1);
a3470f
@@ -1502,13 +1506,12 @@ int32_t
a3470f
 afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
a3470f
                              int32_t op_ret, int32_t op_errno, dict_t *xdata)
a3470f
 {
a3470f
-        afr_internal_lock_t *int_lock = NULL;
a3470f
-        afr_inodelk_t       *inodelk  = NULL;
a3470f
-        afr_local_t         *local    = NULL;
a3470f
-        int call_count  = 0;
a3470f
-        int child_index = (long) cookie;
a3470f
-        afr_fd_ctx_t        *fd_ctx = NULL;
a3470f
-
a3470f
+        afr_internal_lock_t *int_lock    = NULL;
a3470f
+        afr_inodelk_t       *inodelk     = NULL;
a3470f
+        afr_local_t         *local       = NULL;
a3470f
+        afr_fd_ctx_t        *fd_ctx      = NULL;
a3470f
+        int                  call_count  = 0;
a3470f
+        int                  child_index = (long) cookie;
a3470f
 
a3470f
         local    = frame->local;
a3470f
         int_lock = &local->internal_lock;
a3470f
@@ -1553,6 +1556,15 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
a3470f
                                         fd_ctx->lock_acquired[child_index]++;
a3470f
 				}
a3470f
 			}
a3470f
+
a3470f
+                        if (local->transaction.type == AFR_DATA_TRANSACTION &&
a3470f
+                            op_ret == 0) {
a3470f
+                                LOCK(&local->inode->lock);
a3470f
+                                {
a3470f
+                                        local->inode_ctx->lock_count++;
a3470f
+                                }
a3470f
+                                UNLOCK (&local->inode->lock);
a3470f
+                        }
a3470f
 		}
a3470f
 
a3470f
                 call_count = --int_lock->lk_call_count;
a3470f
diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h
a3470f
index 02eb206..53ffcd8 100644
a3470f
--- a/xlators/cluster/afr/src/afr-messages.h
a3470f
+++ b/xlators/cluster/afr/src/afr-messages.h
a3470f
@@ -40,7 +40,7 @@
a3470f
  */
a3470f
 
a3470f
 #define GLFS_COMP_BASE_AFR      GLFS_MSGID_COMP_AFR
a3470f
-#define GLFS_NUM_MESSAGES       42
a3470f
+#define GLFS_NUM_MESSAGES       43
a3470f
 #define GLFS_MSGID_END          (GLFS_COMP_BASE_AFR + GLFS_NUM_MESSAGES + 1)
a3470f
 
a3470f
 #define glfs_msg_start_x GLFS_COMP_BASE_AFR, "Invalid: Start of messages"
a3470f
@@ -369,5 +369,12 @@
a3470f
 */
a3470f
 #define AFR_MSG_SBRAIN_FAV_CHILD_POLICY  (GLFS_COMP_BASE_AFR + 42)
a3470f
 
a3470f
+/*!
a3470f
+ * @messageid 108043
a3470f
+ * @diagnosis
a3470f
+ * @recommendation
a3470f
+*/
a3470f
+#define AFR_MSG_INODE_CTX_GET_FAILED (GLFS_COMP_BASE_AFR + 43)
a3470f
+
a3470f
 #define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages"
a3470f
 #endif /* !_AFR_MESSAGES_H_ */
a3470f
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
a3470f
index a04636f..7e40bba 100644
a3470f
--- a/xlators/cluster/afr/src/afr-transaction.c
a3470f
+++ b/xlators/cluster/afr/src/afr-transaction.c
a3470f
@@ -372,14 +372,27 @@ afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this)
a3470f
 int
a3470f
 afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
a3470f
 {
a3470f
-        afr_local_t     *local = NULL;
a3470f
-        afr_private_t   *priv = NULL;
a3470f
-        fd_t            *fd   = NULL;
a3470f
+        afr_local_t   *local = NULL;
a3470f
+        afr_private_t *priv  = NULL;
a3470f
+        fd_t          *fd    = NULL;
a3470f
+        int           i      = 0;
a3470f
+        int           ret    = 0;
a3470f
 
a3470f
         local = frame->local;
a3470f
         priv = this->private;
a3470f
         fd    = local->fd;
a3470f
 
a3470f
+        if (local->transaction.type == AFR_DATA_TRANSACTION &&
a3470f
+            !local->transaction.inherited) {
a3470f
+                ret = afr_write_subvol_set (frame, this);
a3470f
+                if (ret) {
a3470f
+                        /*act as if operation failed on all subvols*/
a3470f
+                        local->op_ret = -1;
a3470f
+                        local->op_errno = -ret;
a3470f
+                        for (i = 0; i < priv->child_count; i++)
a3470f
+                                local->transaction.failed_subvols[i] = 1;
a3470f
+                }
a3470f
+        }
a3470f
         /*  Perform fops with the lk-owner from top xlator.
a3470f
          *  Eg: lk-owner of posix-lk and flush should be same,
a3470f
          *  flush cant clear the  posix-lks without that lk-owner.
a3470f
@@ -1116,32 +1129,28 @@ unlock:
a3470f
 
a3470f
 int
a3470f
 afr_changelog_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
a3470f
-		   int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
a3470f
+                   int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
a3470f
 {
a3470f
         afr_local_t *local = NULL;
a3470f
-        afr_private_t *priv = NULL;
a3470f
         int call_count = -1;
a3470f
         int child_index = -1;
a3470f
 
a3470f
         local = frame->local;
a3470f
-        priv = this->private;
a3470f
         child_index = (long) cookie;
a3470f
 
a3470f
-	if (op_ret == -1) {
a3470f
+        if (op_ret == -1) {
a3470f
                 local->op_errno = op_errno;
a3470f
-		afr_transaction_fop_failed (frame, this, child_index);
a3470f
+                afr_transaction_fop_failed (frame, this, child_index);
a3470f
         }
a3470f
 
a3470f
-        if (priv->arbiter_count == 1 && !op_ret) {
a3470f
-                if (xattr)
a3470f
-                        local->transaction.pre_op_xdata[child_index] =
a3470f
-                                                               dict_ref (xattr);
a3470f
-        }
a3470f
+        if (xattr)
a3470f
+                local->transaction.pre_op_xdata[child_index] = dict_ref (xattr);
a3470f
 
a3470f
-	call_count = afr_frame_return (frame);
a3470f
+        call_count = afr_frame_return (frame);
a3470f
 
a3470f
-        if (call_count == 0)
a3470f
-		local->transaction.changelog_resume (frame, this);
a3470f
+        if (call_count == 0) {
a3470f
+                local->transaction.changelog_resume (frame, this);
a3470f
+        }
a3470f
 
a3470f
         return 0;
a3470f
 }
a3470f
@@ -1750,10 +1759,6 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
a3470f
 	if (pre_nop)
a3470f
 		goto next;
a3470f
 
a3470f
-        ret = afr_write_subvol_set (frame, this);
a3470f
-        if (ret)
a3470f
-                goto err;
a3470f
-
a3470f
 	if (!local->pre_op_compat) {
a3470f
 		dict_copy (xdata_req, local->xdata_req);
a3470f
 		goto next;
a3470f
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
a3470f
index 0a06eb6..96fefb1 100644
a3470f
--- a/xlators/cluster/afr/src/afr.h
a3470f
+++ b/xlators/cluster/afr/src/afr.h
a3470f
@@ -377,6 +377,16 @@ typedef enum {
a3470f
         AFR_FOP_LOCK_QUORUM_FAILED,
a3470f
 } afr_fop_lock_state_t;
a3470f
 
a3470f
+typedef struct _afr_inode_ctx {
a3470f
+        uint64_t        read_subvol;
a3470f
+        uint64_t        write_subvol;
a3470f
+        int             lock_count;
a3470f
+        int             spb_choice;
a3470f
+        gf_timer_t      *timer;
a3470f
+        gf_boolean_t    need_refresh;
a3470f
+} afr_inode_ctx_t;
a3470f
+
a3470f
+
a3470f
 typedef struct _afr_local {
a3470f
 	glusterfs_fop_t  op;
a3470f
         unsigned int call_count;
a3470f
@@ -833,17 +843,10 @@ typedef struct _afr_local {
a3470f
         compound_args_t *c_args;
a3470f
 
a3470f
         gf_boolean_t is_read_txn;
a3470f
+        afr_inode_ctx_t *inode_ctx;
a3470f
 } afr_local_t;
a3470f
 
a3470f
 
a3470f
-typedef struct _afr_inode_ctx {
a3470f
-        uint64_t        read_subvol;
a3470f
-        uint64_t        write_subvol;
a3470f
-        int             spb_choice;
a3470f
-        gf_timer_t      *timer;
a3470f
-        gf_boolean_t    need_refresh;
a3470f
-} afr_inode_ctx_t;
a3470f
-
a3470f
 typedef struct afr_spbc_timeout {
a3470f
         call_frame_t *frame;
a3470f
         gf_boolean_t d_spb;
a3470f
@@ -1274,6 +1277,9 @@ afr_write_subvol_set (call_frame_t *frame, xlator_t *this);
a3470f
 int
a3470f
 afr_write_subvol_reset (call_frame_t *frame, xlator_t *this);
a3470f
 
a3470f
+int
a3470f
+afr_set_inode_local (xlator_t *this, afr_local_t *local, inode_t *inode);
a3470f
+
a3470f
 gf_boolean_t
a3470f
 afr_is_symmetric_error (call_frame_t *frame, xlator_t *this);
a3470f
 #endif /* __AFR_H__ */
a3470f
-- 
a3470f
1.8.3.1
a3470f