e7a346
From 3a4682ccd935744a0c5346bae23658ff08d65343 Mon Sep 17 00:00:00 2001
e7a346
From: karthik-us <ksubrahm@redhat.com>
e7a346
Date: Mon, 15 Jan 2018 12:48:54 +0530
e7a346
Subject: [PATCH 125/128] cluster/afr: Fixing the flaws in arbiter becoming
e7a346
 source patch
e7a346
e7a346
Problem:
e7a346
Setting the write_subvol value to read_subvol in case of metadata
e7a346
transaction during pre-op (commit 19f9bcff4aada589d4321356c2670ed283f02c03)
e7a346
might lead to the original problem of arbiter becoming source.
e7a346
e7a346
Scenario:
e7a346
1) All bricks are up and good
e7a346
2) 2 writes w1 and w2 are in progress in parallel
e7a346
3) ctx->read_subvol is good for all the subvolumes
e7a346
4) w1 succeeds on brick0 and fails on brick1, yet to do post-op on
e7a346
   the disk
e7a346
5) read/lookup comes on the same file and refreshes read_subvols back
e7a346
   to all good
e7a346
6) metadata transaction happens which makes ctx->write_subvol to be
e7a346
   assigned with ctx->read_subvol which is all good
e7a346
7) w2 succeeds on brick1 and fails on brick0 and this will update the
e7a346
   brick in reverse order leading to arbiter becoming source
e7a346
e7a346
Fix:
e7a346
Instead of setting the ctx->write_subvol to ctx->read_subvol in the
e7a346
pre-op stage, if there is a metadata transaction, check in the
e7a346
function __afr_set_in_flight_sb_status() if it is a data/metadata
e7a346
transaction. Use the value of ctx->write_subvol if it is a data
e7a346
transaction and ctx->read_subvol value for other transactions.
e7a346
e7a346
With this patch we assign the value of ctx->write_subvol in the
e7a346
afr_transaction_perform_fop() with the on disk value, instead of
e7a346
assigning it in the afr_changelog_pre_op() with the in memory value.
e7a346
e7a346
Upstream Patch: https://review.gluster.org/#/c/19045/
e7a346
e7a346
> Change-Id: Id2025a7e965f0578af35b1abaac793b019c43cc4
e7a346
> BUG: 1482064
e7a346
> Signed-off-by: karthik-us <ksubrahm@redhat.com>
e7a346
e7a346
Change-Id: Ie5d6745703fa5024d27e413093f7dfd08992e1df
e7a346
BUG: 1401969
e7a346
Signed-off-by: karthik-us <ksubrahm@redhat.com>
e7a346
Reviewed-on: https://code.engineering.redhat.com/gerrit/127644
e7a346
Tested-by: RHGS Build Bot <nigelb@redhat.com>
e7a346
Reviewed-by: Ravishankar Narayanankutty <ravishankar@redhat.com>
e7a346
Tested-by: Ravishankar Narayanankutty <ravishankar@redhat.com>
e7a346
---
e7a346
 xlators/cluster/afr/src/afr-common.c      | 266 +++++++++++++++++-------------
e7a346
 xlators/cluster/afr/src/afr-dir-write.c   |  16 +-
e7a346
 xlators/cluster/afr/src/afr-inode-write.c |  57 +++++--
e7a346
 xlators/cluster/afr/src/afr-lk-common.c   |  42 +++--
e7a346
 xlators/cluster/afr/src/afr-messages.h    |   9 +-
e7a346
 xlators/cluster/afr/src/afr-transaction.c |  45 ++---
e7a346
 xlators/cluster/afr/src/afr.h             |  22 ++-
e7a346
 7 files changed, 277 insertions(+), 180 deletions(-)
e7a346
e7a346
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
e7a346
index 692f198..6e6f5fa 100644
e7a346
--- a/xlators/cluster/afr/src/afr-common.c
e7a346
+++ b/xlators/cluster/afr/src/afr-common.c
e7a346
@@ -150,6 +150,7 @@ __afr_inode_ctx_get (xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx)
e7a346
                 tmp_ctx->spb_choice = -1;
e7a346
                 tmp_ctx->read_subvol = 0;
e7a346
                 tmp_ctx->write_subvol = 0;
e7a346
+                tmp_ctx->lock_count = 0;
e7a346
         } else {
e7a346
                 tmp_ctx = (afr_inode_ctx_t *) ctx_int;
e7a346
         }
e7a346
@@ -195,7 +196,6 @@ __afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local,
e7a346
                                inode_t *inode)
e7a346
 {
e7a346
         int                 i               = 0;
e7a346
-        int                 ret             = -1;
e7a346
         int                 txn_type        = 0;
e7a346
         int                 count           = 0;
e7a346
         int                 index           = -1;
e7a346
@@ -208,16 +208,14 @@ __afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local,
e7a346
         uint32_t            event           = 0;
e7a346
         uint64_t            val             = 0;
e7a346
         afr_private_t      *priv            = NULL;
e7a346
-        afr_inode_ctx_t    *ctx             = NULL;
e7a346
 
e7a346
         priv = this->private;
e7a346
         txn_type = local->transaction.type;
e7a346
 
e7a346
-        ret = __afr_inode_ctx_get (this, inode, &ctx);
e7a346
-        if (ret < 0)
e7a346
-                return ret;
e7a346
-
e7a346
-        val = ctx->write_subvol;
e7a346
+        if (txn_type == AFR_DATA_TRANSACTION)
e7a346
+                val = local->inode_ctx->write_subvol;
e7a346
+        else
e7a346
+                val = local->inode_ctx->read_subvol;
e7a346
 
e7a346
         metadatamap_old = metadatamap = (val & 0x000000000000ffff);
e7a346
         datamap_old = datamap = (val & 0x00000000ffff0000) >> 16;
e7a346
@@ -278,10 +276,11 @@ __afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local,
e7a346
                 (((uint64_t) datamap) << 16) |
e7a346
                 (((uint64_t) event) << 32);
e7a346
 
e7a346
-        ctx->write_subvol = val;
e7a346
-        ctx->read_subvol = val;
e7a346
+        if (txn_type == AFR_DATA_TRANSACTION)
e7a346
+                local->inode_ctx->write_subvol = val;
e7a346
+        local->inode_ctx->read_subvol = val;
e7a346
 
e7a346
-        return ret;
e7a346
+        return 0;
e7a346
 }
e7a346
 
e7a346
 gf_boolean_t
e7a346
@@ -1001,6 +1000,81 @@ afr_accuse_smallfiles (xlator_t *this, struct afr_reply *replies,
e7a346
 }
e7a346
 
e7a346
 int
e7a346
+afr_readables_fill (call_frame_t *frame, xlator_t *this, inode_t *inode,
e7a346
+                    unsigned char *data_accused,
e7a346
+                    unsigned char *metadata_accused,
e7a346
+                    unsigned char *data_readable,
e7a346
+                    unsigned char *metadata_readable,
e7a346
+                    struct afr_reply *replies)
e7a346
+{
e7a346
+        afr_local_t *local = NULL;
e7a346
+        afr_private_t *priv = NULL;
e7a346
+        dict_t *xdata = NULL;
e7a346
+        int i = 0;
e7a346
+        int ret = 0;
e7a346
+        ia_type_t ia_type = IA_INVAL;
e7a346
+
e7a346
+        local = frame->local;
e7a346
+        priv = this->private;
e7a346
+
e7a346
+        for (i = 0; i < priv->child_count; i++) {
e7a346
+                data_readable[i] = 1;
e7a346
+                metadata_readable[i] = 1;
e7a346
+        }
e7a346
+        if (AFR_IS_ARBITER_BRICK (priv, ARBITER_BRICK_INDEX)) {
e7a346
+                data_readable[ARBITER_BRICK_INDEX] =  0;
e7a346
+                metadata_readable[ARBITER_BRICK_INDEX] = 0;
e7a346
+        }
e7a346
+
e7a346
+        for (i = 0; i < priv->child_count; i++) {
e7a346
+                if (replies) {/* Lookup */
e7a346
+                        if (!replies[i].valid || replies[i].op_ret == -1 ||
e7a346
+                            (replies[i].xdata && dict_get (replies[i].xdata,
e7a346
+                                                        GLUSTERFS_BAD_INODE))) {
e7a346
+                                data_readable[i] = 0;
e7a346
+                                metadata_readable[i] = 0;
e7a346
+                                continue;
e7a346
+                        }
e7a346
+
e7a346
+                        xdata = replies[i].xdata;
e7a346
+                        ia_type = replies[i].poststat.ia_type;
e7a346
+                } else {/* pre-op xattrop */
e7a346
+                        xdata = local->transaction.pre_op_xdata[i];
e7a346
+                        ia_type = inode->ia_type;
e7a346
+                }
e7a346
+
e7a346
+                afr_accused_fill (this, xdata, data_accused,
e7a346
+                                  (ia_type == IA_IFDIR) ?
e7a346
+                                  AFR_ENTRY_TRANSACTION : AFR_DATA_TRANSACTION);
e7a346
+
e7a346
+                afr_accused_fill (this, xdata,
e7a346
+                                  metadata_accused, AFR_METADATA_TRANSACTION);
e7a346
+        }
e7a346
+
e7a346
+        if (replies && ia_type != IA_INVAL && ia_type != IA_IFDIR &&
e7a346
+            /* We want to accuse small files only when we know for
e7a346
+             * sure that there is no IO happening. Otherwise, the
e7a346
+             * ia_sizes obtained in post-refresh replies may
e7a346
+             * mismatch due to a race between inode-refresh and
e7a346
+             * ongoing writes, causing spurious heal launches*/
e7a346
+            !afr_is_possibly_under_txn (AFR_DATA_TRANSACTION, local, this)) {
e7a346
+                afr_accuse_smallfiles (this, replies, data_accused);
e7a346
+        }
e7a346
+
e7a346
+        for (i = 0; i < priv->child_count; i++) {
e7a346
+                if (data_accused[i]) {
e7a346
+                        data_readable[i] = 0;
e7a346
+                        ret = 1;
e7a346
+                }
e7a346
+                if (metadata_accused[i]) {
e7a346
+                        metadata_readable[i] = 0;
e7a346
+                        ret = 1;
e7a346
+                }
e7a346
+        }
e7a346
+        return ret;
e7a346
+}
e7a346
+
e7a346
+int
e7a346
 afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode,
e7a346
                        gf_boolean_t *start_heal)
e7a346
 {
e7a346
@@ -1025,62 +1099,9 @@ afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode,
e7a346
 	metadata_accused = alloca0 (priv->child_count);
e7a346
 	metadata_readable = alloca0 (priv->child_count);
e7a346
 
e7a346
-	for (i = 0; i < priv->child_count; i++) {
e7a346
-		data_readable[i] = 1;
e7a346
-		metadata_readable[i] = 1;
e7a346
-	}
e7a346
-        if (AFR_IS_ARBITER_BRICK (priv, ARBITER_BRICK_INDEX)) {
e7a346
-                data_readable[ARBITER_BRICK_INDEX] =  0;
e7a346
-                metadata_readable[ARBITER_BRICK_INDEX] = 0;
e7a346
-        }
e7a346
-
e7a346
-	for (i = 0; i < priv->child_count; i++) {
e7a346
-		if (!replies[i].valid) {
e7a346
-			data_readable[i] = 0;
e7a346
-			metadata_readable[i] = 0;
e7a346
-			continue;
e7a346
-		}
e7a346
-
e7a346
-		if (replies[i].op_ret == -1) {
e7a346
-			data_readable[i] = 0;
e7a346
-			metadata_readable[i] = 0;
e7a346
-			continue;
e7a346
-		}
e7a346
-
e7a346
-                if (replies[i].xdata &&
e7a346
-                    dict_get (replies[i].xdata, GLUSTERFS_BAD_INODE)) {
e7a346
-			data_readable[i] = 0;
e7a346
-			metadata_readable[i] = 0;
e7a346
-			continue;
e7a346
-                }
e7a346
-
e7a346
-		afr_accused_fill (this, replies[i].xdata, data_accused,
e7a346
-				  (replies[i].poststat.ia_type == IA_IFDIR) ?
e7a346
-				   AFR_ENTRY_TRANSACTION : AFR_DATA_TRANSACTION);
e7a346
-
e7a346
-		afr_accused_fill (this, replies[i].xdata,
e7a346
-				  metadata_accused, AFR_METADATA_TRANSACTION);
e7a346
-
e7a346
-	}
e7a346
-
e7a346
-	if ((inode->ia_type != IA_IFDIR) &&
e7a346
-            /* We want to accuse small files only when we know for sure that
e7a346
-             * there is no IO happening. Otherwise, the ia_sizes obtained in
e7a346
-             * post-refresh replies may  mismatch due to a race between inode-
e7a346
-             * refresh and ongoing writes, causing spurious heal launches*/
e7a346
-            !afr_is_possibly_under_txn (AFR_DATA_TRANSACTION, local, this))
e7a346
-		afr_accuse_smallfiles (this, replies, data_accused);
e7a346
-
e7a346
-	for (i = 0; i < priv->child_count; i++) {
e7a346
-		if (data_accused[i]) {
e7a346
-			data_readable[i] = 0;
e7a346
-			ret = 1;
e7a346
-		}
e7a346
-		if (metadata_accused[i]) {
e7a346
-			metadata_readable[i] = 0;
e7a346
-			ret = 1;
e7a346
-		}
e7a346
-	}
e7a346
+        ret = afr_readables_fill (frame, this, inode, data_accused,
e7a346
+                                  metadata_accused, data_readable,
e7a346
+                                  metadata_readable, replies);
e7a346
 
e7a346
 	for (i = 0; i < priv->child_count; i++) {
e7a346
                 if (start_heal && priv->child_up[i] &&
e7a346
@@ -5510,13 +5531,13 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)
e7a346
         if (!local->transaction.pre_op)
e7a346
                 goto out;
e7a346
 
e7a346
-        if (priv->arbiter_count == 1) {
e7a346
-                local->transaction.pre_op_xdata =
e7a346
-                        GF_CALLOC (sizeof (*local->transaction.pre_op_xdata),
e7a346
-                                   priv->child_count, gf_afr_mt_dict_t);
e7a346
-                if (!local->transaction.pre_op_xdata)
e7a346
-                        goto out;
e7a346
+        local->transaction.pre_op_xdata =
e7a346
+                GF_CALLOC (sizeof (*local->transaction.pre_op_xdata),
e7a346
+                           priv->child_count, gf_afr_mt_dict_t);
e7a346
+        if (!local->transaction.pre_op_xdata)
e7a346
+                goto out;
e7a346
 
e7a346
+        if (priv->arbiter_count == 1) {
e7a346
                 local->transaction.pre_op_sources =
e7a346
                         GF_CALLOC (sizeof (*local->transaction.pre_op_sources),
e7a346
                                    priv->child_count, gf_afr_mt_char);
e7a346
@@ -6489,42 +6510,45 @@ int
e7a346
 afr_write_subvol_set (call_frame_t *frame, xlator_t *this)
e7a346
 {
e7a346
         afr_local_t      *local = NULL;
e7a346
-        afr_inode_ctx_t  *ctx   = NULL;
e7a346
+        afr_private_t    *priv  = NULL;
e7a346
+        unsigned char    *data_accused = NULL;
e7a346
+        unsigned char    *metadata_accused = NULL;
e7a346
+        unsigned char    *data_readable = NULL;
e7a346
+        unsigned char    *metadata_readable = NULL;
e7a346
+        uint16_t          datamap = 0;
e7a346
+        uint16_t          metadatamap = 0;
e7a346
         uint64_t          val   = 0;
e7a346
-        uint64_t          val1  = 0;
e7a346
-        int               ret   = -1;
e7a346
+        int               event = 0;
e7a346
+        int               i     = 0;
e7a346
 
e7a346
         local = frame->local;
e7a346
+        priv = this->private;
e7a346
+        data_accused = alloca0 (priv->child_count);
e7a346
+        metadata_accused = alloca0 (priv->child_count);
e7a346
+        data_readable = alloca0 (priv->child_count);
e7a346
+        metadata_readable = alloca0 (priv->child_count);
e7a346
+        event = local->event_generation;
e7a346
+
e7a346
+        afr_readables_fill (frame, this, local->inode, data_accused,
e7a346
+                            metadata_accused, data_readable, metadata_readable,
e7a346
+                            NULL);
e7a346
+
e7a346
+        for (i = 0; i < priv->child_count; i++) {
e7a346
+                if (data_readable[i])
e7a346
+                        datamap |= (1 << i);
e7a346
+                if (metadata_readable[i])
e7a346
+                        metadatamap |= (1 << i);
e7a346
+        }
e7a346
+
e7a346
+        val = ((uint64_t) metadatamap) |
e7a346
+              (((uint64_t) datamap) << 16) |
e7a346
+              (((uint64_t) event) << 32);
e7a346
+
e7a346
         LOCK(&local->inode->lock);
e7a346
         {
e7a346
-                ret = __afr_inode_ctx_get (this, local->inode, &ctx);
e7a346
-                if (ret < 0) {
e7a346
-                        gf_msg (this->name, GF_LOG_ERROR, 0,
e7a346
-                                AFR_MSG_DICT_GET_FAILED,
e7a346
-                                "ERROR GETTING INODE CTX");
e7a346
-                        UNLOCK(&local->inode->lock);
e7a346
-                        return ret;
e7a346
-                }
e7a346
-
e7a346
-                val = ctx->write_subvol;
e7a346
-                /*
e7a346
-                 * We need to set the value of write_subvol to read_subvol in 2
e7a346
-                 * cases:
e7a346
-                 * 1. Initially when the value is 0. i.e., it's the first lock
e7a346
-                 * request.
e7a346
-                 * 2. If it's a metadata transaction. If metadata transactions
e7a346
-                 * comes in between data transactions and we have a brick
e7a346
-                 * disconnect, the next metadata transaction won't get the
e7a346
-                 * latest value of readables, since we do resetting of
e7a346
-                 * write_subvol in unlock code path only if it's a data
e7a346
-                 * transaction. To handle those scenarios we need to set the
e7a346
-                 * value of write_subvol to read_subvol in case of metadata
e7a346
-                 * transactions.
e7a346
-                */
e7a346
-                if (val == 0 ||
e7a346
-                    local->transaction.type == AFR_METADATA_TRANSACTION) {
e7a346
-                        val1 = ctx->read_subvol;
e7a346
-                        ctx->write_subvol = val1;
e7a346
+                if (local->inode_ctx->write_subvol == 0 &&
e7a346
+                    local->transaction.type == AFR_DATA_TRANSACTION) {
e7a346
+                        local->inode_ctx->write_subvol = val;
e7a346
                 }
e7a346
         }
e7a346
         UNLOCK (&local->inode->lock);
e7a346
@@ -6536,23 +6560,37 @@ int
e7a346
 afr_write_subvol_reset (call_frame_t *frame, xlator_t *this)
e7a346
 {
e7a346
         afr_local_t      *local = NULL;
e7a346
-        afr_inode_ctx_t  *ctx   = NULL;
e7a346
-        int               ret   = -1;
e7a346
 
e7a346
         local = frame->local;
e7a346
         LOCK(&local->inode->lock);
e7a346
         {
e7a346
-                ret = __afr_inode_ctx_get (this, local->inode, &ctx);
e7a346
-                if (ret < 0) {
e7a346
-                        gf_msg (this->name, GF_LOG_ERROR, 0,
e7a346
-                                AFR_MSG_DICT_GET_FAILED,
e7a346
-                                "ERROR GETTING INODE CTX");
e7a346
-                        UNLOCK(&local->inode->lock);
e7a346
-                        return ret;
e7a346
-                }
e7a346
-                ctx->write_subvol = 0;
e7a346
+                local->inode_ctx->lock_count--;
e7a346
+
e7a346
+                if (!local->inode_ctx->lock_count)
e7a346
+                        local->inode_ctx->write_subvol = 0;
e7a346
         }
e7a346
         UNLOCK(&local->inode->lock);
e7a346
 
e7a346
         return 0;
e7a346
 }
e7a346
+
e7a346
+int
e7a346
+afr_set_inode_local (xlator_t *this, afr_local_t *local, inode_t *inode)
e7a346
+{
e7a346
+        int ret = 0;
e7a346
+
e7a346
+        local->inode = inode_ref (inode);
e7a346
+        LOCK(&local->inode->lock);
e7a346
+        {
e7a346
+                ret = __afr_inode_ctx_get (this, local->inode,
e7a346
+                                           &local->inode_ctx);
e7a346
+        }
e7a346
+        UNLOCK (&local->inode->lock);
e7a346
+        if (ret < 0) {
e7a346
+                gf_msg_callingfn (this->name, GF_LOG_ERROR, ENOMEM,
e7a346
+                                  AFR_MSG_INODE_CTX_GET_FAILED,
e7a346
+                                  "Error getting inode ctx %s",
e7a346
+                                  uuid_utoa (local->inode->gfid));
e7a346
+        }
e7a346
+        return ret;
e7a346
+}
e7a346
diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c
e7a346
index 9099b8c..e088ed6 100644
e7a346
--- a/xlators/cluster/afr/src/afr-dir-write.c
e7a346
+++ b/xlators/cluster/afr/src/afr-dir-write.c
e7a346
@@ -477,7 +477,7 @@ afr_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
e7a346
 	if (!local->fd_ctx)
e7a346
 		goto out;
e7a346
 
e7a346
-	local->inode = inode_ref (loc->inode);
e7a346
+        local->inode = inode_ref (loc->inode);
e7a346
 	local->parent = inode_ref (loc->parent);
e7a346
 
e7a346
         local->op                = GF_FOP_CREATE;
e7a346
@@ -609,7 +609,7 @@ afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
e7a346
 		goto out;
e7a346
 
e7a346
         loc_copy (&local->loc, loc);
e7a346
-	local->inode = inode_ref (loc->inode);
e7a346
+        local->inode = inode_ref (loc->inode);
e7a346
 	local->parent = inode_ref (loc->parent);
e7a346
 
e7a346
         local->op               = GF_FOP_MKNOD;
e7a346
@@ -740,7 +740,7 @@ afr_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
e7a346
 		goto out;
e7a346
 
e7a346
         loc_copy (&local->loc, loc);
e7a346
-	local->inode = inode_ref (loc->inode);
e7a346
+        local->inode = inode_ref (loc->inode);
e7a346
 	local->parent = inode_ref (loc->parent);
e7a346
 
e7a346
         local->cont.mkdir.mode  = mode;
e7a346
@@ -877,7 +877,7 @@ afr_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
e7a346
         loc_copy (&local->loc,    oldloc);
e7a346
         loc_copy (&local->newloc, newloc);
e7a346
 
e7a346
-	local->inode = inode_ref (oldloc->inode);
e7a346
+        local->inode = inode_ref (oldloc->inode);
e7a346
 	local->parent = inode_ref (newloc->parent);
e7a346
 
e7a346
         if (xdata)
e7a346
@@ -1005,7 +1005,7 @@ afr_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath,
e7a346
 		goto out;
e7a346
 
e7a346
         loc_copy (&local->loc, loc);
e7a346
-	local->inode = inode_ref (loc->inode);
e7a346
+        local->inode = inode_ref (loc->inode);
e7a346
 	local->parent = inode_ref (loc->parent);
e7a346
 
e7a346
         local->cont.symlink.linkpath = gf_strdup (linkpath);
e7a346
@@ -1142,7 +1142,7 @@ afr_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
e7a346
         loc_copy (&local->loc,    oldloc);
e7a346
         loc_copy (&local->newloc, newloc);
e7a346
 
e7a346
-	local->inode = inode_ref (oldloc->inode);
e7a346
+        local->inode = inode_ref (oldloc->inode);
e7a346
 	local->parent = inode_ref (oldloc->parent);
e7a346
 	local->parent2 = inode_ref (newloc->parent);
e7a346
 
e7a346
@@ -1295,7 +1295,7 @@ afr_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
e7a346
         loc_copy (&local->loc, loc);
e7a346
         local->xflag = xflag;
e7a346
 
e7a346
-	local->inode = inode_ref (loc->inode);
e7a346
+        local->inode = inode_ref (loc->inode);
e7a346
 	local->parent = inode_ref (loc->parent);
e7a346
 
e7a346
         if (xdata)
e7a346
@@ -1421,7 +1421,7 @@ afr_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
e7a346
 
e7a346
 
e7a346
         loc_copy (&local->loc, loc);
e7a346
-	local->inode = inode_ref (loc->inode);
e7a346
+        local->inode = inode_ref (loc->inode);
e7a346
 	local->parent = inode_ref (loc->parent);
e7a346
 
e7a346
         local->cont.rmdir.flags = flags;
e7a346
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
e7a346
index 97397f9..f0231b7 100644
e7a346
--- a/xlators/cluster/afr/src/afr-inode-write.c
e7a346
+++ b/xlators/cluster/afr/src/afr-inode-write.c
e7a346
@@ -507,6 +507,7 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
e7a346
 {
e7a346
         afr_local_t *local = NULL;
e7a346
         int op_errno = ENOMEM;
e7a346
+        int ret = -1;
e7a346
 
e7a346
 	local = AFR_FRAME_INIT (frame, op_errno);
e7a346
 	if (!local)
e7a346
@@ -529,7 +530,9 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
e7a346
 		goto out;
e7a346
 
e7a346
         local->fd = fd_ref (fd);
e7a346
-	local->inode = inode_ref (fd->inode);
e7a346
+        ret = afr_set_inode_local (this, local, fd->inode);
e7a346
+        if (ret)
e7a346
+                goto out;
e7a346
 
e7a346
 	if (dict_set_uint32 (local->xdata_req, GLUSTERFS_OPEN_FD_COUNT, 4)) {
e7a346
 		op_errno = ENOMEM;
e7a346
@@ -654,7 +657,9 @@ afr_truncate (call_frame_t *frame, xlator_t *this,
e7a346
         local->transaction.unwind = afr_truncate_unwind;
e7a346
 
e7a346
         loc_copy (&local->loc, loc);
e7a346
-	local->inode = inode_ref (loc->inode);
e7a346
+        ret = afr_set_inode_local (this, local, loc->inode);
e7a346
+        if (ret)
e7a346
+                goto out;
e7a346
 
e7a346
         local->op = GF_FOP_TRUNCATE;
e7a346
 
e7a346
@@ -768,7 +773,9 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
e7a346
 		goto out;
e7a346
 
e7a346
         local->fd = fd_ref (fd);
e7a346
-	local->inode = inode_ref (fd->inode);
e7a346
+        ret = afr_set_inode_local (this, local, fd->inode);
e7a346
+        if (ret)
e7a346
+                goto out;
e7a346
 
e7a346
         local->op = GF_FOP_FTRUNCATE;
e7a346
 
e7a346
@@ -886,7 +893,9 @@ afr_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf,
e7a346
         local->transaction.unwind = afr_setattr_unwind;
e7a346
 
e7a346
         loc_copy (&local->loc, loc);
e7a346
-	local->inode = inode_ref (loc->inode);
e7a346
+        ret = afr_set_inode_local (this, local, loc->inode);
e7a346
+        if (ret)
e7a346
+                goto out;
e7a346
 
e7a346
 	local->op = GF_FOP_SETATTR;
e7a346
 
e7a346
@@ -991,7 +1000,9 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this,
e7a346
         local->transaction.unwind = afr_fsetattr_unwind;
e7a346
 
e7a346
         local->fd                 = fd_ref (fd);
e7a346
-	local->inode = inode_ref (fd->inode);
e7a346
+        ret = afr_set_inode_local (this, local, fd->inode);
e7a346
+        if (ret)
e7a346
+                goto out;
e7a346
 
e7a346
 	local->op = GF_FOP_FSETATTR;
e7a346
 
e7a346
@@ -1633,7 +1644,9 @@ afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
e7a346
         local->transaction.unwind = afr_setxattr_unwind;
e7a346
 
e7a346
         loc_copy (&local->loc, loc);
e7a346
-	local->inode = inode_ref (loc->inode);
e7a346
+        ret = afr_set_inode_local (this, local, loc->inode);
e7a346
+        if (ret)
e7a346
+                goto out;
e7a346
 
e7a346
         local->transaction.main_frame = frame;
e7a346
         local->transaction.start   = LLONG_MAX - 1;
e7a346
@@ -1745,7 +1758,9 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this,
e7a346
         local->transaction.unwind = afr_fsetxattr_unwind;
e7a346
 
e7a346
         local->fd                 = fd_ref (fd);
e7a346
-	local->inode = inode_ref (fd->inode);
e7a346
+        ret = afr_set_inode_local (this, local, fd->inode);
e7a346
+        if (ret)
e7a346
+                goto out;
e7a346
 
e7a346
 	local->op = GF_FOP_FSETXATTR;
e7a346
 
e7a346
@@ -1858,7 +1873,9 @@ afr_removexattr (call_frame_t *frame, xlator_t *this,
e7a346
         local->transaction.unwind = afr_removexattr_unwind;
e7a346
 
e7a346
         loc_copy (&local->loc, loc);
e7a346
-	local->inode = inode_ref (loc->inode);
e7a346
+        ret = afr_set_inode_local (this, local, loc->inode);
e7a346
+        if (ret)
e7a346
+                goto out;
e7a346
 
e7a346
 	local->op = GF_FOP_REMOVEXATTR;
e7a346
 
e7a346
@@ -1965,7 +1982,9 @@ afr_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
e7a346
         local->transaction.unwind = afr_fremovexattr_unwind;
e7a346
 
e7a346
         local->fd = fd_ref (fd);
e7a346
-	local->inode = inode_ref (fd->inode);
e7a346
+        ret = afr_set_inode_local (this, local, fd->inode);
e7a346
+        if (ret)
e7a346
+                goto out;
e7a346
 
e7a346
 	local->op = GF_FOP_FREMOVEXATTR;
e7a346
 
e7a346
@@ -2060,7 +2079,9 @@ afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
e7a346
         local->cont.fallocate.len = len;
e7a346
 
e7a346
         local->fd = fd_ref (fd);
e7a346
-	local->inode = inode_ref (fd->inode);
e7a346
+        ret = afr_set_inode_local (this, local, fd->inode);
e7a346
+        if (ret)
e7a346
+                goto out;
e7a346
 
e7a346
 	if (xdata)
e7a346
 		local->xdata_req = dict_copy_with_ref (xdata, NULL);
e7a346
@@ -2172,7 +2193,9 @@ afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
e7a346
         local->cont.discard.len = len;
e7a346
 
e7a346
         local->fd = fd_ref (fd);
e7a346
-	local->inode = inode_ref (fd->inode);
e7a346
+        ret = afr_set_inode_local (this, local, fd->inode);
e7a346
+        if (ret)
e7a346
+                goto out;
e7a346
 
e7a346
 	if (xdata)
e7a346
 		local->xdata_req = dict_copy_with_ref (xdata, NULL);
e7a346
@@ -2281,7 +2304,9 @@ afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
e7a346
         local->cont.zerofill.len = len;
e7a346
 
e7a346
         local->fd = fd_ref (fd);
e7a346
-	local->inode = inode_ref (fd->inode);
e7a346
+        ret = afr_set_inode_local (this, local, fd->inode);
e7a346
+        if (ret)
e7a346
+                goto out;
e7a346
 
e7a346
 	if (xdata)
e7a346
 		local->xdata_req = dict_copy_with_ref (xdata, NULL);
e7a346
@@ -2393,7 +2418,9 @@ afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
e7a346
         local->transaction.unwind = afr_xattrop_unwind;
e7a346
 
e7a346
         loc_copy (&local->loc, loc);
e7a346
-	local->inode = inode_ref (loc->inode);
e7a346
+        ret = afr_set_inode_local (this, local, loc->inode);
e7a346
+        if (ret)
e7a346
+                goto out;
e7a346
 
e7a346
 	local->op = GF_FOP_XATTROP;
e7a346
 
e7a346
@@ -2487,7 +2514,9 @@ afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
e7a346
         local->transaction.unwind = afr_fxattrop_unwind;
e7a346
 
e7a346
 	local->fd = fd_ref (fd);
e7a346
-	local->inode = inode_ref (fd->inode);
e7a346
+        ret = afr_set_inode_local (this, local, fd->inode);
e7a346
+        if (ret)
e7a346
+                goto out;
e7a346
 
e7a346
 	local->op = GF_FOP_FXATTROP;
e7a346
 
e7a346
diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c
e7a346
index c17f60f..f50c7b6 100644
e7a346
--- a/xlators/cluster/afr/src/afr-lk-common.c
e7a346
+++ b/xlators/cluster/afr/src/afr-lk-common.c
e7a346
@@ -615,14 +615,14 @@ afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
 {
e7a346
         afr_local_t             *local          = NULL;
e7a346
         afr_internal_lock_t     *int_lock       = NULL;
e7a346
-        afr_fd_ctx_t            *fd_ctx         = NULL;
e7a346
-        afr_private_t           *priv           = NULL;
e7a346
         int                      call_count     = 0;
e7a346
         int                      ret            = 0;
e7a346
 
e7a346
         local    = frame->local;
e7a346
         int_lock = &local->internal_lock;
e7a346
-        priv = this->private;
e7a346
+
e7a346
+        if (local->transaction.type == AFR_DATA_TRANSACTION && op_ret != 1)
e7a346
+                ret = afr_write_subvol_reset (frame, this);
e7a346
 
e7a346
         LOCK (&frame->lock);
e7a346
         {
e7a346
@@ -633,11 +633,6 @@ afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
         if (call_count == 0) {
e7a346
                 gf_msg_trace (this->name, 0,
e7a346
                               "All internal locks unlocked");
e7a346
-                if (local->fd) {
e7a346
-                        fd_ctx = afr_fd_ctx_get (local->fd, this);
e7a346
-                        if (0 == AFR_COUNT (fd_ctx->lock_acquired, priv->child_count))
e7a346
-                                ret = afr_write_subvol_reset (frame, this);
e7a346
-                }
e7a346
                 int_lock->lock_cbk (frame, this);
e7a346
         }
e7a346
 
e7a346
@@ -947,6 +942,15 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
                         } else {
e7a346
                                 int_lock->locked_nodes[child_index] |= LOCKED_YES;
e7a346
                                 int_lock->lock_count++;
e7a346
+
e7a346
+                                if (local->transaction.type ==
e7a346
+                                    AFR_DATA_TRANSACTION) {
e7a346
+                                        LOCK(&local->inode->lock);
e7a346
+                                        {
e7a346
+                                                local->inode_ctx->lock_count++;
e7a346
+                                        }
e7a346
+                                        UNLOCK (&local->inode->lock);
e7a346
+                                }
e7a346
                         }
e7a346
                 }
e7a346
                 afr_lock_blocking (frame, this, cky + 1);
e7a346
@@ -1502,13 +1506,12 @@ int32_t
e7a346
 afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
                              int32_t op_ret, int32_t op_errno, dict_t *xdata)
e7a346
 {
e7a346
-        afr_internal_lock_t *int_lock = NULL;
e7a346
-        afr_inodelk_t       *inodelk  = NULL;
e7a346
-        afr_local_t         *local    = NULL;
e7a346
-        int call_count  = 0;
e7a346
-        int child_index = (long) cookie;
e7a346
-        afr_fd_ctx_t        *fd_ctx = NULL;
e7a346
-
e7a346
+        afr_internal_lock_t *int_lock    = NULL;
e7a346
+        afr_inodelk_t       *inodelk     = NULL;
e7a346
+        afr_local_t         *local       = NULL;
e7a346
+        afr_fd_ctx_t        *fd_ctx      = NULL;
e7a346
+        int                  call_count  = 0;
e7a346
+        int                  child_index = (long) cookie;
e7a346
 
e7a346
         local    = frame->local;
e7a346
         int_lock = &local->internal_lock;
e7a346
@@ -1553,6 +1556,15 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
                                         fd_ctx->lock_acquired[child_index]++;
e7a346
 				}
e7a346
 			}
e7a346
+
e7a346
+                        if (local->transaction.type == AFR_DATA_TRANSACTION &&
e7a346
+                            op_ret == 0) {
e7a346
+                                LOCK(&local->inode->lock);
e7a346
+                                {
e7a346
+                                        local->inode_ctx->lock_count++;
e7a346
+                                }
e7a346
+                                UNLOCK (&local->inode->lock);
e7a346
+                        }
e7a346
 		}
e7a346
 
e7a346
                 call_count = --int_lock->lk_call_count;
e7a346
diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h
e7a346
index 02eb206..53ffcd8 100644
e7a346
--- a/xlators/cluster/afr/src/afr-messages.h
e7a346
+++ b/xlators/cluster/afr/src/afr-messages.h
e7a346
@@ -40,7 +40,7 @@
e7a346
  */
e7a346
 
e7a346
 #define GLFS_COMP_BASE_AFR      GLFS_MSGID_COMP_AFR
e7a346
-#define GLFS_NUM_MESSAGES       42
e7a346
+#define GLFS_NUM_MESSAGES       43
e7a346
 #define GLFS_MSGID_END          (GLFS_COMP_BASE_AFR + GLFS_NUM_MESSAGES + 1)
e7a346
 
e7a346
 #define glfs_msg_start_x GLFS_COMP_BASE_AFR, "Invalid: Start of messages"
e7a346
@@ -369,5 +369,12 @@
e7a346
 */
e7a346
 #define AFR_MSG_SBRAIN_FAV_CHILD_POLICY  (GLFS_COMP_BASE_AFR + 42)
e7a346
 
e7a346
+/*!
e7a346
+ * @messageid 108043
e7a346
+ * @diagnosis
e7a346
+ * @recommendation
e7a346
+*/
e7a346
+#define AFR_MSG_INODE_CTX_GET_FAILED (GLFS_COMP_BASE_AFR + 43)
e7a346
+
e7a346
 #define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages"
e7a346
 #endif /* !_AFR_MESSAGES_H_ */
e7a346
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
e7a346
index a04636f..7e40bba 100644
e7a346
--- a/xlators/cluster/afr/src/afr-transaction.c
e7a346
+++ b/xlators/cluster/afr/src/afr-transaction.c
e7a346
@@ -372,14 +372,27 @@ afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this)
e7a346
 int
e7a346
 afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
e7a346
 {
e7a346
-        afr_local_t     *local = NULL;
e7a346
-        afr_private_t   *priv = NULL;
e7a346
-        fd_t            *fd   = NULL;
e7a346
+        afr_local_t   *local = NULL;
e7a346
+        afr_private_t *priv  = NULL;
e7a346
+        fd_t          *fd    = NULL;
e7a346
+        int           i      = 0;
e7a346
+        int           ret    = 0;
e7a346
 
e7a346
         local = frame->local;
e7a346
         priv = this->private;
e7a346
         fd    = local->fd;
e7a346
 
e7a346
+        if (local->transaction.type == AFR_DATA_TRANSACTION &&
e7a346
+            !local->transaction.inherited) {
e7a346
+                ret = afr_write_subvol_set (frame, this);
e7a346
+                if (ret) {
e7a346
+                        /*act as if operation failed on all subvols*/
e7a346
+                        local->op_ret = -1;
e7a346
+                        local->op_errno = -ret;
e7a346
+                        for (i = 0; i < priv->child_count; i++)
e7a346
+                                local->transaction.failed_subvols[i] = 1;
e7a346
+                }
e7a346
+        }
e7a346
         /*  Perform fops with the lk-owner from top xlator.
e7a346
          *  Eg: lk-owner of posix-lk and flush should be same,
e7a346
          *  flush cant clear the  posix-lks without that lk-owner.
e7a346
@@ -1116,32 +1129,28 @@ unlock:
e7a346
 
e7a346
 int
e7a346
 afr_changelog_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
-		   int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
e7a346
+                   int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
e7a346
 {
e7a346
         afr_local_t *local = NULL;
e7a346
-        afr_private_t *priv = NULL;
e7a346
         int call_count = -1;
e7a346
         int child_index = -1;
e7a346
 
e7a346
         local = frame->local;
e7a346
-        priv = this->private;
e7a346
         child_index = (long) cookie;
e7a346
 
e7a346
-	if (op_ret == -1) {
e7a346
+        if (op_ret == -1) {
e7a346
                 local->op_errno = op_errno;
e7a346
-		afr_transaction_fop_failed (frame, this, child_index);
e7a346
+                afr_transaction_fop_failed (frame, this, child_index);
e7a346
         }
e7a346
 
e7a346
-        if (priv->arbiter_count == 1 && !op_ret) {
e7a346
-                if (xattr)
e7a346
-                        local->transaction.pre_op_xdata[child_index] =
e7a346
-                                                               dict_ref (xattr);
e7a346
-        }
e7a346
+        if (xattr)
e7a346
+                local->transaction.pre_op_xdata[child_index] = dict_ref (xattr);
e7a346
 
e7a346
-	call_count = afr_frame_return (frame);
e7a346
+        call_count = afr_frame_return (frame);
e7a346
 
e7a346
-        if (call_count == 0)
e7a346
-		local->transaction.changelog_resume (frame, this);
e7a346
+        if (call_count == 0) {
e7a346
+                local->transaction.changelog_resume (frame, this);
e7a346
+        }
e7a346
 
e7a346
         return 0;
e7a346
 }
e7a346
@@ -1750,10 +1759,6 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
e7a346
 	if (pre_nop)
e7a346
 		goto next;
e7a346
 
e7a346
-        ret = afr_write_subvol_set (frame, this);
e7a346
-        if (ret)
e7a346
-                goto err;
e7a346
-
e7a346
 	if (!local->pre_op_compat) {
e7a346
 		dict_copy (xdata_req, local->xdata_req);
e7a346
 		goto next;
e7a346
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
e7a346
index 0a06eb6..96fefb1 100644
e7a346
--- a/xlators/cluster/afr/src/afr.h
e7a346
+++ b/xlators/cluster/afr/src/afr.h
e7a346
@@ -377,6 +377,16 @@ typedef enum {
e7a346
         AFR_FOP_LOCK_QUORUM_FAILED,
e7a346
 } afr_fop_lock_state_t;
e7a346
 
e7a346
+typedef struct _afr_inode_ctx {
e7a346
+        uint64_t        read_subvol;
e7a346
+        uint64_t        write_subvol;
e7a346
+        int             lock_count;
e7a346
+        int             spb_choice;
e7a346
+        gf_timer_t      *timer;
e7a346
+        gf_boolean_t    need_refresh;
e7a346
+} afr_inode_ctx_t;
e7a346
+
e7a346
+
e7a346
 typedef struct _afr_local {
e7a346
 	glusterfs_fop_t  op;
e7a346
         unsigned int call_count;
e7a346
@@ -833,17 +843,10 @@ typedef struct _afr_local {
e7a346
         compound_args_t *c_args;
e7a346
 
e7a346
         gf_boolean_t is_read_txn;
e7a346
+        afr_inode_ctx_t *inode_ctx;
e7a346
 } afr_local_t;
e7a346
 
e7a346
 
e7a346
-typedef struct _afr_inode_ctx {
e7a346
-        uint64_t        read_subvol;
e7a346
-        uint64_t        write_subvol;
e7a346
-        int             spb_choice;
e7a346
-        gf_timer_t      *timer;
e7a346
-        gf_boolean_t    need_refresh;
e7a346
-} afr_inode_ctx_t;
e7a346
-
e7a346
 typedef struct afr_spbc_timeout {
e7a346
         call_frame_t *frame;
e7a346
         gf_boolean_t d_spb;
e7a346
@@ -1274,6 +1277,9 @@ afr_write_subvol_set (call_frame_t *frame, xlator_t *this);
e7a346
 int
e7a346
 afr_write_subvol_reset (call_frame_t *frame, xlator_t *this);
e7a346
 
e7a346
+int
e7a346
+afr_set_inode_local (xlator_t *this, afr_local_t *local, inode_t *inode);
e7a346
+
e7a346
 gf_boolean_t
e7a346
 afr_is_symmetric_error (call_frame_t *frame, xlator_t *this);
e7a346
 #endif /* __AFR_H__ */
e7a346
-- 
e7a346
1.8.3.1
e7a346