d1681e
From 3a4682ccd935744a0c5346bae23658ff08d65343 Mon Sep 17 00:00:00 2001
d1681e
From: karthik-us <ksubrahm@redhat.com>
d1681e
Date: Mon, 15 Jan 2018 12:48:54 +0530
d1681e
Subject: [PATCH 125/128] cluster/afr: Fixing the flaws in arbiter becoming
d1681e
 source patch
d1681e
d1681e
Problem:
d1681e
Setting the write_subvol value to read_subvol in case of metadata
d1681e
transaction during pre-op (commit 19f9bcff4aada589d4321356c2670ed283f02c03)
d1681e
might lead to the original problem of arbiter becoming source.
d1681e
d1681e
Scenario:
d1681e
1) All bricks are up and good
d1681e
2) 2 writes w1 and w2 are in progress in parallel
d1681e
3) ctx->read_subvol is good for all the subvolumes
d1681e
4) w1 succeeds on brick0 and fails on brick1, yet to do post-op on
d1681e
   the disk
d1681e
5) read/lookup comes on the same file and refreshes read_subvols back
d1681e
   to all good
d1681e
6) metadata transaction happens which makes ctx->write_subvol to be
d1681e
   assigned with ctx->read_subvol which is all good
d1681e
7) w2 succeeds on brick1 and fails on brick0 and this will update the
d1681e
   brick in reverse order leading to arbiter becoming source
d1681e
d1681e
Fix:
d1681e
Instead of setting the ctx->write_subvol to ctx->read_subvol in the
d1681e
pre-op stage, if there is a metadata transaction, check in the
d1681e
function __afr_set_in_flight_sb_status() if it is a data/metadata
d1681e
transaction. Use the value of ctx->write_subvol if it is a data
d1681e
transaction and ctx->read_subvol value for other transactions.
d1681e
d1681e
With this patch we assign the value of ctx->write_subvol in the
d1681e
afr_transaction_perform_fop() with the on disk value, instead of
d1681e
assigning it in the afr_changelog_pre_op() with the in memory value.
d1681e
d1681e
Upstream Patch: https://review.gluster.org/#/c/19045/
d1681e
d1681e
> Change-Id: Id2025a7e965f0578af35b1abaac793b019c43cc4
d1681e
> BUG: 1482064
d1681e
> Signed-off-by: karthik-us <ksubrahm@redhat.com>
d1681e
d1681e
Change-Id: Ie5d6745703fa5024d27e413093f7dfd08992e1df
d1681e
BUG: 1401969
d1681e
Signed-off-by: karthik-us <ksubrahm@redhat.com>
d1681e
Reviewed-on: https://code.engineering.redhat.com/gerrit/127644
d1681e
Tested-by: RHGS Build Bot <nigelb@redhat.com>
d1681e
Reviewed-by: Ravishankar Narayanankutty <ravishankar@redhat.com>
d1681e
Tested-by: Ravishankar Narayanankutty <ravishankar@redhat.com>
d1681e
---
d1681e
 xlators/cluster/afr/src/afr-common.c      | 266 +++++++++++++++++-------------
d1681e
 xlators/cluster/afr/src/afr-dir-write.c   |  16 +-
d1681e
 xlators/cluster/afr/src/afr-inode-write.c |  57 +++++--
d1681e
 xlators/cluster/afr/src/afr-lk-common.c   |  42 +++--
d1681e
 xlators/cluster/afr/src/afr-messages.h    |   9 +-
d1681e
 xlators/cluster/afr/src/afr-transaction.c |  45 ++---
d1681e
 xlators/cluster/afr/src/afr.h             |  22 ++-
d1681e
 7 files changed, 277 insertions(+), 180 deletions(-)
d1681e
d1681e
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
d1681e
index 692f198..6e6f5fa 100644
d1681e
--- a/xlators/cluster/afr/src/afr-common.c
d1681e
+++ b/xlators/cluster/afr/src/afr-common.c
d1681e
@@ -150,6 +150,7 @@ __afr_inode_ctx_get (xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx)
d1681e
                 tmp_ctx->spb_choice = -1;
d1681e
                 tmp_ctx->read_subvol = 0;
d1681e
                 tmp_ctx->write_subvol = 0;
d1681e
+                tmp_ctx->lock_count = 0;
d1681e
         } else {
d1681e
                 tmp_ctx = (afr_inode_ctx_t *) ctx_int;
d1681e
         }
d1681e
@@ -195,7 +196,6 @@ __afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local,
d1681e
                                inode_t *inode)
d1681e
 {
d1681e
         int                 i               = 0;
d1681e
-        int                 ret             = -1;
d1681e
         int                 txn_type        = 0;
d1681e
         int                 count           = 0;
d1681e
         int                 index           = -1;
d1681e
@@ -208,16 +208,14 @@ __afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local,
d1681e
         uint32_t            event           = 0;
d1681e
         uint64_t            val             = 0;
d1681e
         afr_private_t      *priv            = NULL;
d1681e
-        afr_inode_ctx_t    *ctx             = NULL;
d1681e
 
d1681e
         priv = this->private;
d1681e
         txn_type = local->transaction.type;
d1681e
 
d1681e
-        ret = __afr_inode_ctx_get (this, inode, &ctx);
d1681e
-        if (ret < 0)
d1681e
-                return ret;
d1681e
-
d1681e
-        val = ctx->write_subvol;
d1681e
+        if (txn_type == AFR_DATA_TRANSACTION)
d1681e
+                val = local->inode_ctx->write_subvol;
d1681e
+        else
d1681e
+                val = local->inode_ctx->read_subvol;
d1681e
 
d1681e
         metadatamap_old = metadatamap = (val & 0x000000000000ffff);
d1681e
         datamap_old = datamap = (val & 0x00000000ffff0000) >> 16;
d1681e
@@ -278,10 +276,11 @@ __afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local,
d1681e
                 (((uint64_t) datamap) << 16) |
d1681e
                 (((uint64_t) event) << 32);
d1681e
 
d1681e
-        ctx->write_subvol = val;
d1681e
-        ctx->read_subvol = val;
d1681e
+        if (txn_type == AFR_DATA_TRANSACTION)
d1681e
+                local->inode_ctx->write_subvol = val;
d1681e
+        local->inode_ctx->read_subvol = val;
d1681e
 
d1681e
-        return ret;
d1681e
+        return 0;
d1681e
 }
d1681e
 
d1681e
 gf_boolean_t
d1681e
@@ -1001,6 +1000,81 @@ afr_accuse_smallfiles (xlator_t *this, struct afr_reply *replies,
d1681e
 }
d1681e
 
d1681e
 int
d1681e
+afr_readables_fill (call_frame_t *frame, xlator_t *this, inode_t *inode,
d1681e
+                    unsigned char *data_accused,
d1681e
+                    unsigned char *metadata_accused,
d1681e
+                    unsigned char *data_readable,
d1681e
+                    unsigned char *metadata_readable,
d1681e
+                    struct afr_reply *replies)
d1681e
+{
d1681e
+        afr_local_t *local = NULL;
d1681e
+        afr_private_t *priv = NULL;
d1681e
+        dict_t *xdata = NULL;
d1681e
+        int i = 0;
d1681e
+        int ret = 0;
d1681e
+        ia_type_t ia_type = IA_INVAL;
d1681e
+
d1681e
+        local = frame->local;
d1681e
+        priv = this->private;
d1681e
+
d1681e
+        for (i = 0; i < priv->child_count; i++) {
d1681e
+                data_readable[i] = 1;
d1681e
+                metadata_readable[i] = 1;
d1681e
+        }
d1681e
+        if (AFR_IS_ARBITER_BRICK (priv, ARBITER_BRICK_INDEX)) {
d1681e
+                data_readable[ARBITER_BRICK_INDEX] =  0;
d1681e
+                metadata_readable[ARBITER_BRICK_INDEX] = 0;
d1681e
+        }
d1681e
+
d1681e
+        for (i = 0; i < priv->child_count; i++) {
d1681e
+                if (replies) {/* Lookup */
d1681e
+                        if (!replies[i].valid || replies[i].op_ret == -1 ||
d1681e
+                            (replies[i].xdata && dict_get (replies[i].xdata,
d1681e
+                                                        GLUSTERFS_BAD_INODE))) {
d1681e
+                                data_readable[i] = 0;
d1681e
+                                metadata_readable[i] = 0;
d1681e
+                                continue;
d1681e
+                        }
d1681e
+
d1681e
+                        xdata = replies[i].xdata;
d1681e
+                        ia_type = replies[i].poststat.ia_type;
d1681e
+                } else {/* pre-op xattrop */
d1681e
+                        xdata = local->transaction.pre_op_xdata[i];
d1681e
+                        ia_type = inode->ia_type;
d1681e
+                }
d1681e
+
d1681e
+                afr_accused_fill (this, xdata, data_accused,
d1681e
+                                  (ia_type == IA_IFDIR) ?
d1681e
+                                  AFR_ENTRY_TRANSACTION : AFR_DATA_TRANSACTION);
d1681e
+
d1681e
+                afr_accused_fill (this, xdata,
d1681e
+                                  metadata_accused, AFR_METADATA_TRANSACTION);
d1681e
+        }
d1681e
+
d1681e
+        if (replies && ia_type != IA_INVAL && ia_type != IA_IFDIR &&
d1681e
+            /* We want to accuse small files only when we know for
d1681e
+             * sure that there is no IO happening. Otherwise, the
d1681e
+             * ia_sizes obtained in post-refresh replies may
d1681e
+             * mismatch due to a race between inode-refresh and
d1681e
+             * ongoing writes, causing spurious heal launches*/
d1681e
+            !afr_is_possibly_under_txn (AFR_DATA_TRANSACTION, local, this)) {
d1681e
+                afr_accuse_smallfiles (this, replies, data_accused);
d1681e
+        }
d1681e
+
d1681e
+        for (i = 0; i < priv->child_count; i++) {
d1681e
+                if (data_accused[i]) {
d1681e
+                        data_readable[i] = 0;
d1681e
+                        ret = 1;
d1681e
+                }
d1681e
+                if (metadata_accused[i]) {
d1681e
+                        metadata_readable[i] = 0;
d1681e
+                        ret = 1;
d1681e
+                }
d1681e
+        }
d1681e
+        return ret;
d1681e
+}
d1681e
+
d1681e
+int
d1681e
 afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode,
d1681e
                        gf_boolean_t *start_heal)
d1681e
 {
d1681e
@@ -1025,62 +1099,9 @@ afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode,
d1681e
 	metadata_accused = alloca0 (priv->child_count);
d1681e
 	metadata_readable = alloca0 (priv->child_count);
d1681e
 
d1681e
-	for (i = 0; i < priv->child_count; i++) {
d1681e
-		data_readable[i] = 1;
d1681e
-		metadata_readable[i] = 1;
d1681e
-	}
d1681e
-        if (AFR_IS_ARBITER_BRICK (priv, ARBITER_BRICK_INDEX)) {
d1681e
-                data_readable[ARBITER_BRICK_INDEX] =  0;
d1681e
-                metadata_readable[ARBITER_BRICK_INDEX] = 0;
d1681e
-        }
d1681e
-
d1681e
-	for (i = 0; i < priv->child_count; i++) {
d1681e
-		if (!replies[i].valid) {
d1681e
-			data_readable[i] = 0;
d1681e
-			metadata_readable[i] = 0;
d1681e
-			continue;
d1681e
-		}
d1681e
-
d1681e
-		if (replies[i].op_ret == -1) {
d1681e
-			data_readable[i] = 0;
d1681e
-			metadata_readable[i] = 0;
d1681e
-			continue;
d1681e
-		}
d1681e
-
d1681e
-                if (replies[i].xdata &&
d1681e
-                    dict_get (replies[i].xdata, GLUSTERFS_BAD_INODE)) {
d1681e
-			data_readable[i] = 0;
d1681e
-			metadata_readable[i] = 0;
d1681e
-			continue;
d1681e
-                }
d1681e
-
d1681e
-		afr_accused_fill (this, replies[i].xdata, data_accused,
d1681e
-				  (replies[i].poststat.ia_type == IA_IFDIR) ?
d1681e
-				   AFR_ENTRY_TRANSACTION : AFR_DATA_TRANSACTION);
d1681e
-
d1681e
-		afr_accused_fill (this, replies[i].xdata,
d1681e
-				  metadata_accused, AFR_METADATA_TRANSACTION);
d1681e
-
d1681e
-	}
d1681e
-
d1681e
-	if ((inode->ia_type != IA_IFDIR) &&
d1681e
-            /* We want to accuse small files only when we know for sure that
d1681e
-             * there is no IO happening. Otherwise, the ia_sizes obtained in
d1681e
-             * post-refresh replies may  mismatch due to a race between inode-
d1681e
-             * refresh and ongoing writes, causing spurious heal launches*/
d1681e
-            !afr_is_possibly_under_txn (AFR_DATA_TRANSACTION, local, this))
d1681e
-		afr_accuse_smallfiles (this, replies, data_accused);
d1681e
-
d1681e
-	for (i = 0; i < priv->child_count; i++) {
d1681e
-		if (data_accused[i]) {
d1681e
-			data_readable[i] = 0;
d1681e
-			ret = 1;
d1681e
-		}
d1681e
-		if (metadata_accused[i]) {
d1681e
-			metadata_readable[i] = 0;
d1681e
-			ret = 1;
d1681e
-		}
d1681e
-	}
d1681e
+        ret = afr_readables_fill (frame, this, inode, data_accused,
d1681e
+                                  metadata_accused, data_readable,
d1681e
+                                  metadata_readable, replies);
d1681e
 
d1681e
 	for (i = 0; i < priv->child_count; i++) {
d1681e
                 if (start_heal && priv->child_up[i] &&
d1681e
@@ -5510,13 +5531,13 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)
d1681e
         if (!local->transaction.pre_op)
d1681e
                 goto out;
d1681e
 
d1681e
-        if (priv->arbiter_count == 1) {
d1681e
-                local->transaction.pre_op_xdata =
d1681e
-                        GF_CALLOC (sizeof (*local->transaction.pre_op_xdata),
d1681e
-                                   priv->child_count, gf_afr_mt_dict_t);
d1681e
-                if (!local->transaction.pre_op_xdata)
d1681e
-                        goto out;
d1681e
+        local->transaction.pre_op_xdata =
d1681e
+                GF_CALLOC (sizeof (*local->transaction.pre_op_xdata),
d1681e
+                           priv->child_count, gf_afr_mt_dict_t);
d1681e
+        if (!local->transaction.pre_op_xdata)
d1681e
+                goto out;
d1681e
 
d1681e
+        if (priv->arbiter_count == 1) {
d1681e
                 local->transaction.pre_op_sources =
d1681e
                         GF_CALLOC (sizeof (*local->transaction.pre_op_sources),
d1681e
                                    priv->child_count, gf_afr_mt_char);
d1681e
@@ -6489,42 +6510,45 @@ int
d1681e
 afr_write_subvol_set (call_frame_t *frame, xlator_t *this)
d1681e
 {
d1681e
         afr_local_t      *local = NULL;
d1681e
-        afr_inode_ctx_t  *ctx   = NULL;
d1681e
+        afr_private_t    *priv  = NULL;
d1681e
+        unsigned char    *data_accused = NULL;
d1681e
+        unsigned char    *metadata_accused = NULL;
d1681e
+        unsigned char    *data_readable = NULL;
d1681e
+        unsigned char    *metadata_readable = NULL;
d1681e
+        uint16_t          datamap = 0;
d1681e
+        uint16_t          metadatamap = 0;
d1681e
         uint64_t          val   = 0;
d1681e
-        uint64_t          val1  = 0;
d1681e
-        int               ret   = -1;
d1681e
+        int               event = 0;
d1681e
+        int               i     = 0;
d1681e
 
d1681e
         local = frame->local;
d1681e
+        priv = this->private;
d1681e
+        data_accused = alloca0 (priv->child_count);
d1681e
+        metadata_accused = alloca0 (priv->child_count);
d1681e
+        data_readable = alloca0 (priv->child_count);
d1681e
+        metadata_readable = alloca0 (priv->child_count);
d1681e
+        event = local->event_generation;
d1681e
+
d1681e
+        afr_readables_fill (frame, this, local->inode, data_accused,
d1681e
+                            metadata_accused, data_readable, metadata_readable,
d1681e
+                            NULL);
d1681e
+
d1681e
+        for (i = 0; i < priv->child_count; i++) {
d1681e
+                if (data_readable[i])
d1681e
+                        datamap |= (1 << i);
d1681e
+                if (metadata_readable[i])
d1681e
+                        metadatamap |= (1 << i);
d1681e
+        }
d1681e
+
d1681e
+        val = ((uint64_t) metadatamap) |
d1681e
+              (((uint64_t) datamap) << 16) |
d1681e
+              (((uint64_t) event) << 32);
d1681e
+
d1681e
         LOCK(&local->inode->lock);
d1681e
         {
d1681e
-                ret = __afr_inode_ctx_get (this, local->inode, &ctx);
d1681e
-                if (ret < 0) {
d1681e
-                        gf_msg (this->name, GF_LOG_ERROR, 0,
d1681e
-                                AFR_MSG_DICT_GET_FAILED,
d1681e
-                                "ERROR GETTING INODE CTX");
d1681e
-                        UNLOCK(&local->inode->lock);
d1681e
-                        return ret;
d1681e
-                }
d1681e
-
d1681e
-                val = ctx->write_subvol;
d1681e
-                /*
d1681e
-                 * We need to set the value of write_subvol to read_subvol in 2
d1681e
-                 * cases:
d1681e
-                 * 1. Initially when the value is 0. i.e., it's the first lock
d1681e
-                 * request.
d1681e
-                 * 2. If it's a metadata transaction. If metadata transactions
d1681e
-                 * comes in between data transactions and we have a brick
d1681e
-                 * disconnect, the next metadata transaction won't get the
d1681e
-                 * latest value of readables, since we do resetting of
d1681e
-                 * write_subvol in unlock code path only if it's a data
d1681e
-                 * transaction. To handle those scenarios we need to set the
d1681e
-                 * value of write_subvol to read_subvol in case of metadata
d1681e
-                 * transactions.
d1681e
-                */
d1681e
-                if (val == 0 ||
d1681e
-                    local->transaction.type == AFR_METADATA_TRANSACTION) {
d1681e
-                        val1 = ctx->read_subvol;
d1681e
-                        ctx->write_subvol = val1;
d1681e
+                if (local->inode_ctx->write_subvol == 0 &&
d1681e
+                    local->transaction.type == AFR_DATA_TRANSACTION) {
d1681e
+                        local->inode_ctx->write_subvol = val;
d1681e
                 }
d1681e
         }
d1681e
         UNLOCK (&local->inode->lock);
d1681e
@@ -6536,23 +6560,37 @@ int
d1681e
 afr_write_subvol_reset (call_frame_t *frame, xlator_t *this)
d1681e
 {
d1681e
         afr_local_t      *local = NULL;
d1681e
-        afr_inode_ctx_t  *ctx   = NULL;
d1681e
-        int               ret   = -1;
d1681e
 
d1681e
         local = frame->local;
d1681e
         LOCK(&local->inode->lock);
d1681e
         {
d1681e
-                ret = __afr_inode_ctx_get (this, local->inode, &ctx);
d1681e
-                if (ret < 0) {
d1681e
-                        gf_msg (this->name, GF_LOG_ERROR, 0,
d1681e
-                                AFR_MSG_DICT_GET_FAILED,
d1681e
-                                "ERROR GETTING INODE CTX");
d1681e
-                        UNLOCK(&local->inode->lock);
d1681e
-                        return ret;
d1681e
-                }
d1681e
-                ctx->write_subvol = 0;
d1681e
+                local->inode_ctx->lock_count--;
d1681e
+
d1681e
+                if (!local->inode_ctx->lock_count)
d1681e
+                        local->inode_ctx->write_subvol = 0;
d1681e
         }
d1681e
         UNLOCK(&local->inode->lock);
d1681e
 
d1681e
         return 0;
d1681e
 }
d1681e
+
d1681e
+int
d1681e
+afr_set_inode_local (xlator_t *this, afr_local_t *local, inode_t *inode)
d1681e
+{
d1681e
+        int ret = 0;
d1681e
+
d1681e
+        local->inode = inode_ref (inode);
d1681e
+        LOCK(&local->inode->lock);
d1681e
+        {
d1681e
+                ret = __afr_inode_ctx_get (this, local->inode,
d1681e
+                                           &local->inode_ctx);
d1681e
+        }
d1681e
+        UNLOCK (&local->inode->lock);
d1681e
+        if (ret < 0) {
d1681e
+                gf_msg_callingfn (this->name, GF_LOG_ERROR, ENOMEM,
d1681e
+                                  AFR_MSG_INODE_CTX_GET_FAILED,
d1681e
+                                  "Error getting inode ctx %s",
d1681e
+                                  uuid_utoa (local->inode->gfid));
d1681e
+        }
d1681e
+        return ret;
d1681e
+}
d1681e
diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c
d1681e
index 9099b8c..e088ed6 100644
d1681e
--- a/xlators/cluster/afr/src/afr-dir-write.c
d1681e
+++ b/xlators/cluster/afr/src/afr-dir-write.c
d1681e
@@ -477,7 +477,7 @@ afr_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
d1681e
 	if (!local->fd_ctx)
d1681e
 		goto out;
d1681e
 
d1681e
-	local->inode = inode_ref (loc->inode);
d1681e
+        local->inode = inode_ref (loc->inode);
d1681e
 	local->parent = inode_ref (loc->parent);
d1681e
 
d1681e
         local->op                = GF_FOP_CREATE;
d1681e
@@ -609,7 +609,7 @@ afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
d1681e
 		goto out;
d1681e
 
d1681e
         loc_copy (&local->loc, loc);
d1681e
-	local->inode = inode_ref (loc->inode);
d1681e
+        local->inode = inode_ref (loc->inode);
d1681e
 	local->parent = inode_ref (loc->parent);
d1681e
 
d1681e
         local->op               = GF_FOP_MKNOD;
d1681e
@@ -740,7 +740,7 @@ afr_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
d1681e
 		goto out;
d1681e
 
d1681e
         loc_copy (&local->loc, loc);
d1681e
-	local->inode = inode_ref (loc->inode);
d1681e
+        local->inode = inode_ref (loc->inode);
d1681e
 	local->parent = inode_ref (loc->parent);
d1681e
 
d1681e
         local->cont.mkdir.mode  = mode;
d1681e
@@ -877,7 +877,7 @@ afr_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
d1681e
         loc_copy (&local->loc,    oldloc);
d1681e
         loc_copy (&local->newloc, newloc);
d1681e
 
d1681e
-	local->inode = inode_ref (oldloc->inode);
d1681e
+        local->inode = inode_ref (oldloc->inode);
d1681e
 	local->parent = inode_ref (newloc->parent);
d1681e
 
d1681e
         if (xdata)
d1681e
@@ -1005,7 +1005,7 @@ afr_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath,
d1681e
 		goto out;
d1681e
 
d1681e
         loc_copy (&local->loc, loc);
d1681e
-	local->inode = inode_ref (loc->inode);
d1681e
+        local->inode = inode_ref (loc->inode);
d1681e
 	local->parent = inode_ref (loc->parent);
d1681e
 
d1681e
         local->cont.symlink.linkpath = gf_strdup (linkpath);
d1681e
@@ -1142,7 +1142,7 @@ afr_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
d1681e
         loc_copy (&local->loc,    oldloc);
d1681e
         loc_copy (&local->newloc, newloc);
d1681e
 
d1681e
-	local->inode = inode_ref (oldloc->inode);
d1681e
+        local->inode = inode_ref (oldloc->inode);
d1681e
 	local->parent = inode_ref (oldloc->parent);
d1681e
 	local->parent2 = inode_ref (newloc->parent);
d1681e
 
d1681e
@@ -1295,7 +1295,7 @@ afr_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
d1681e
         loc_copy (&local->loc, loc);
d1681e
         local->xflag = xflag;
d1681e
 
d1681e
-	local->inode = inode_ref (loc->inode);
d1681e
+        local->inode = inode_ref (loc->inode);
d1681e
 	local->parent = inode_ref (loc->parent);
d1681e
 
d1681e
         if (xdata)
d1681e
@@ -1421,7 +1421,7 @@ afr_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
d1681e
 
d1681e
 
d1681e
         loc_copy (&local->loc, loc);
d1681e
-	local->inode = inode_ref (loc->inode);
d1681e
+        local->inode = inode_ref (loc->inode);
d1681e
 	local->parent = inode_ref (loc->parent);
d1681e
 
d1681e
         local->cont.rmdir.flags = flags;
d1681e
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
d1681e
index 97397f9..f0231b7 100644
d1681e
--- a/xlators/cluster/afr/src/afr-inode-write.c
d1681e
+++ b/xlators/cluster/afr/src/afr-inode-write.c
d1681e
@@ -507,6 +507,7 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
d1681e
 {
d1681e
         afr_local_t *local = NULL;
d1681e
         int op_errno = ENOMEM;
d1681e
+        int ret = -1;
d1681e
 
d1681e
 	local = AFR_FRAME_INIT (frame, op_errno);
d1681e
 	if (!local)
d1681e
@@ -529,7 +530,9 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
d1681e
 		goto out;
d1681e
 
d1681e
         local->fd = fd_ref (fd);
d1681e
-	local->inode = inode_ref (fd->inode);
d1681e
+        ret = afr_set_inode_local (this, local, fd->inode);
d1681e
+        if (ret)
d1681e
+                goto out;
d1681e
 
d1681e
 	if (dict_set_uint32 (local->xdata_req, GLUSTERFS_OPEN_FD_COUNT, 4)) {
d1681e
 		op_errno = ENOMEM;
d1681e
@@ -654,7 +657,9 @@ afr_truncate (call_frame_t *frame, xlator_t *this,
d1681e
         local->transaction.unwind = afr_truncate_unwind;
d1681e
 
d1681e
         loc_copy (&local->loc, loc);
d1681e
-	local->inode = inode_ref (loc->inode);
d1681e
+        ret = afr_set_inode_local (this, local, loc->inode);
d1681e
+        if (ret)
d1681e
+                goto out;
d1681e
 
d1681e
         local->op = GF_FOP_TRUNCATE;
d1681e
 
d1681e
@@ -768,7 +773,9 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
d1681e
 		goto out;
d1681e
 
d1681e
         local->fd = fd_ref (fd);
d1681e
-	local->inode = inode_ref (fd->inode);
d1681e
+        ret = afr_set_inode_local (this, local, fd->inode);
d1681e
+        if (ret)
d1681e
+                goto out;
d1681e
 
d1681e
         local->op = GF_FOP_FTRUNCATE;
d1681e
 
d1681e
@@ -886,7 +893,9 @@ afr_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf,
d1681e
         local->transaction.unwind = afr_setattr_unwind;
d1681e
 
d1681e
         loc_copy (&local->loc, loc);
d1681e
-	local->inode = inode_ref (loc->inode);
d1681e
+        ret = afr_set_inode_local (this, local, loc->inode);
d1681e
+        if (ret)
d1681e
+                goto out;
d1681e
 
d1681e
 	local->op = GF_FOP_SETATTR;
d1681e
 
d1681e
@@ -991,7 +1000,9 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this,
d1681e
         local->transaction.unwind = afr_fsetattr_unwind;
d1681e
 
d1681e
         local->fd                 = fd_ref (fd);
d1681e
-	local->inode = inode_ref (fd->inode);
d1681e
+        ret = afr_set_inode_local (this, local, fd->inode);
d1681e
+        if (ret)
d1681e
+                goto out;
d1681e
 
d1681e
 	local->op = GF_FOP_FSETATTR;
d1681e
 
d1681e
@@ -1633,7 +1644,9 @@ afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
d1681e
         local->transaction.unwind = afr_setxattr_unwind;
d1681e
 
d1681e
         loc_copy (&local->loc, loc);
d1681e
-	local->inode = inode_ref (loc->inode);
d1681e
+        ret = afr_set_inode_local (this, local, loc->inode);
d1681e
+        if (ret)
d1681e
+                goto out;
d1681e
 
d1681e
         local->transaction.main_frame = frame;
d1681e
         local->transaction.start   = LLONG_MAX - 1;
d1681e
@@ -1745,7 +1758,9 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this,
d1681e
         local->transaction.unwind = afr_fsetxattr_unwind;
d1681e
 
d1681e
         local->fd                 = fd_ref (fd);
d1681e
-	local->inode = inode_ref (fd->inode);
d1681e
+        ret = afr_set_inode_local (this, local, fd->inode);
d1681e
+        if (ret)
d1681e
+                goto out;
d1681e
 
d1681e
 	local->op = GF_FOP_FSETXATTR;
d1681e
 
d1681e
@@ -1858,7 +1873,9 @@ afr_removexattr (call_frame_t *frame, xlator_t *this,
d1681e
         local->transaction.unwind = afr_removexattr_unwind;
d1681e
 
d1681e
         loc_copy (&local->loc, loc);
d1681e
-	local->inode = inode_ref (loc->inode);
d1681e
+        ret = afr_set_inode_local (this, local, loc->inode);
d1681e
+        if (ret)
d1681e
+                goto out;
d1681e
 
d1681e
 	local->op = GF_FOP_REMOVEXATTR;
d1681e
 
d1681e
@@ -1965,7 +1982,9 @@ afr_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
d1681e
         local->transaction.unwind = afr_fremovexattr_unwind;
d1681e
 
d1681e
         local->fd = fd_ref (fd);
d1681e
-	local->inode = inode_ref (fd->inode);
d1681e
+        ret = afr_set_inode_local (this, local, fd->inode);
d1681e
+        if (ret)
d1681e
+                goto out;
d1681e
 
d1681e
 	local->op = GF_FOP_FREMOVEXATTR;
d1681e
 
d1681e
@@ -2060,7 +2079,9 @@ afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
d1681e
         local->cont.fallocate.len = len;
d1681e
 
d1681e
         local->fd = fd_ref (fd);
d1681e
-	local->inode = inode_ref (fd->inode);
d1681e
+        ret = afr_set_inode_local (this, local, fd->inode);
d1681e
+        if (ret)
d1681e
+                goto out;
d1681e
 
d1681e
 	if (xdata)
d1681e
 		local->xdata_req = dict_copy_with_ref (xdata, NULL);
d1681e
@@ -2172,7 +2193,9 @@ afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
d1681e
         local->cont.discard.len = len;
d1681e
 
d1681e
         local->fd = fd_ref (fd);
d1681e
-	local->inode = inode_ref (fd->inode);
d1681e
+        ret = afr_set_inode_local (this, local, fd->inode);
d1681e
+        if (ret)
d1681e
+                goto out;
d1681e
 
d1681e
 	if (xdata)
d1681e
 		local->xdata_req = dict_copy_with_ref (xdata, NULL);
d1681e
@@ -2281,7 +2304,9 @@ afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
d1681e
         local->cont.zerofill.len = len;
d1681e
 
d1681e
         local->fd = fd_ref (fd);
d1681e
-	local->inode = inode_ref (fd->inode);
d1681e
+        ret = afr_set_inode_local (this, local, fd->inode);
d1681e
+        if (ret)
d1681e
+                goto out;
d1681e
 
d1681e
 	if (xdata)
d1681e
 		local->xdata_req = dict_copy_with_ref (xdata, NULL);
d1681e
@@ -2393,7 +2418,9 @@ afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
d1681e
         local->transaction.unwind = afr_xattrop_unwind;
d1681e
 
d1681e
         loc_copy (&local->loc, loc);
d1681e
-	local->inode = inode_ref (loc->inode);
d1681e
+        ret = afr_set_inode_local (this, local, loc->inode);
d1681e
+        if (ret)
d1681e
+                goto out;
d1681e
 
d1681e
 	local->op = GF_FOP_XATTROP;
d1681e
 
d1681e
@@ -2487,7 +2514,9 @@ afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
d1681e
         local->transaction.unwind = afr_fxattrop_unwind;
d1681e
 
d1681e
 	local->fd = fd_ref (fd);
d1681e
-	local->inode = inode_ref (fd->inode);
d1681e
+        ret = afr_set_inode_local (this, local, fd->inode);
d1681e
+        if (ret)
d1681e
+                goto out;
d1681e
 
d1681e
 	local->op = GF_FOP_FXATTROP;
d1681e
 
d1681e
diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c
d1681e
index c17f60f..f50c7b6 100644
d1681e
--- a/xlators/cluster/afr/src/afr-lk-common.c
d1681e
+++ b/xlators/cluster/afr/src/afr-lk-common.c
d1681e
@@ -615,14 +615,14 @@ afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
d1681e
 {
d1681e
         afr_local_t             *local          = NULL;
d1681e
         afr_internal_lock_t     *int_lock       = NULL;
d1681e
-        afr_fd_ctx_t            *fd_ctx         = NULL;
d1681e
-        afr_private_t           *priv           = NULL;
d1681e
         int                      call_count     = 0;
d1681e
         int                      ret            = 0;
d1681e
 
d1681e
         local    = frame->local;
d1681e
         int_lock = &local->internal_lock;
d1681e
-        priv = this->private;
d1681e
+
d1681e
+        if (local->transaction.type == AFR_DATA_TRANSACTION && op_ret != 1)
d1681e
+                ret = afr_write_subvol_reset (frame, this);
d1681e
 
d1681e
         LOCK (&frame->lock);
d1681e
         {
d1681e
@@ -633,11 +633,6 @@ afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
d1681e
         if (call_count == 0) {
d1681e
                 gf_msg_trace (this->name, 0,
d1681e
                               "All internal locks unlocked");
d1681e
-                if (local->fd) {
d1681e
-                        fd_ctx = afr_fd_ctx_get (local->fd, this);
d1681e
-                        if (0 == AFR_COUNT (fd_ctx->lock_acquired, priv->child_count))
d1681e
-                                ret = afr_write_subvol_reset (frame, this);
d1681e
-                }
d1681e
                 int_lock->lock_cbk (frame, this);
d1681e
         }
d1681e
 
d1681e
@@ -947,6 +942,15 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
d1681e
                         } else {
d1681e
                                 int_lock->locked_nodes[child_index] |= LOCKED_YES;
d1681e
                                 int_lock->lock_count++;
d1681e
+
d1681e
+                                if (local->transaction.type ==
d1681e
+                                    AFR_DATA_TRANSACTION) {
d1681e
+                                        LOCK(&local->inode->lock);
d1681e
+                                        {
d1681e
+                                                local->inode_ctx->lock_count++;
d1681e
+                                        }
d1681e
+                                        UNLOCK (&local->inode->lock);
d1681e
+                                }
d1681e
                         }
d1681e
                 }
d1681e
                 afr_lock_blocking (frame, this, cky + 1);
d1681e
@@ -1502,13 +1506,12 @@ int32_t
d1681e
 afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
d1681e
                              int32_t op_ret, int32_t op_errno, dict_t *xdata)
d1681e
 {
d1681e
-        afr_internal_lock_t *int_lock = NULL;
d1681e
-        afr_inodelk_t       *inodelk  = NULL;
d1681e
-        afr_local_t         *local    = NULL;
d1681e
-        int call_count  = 0;
d1681e
-        int child_index = (long) cookie;
d1681e
-        afr_fd_ctx_t        *fd_ctx = NULL;
d1681e
-
d1681e
+        afr_internal_lock_t *int_lock    = NULL;
d1681e
+        afr_inodelk_t       *inodelk     = NULL;
d1681e
+        afr_local_t         *local       = NULL;
d1681e
+        afr_fd_ctx_t        *fd_ctx      = NULL;
d1681e
+        int                  call_count  = 0;
d1681e
+        int                  child_index = (long) cookie;
d1681e
 
d1681e
         local    = frame->local;
d1681e
         int_lock = &local->internal_lock;
d1681e
@@ -1553,6 +1556,15 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
d1681e
                                         fd_ctx->lock_acquired[child_index]++;
d1681e
 				}
d1681e
 			}
d1681e
+
d1681e
+                        if (local->transaction.type == AFR_DATA_TRANSACTION &&
d1681e
+                            op_ret == 0) {
d1681e
+                                LOCK(&local->inode->lock);
d1681e
+                                {
d1681e
+                                        local->inode_ctx->lock_count++;
d1681e
+                                }
d1681e
+                                UNLOCK (&local->inode->lock);
d1681e
+                        }
d1681e
 		}
d1681e
 
d1681e
                 call_count = --int_lock->lk_call_count;
d1681e
diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h
d1681e
index 02eb206..53ffcd8 100644
d1681e
--- a/xlators/cluster/afr/src/afr-messages.h
d1681e
+++ b/xlators/cluster/afr/src/afr-messages.h
d1681e
@@ -40,7 +40,7 @@
d1681e
  */
d1681e
 
d1681e
 #define GLFS_COMP_BASE_AFR      GLFS_MSGID_COMP_AFR
d1681e
-#define GLFS_NUM_MESSAGES       42
d1681e
+#define GLFS_NUM_MESSAGES       43
d1681e
 #define GLFS_MSGID_END          (GLFS_COMP_BASE_AFR + GLFS_NUM_MESSAGES + 1)
d1681e
 
d1681e
 #define glfs_msg_start_x GLFS_COMP_BASE_AFR, "Invalid: Start of messages"
d1681e
@@ -369,5 +369,12 @@
d1681e
 */
d1681e
 #define AFR_MSG_SBRAIN_FAV_CHILD_POLICY  (GLFS_COMP_BASE_AFR + 42)
d1681e
 
d1681e
+/*!
d1681e
+ * @messageid 108043
d1681e
+ * @diagnosis
d1681e
+ * @recommendation
d1681e
+*/
d1681e
+#define AFR_MSG_INODE_CTX_GET_FAILED (GLFS_COMP_BASE_AFR + 43)
d1681e
+
d1681e
 #define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages"
d1681e
 #endif /* !_AFR_MESSAGES_H_ */
d1681e
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
d1681e
index a04636f..7e40bba 100644
d1681e
--- a/xlators/cluster/afr/src/afr-transaction.c
d1681e
+++ b/xlators/cluster/afr/src/afr-transaction.c
d1681e
@@ -372,14 +372,27 @@ afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this)
d1681e
 int
d1681e
 afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
d1681e
 {
d1681e
-        afr_local_t     *local = NULL;
d1681e
-        afr_private_t   *priv = NULL;
d1681e
-        fd_t            *fd   = NULL;
d1681e
+        afr_local_t   *local = NULL;
d1681e
+        afr_private_t *priv  = NULL;
d1681e
+        fd_t          *fd    = NULL;
d1681e
+        int           i      = 0;
d1681e
+        int           ret    = 0;
d1681e
 
d1681e
         local = frame->local;
d1681e
         priv = this->private;
d1681e
         fd    = local->fd;
d1681e
 
d1681e
+        if (local->transaction.type == AFR_DATA_TRANSACTION &&
d1681e
+            !local->transaction.inherited) {
d1681e
+                ret = afr_write_subvol_set (frame, this);
d1681e
+                if (ret) {
d1681e
+                        /* Act as if the operation failed on all subvols. */
d1681e
+                        local->op_ret = -1;
d1681e
+                        local->op_errno = -ret;
d1681e
+                        for (i = 0; i < priv->child_count; i++)
d1681e
+                                local->transaction.failed_subvols[i] = 1;
d1681e
+                }
d1681e
+        }
d1681e
         /*  Perform fops with the lk-owner from top xlator.
d1681e
          *  Eg: lk-owner of posix-lk and flush should be same,
d1681e
          *  flush cant clear the  posix-lks without that lk-owner.
d1681e
@@ -1116,32 +1129,28 @@ unlock:
d1681e
 
d1681e
 int
d1681e
 afr_changelog_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
d1681e
-		   int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
d1681e
+                   int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
d1681e
 {
d1681e
         afr_local_t *local = NULL;
d1681e
-        afr_private_t *priv = NULL;
d1681e
         int call_count = -1;
d1681e
         int child_index = -1;
d1681e
 
d1681e
         local = frame->local;
d1681e
-        priv = this->private;
d1681e
         child_index = (long) cookie;
d1681e
 
d1681e
-	if (op_ret == -1) {
d1681e
+        if (op_ret == -1) {
d1681e
                 local->op_errno = op_errno;
d1681e
-		afr_transaction_fop_failed (frame, this, child_index);
d1681e
+                afr_transaction_fop_failed (frame, this, child_index);
d1681e
         }
d1681e
 
d1681e
-        if (priv->arbiter_count == 1 && !op_ret) {
d1681e
-                if (xattr)
d1681e
-                        local->transaction.pre_op_xdata[child_index] =
d1681e
-                                                               dict_ref (xattr);
d1681e
-        }
d1681e
+        if (xattr)
d1681e
+                local->transaction.pre_op_xdata[child_index] = dict_ref (xattr);
d1681e
 
d1681e
-	call_count = afr_frame_return (frame);
d1681e
+        call_count = afr_frame_return (frame);
d1681e
 
d1681e
-        if (call_count == 0)
d1681e
-		local->transaction.changelog_resume (frame, this);
d1681e
+        if (call_count == 0) {
d1681e
+                local->transaction.changelog_resume (frame, this);
d1681e
+        }
d1681e
 
d1681e
         return 0;
d1681e
 }
d1681e
@@ -1750,10 +1759,6 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
d1681e
 	if (pre_nop)
d1681e
 		goto next;
d1681e
 
d1681e
-        ret = afr_write_subvol_set (frame, this);
d1681e
-        if (ret)
d1681e
-                goto err;
d1681e
-
d1681e
 	if (!local->pre_op_compat) {
d1681e
 		dict_copy (xdata_req, local->xdata_req);
d1681e
 		goto next;
d1681e
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
d1681e
index 0a06eb6..96fefb1 100644
d1681e
--- a/xlators/cluster/afr/src/afr.h
d1681e
+++ b/xlators/cluster/afr/src/afr.h
d1681e
@@ -377,6 +377,16 @@ typedef enum {
d1681e
         AFR_FOP_LOCK_QUORUM_FAILED,
d1681e
 } afr_fop_lock_state_t;
d1681e
 
d1681e
+typedef struct _afr_inode_ctx {
d1681e
+        uint64_t        read_subvol;
d1681e
+        uint64_t        write_subvol;
d1681e
+        int             lock_count;
d1681e
+        int             spb_choice;
d1681e
+        gf_timer_t      *timer;
d1681e
+        gf_boolean_t    need_refresh;
d1681e
+} afr_inode_ctx_t;
d1681e
+
d1681e
+
d1681e
 typedef struct _afr_local {
d1681e
 	glusterfs_fop_t  op;
d1681e
         unsigned int call_count;
d1681e
@@ -833,17 +843,10 @@ typedef struct _afr_local {
d1681e
         compound_args_t *c_args;
d1681e
 
d1681e
         gf_boolean_t is_read_txn;
d1681e
+        afr_inode_ctx_t *inode_ctx;
d1681e
 } afr_local_t;
d1681e
 
d1681e
 
d1681e
-typedef struct _afr_inode_ctx {
d1681e
-        uint64_t        read_subvol;
d1681e
-        uint64_t        write_subvol;
d1681e
-        int             spb_choice;
d1681e
-        gf_timer_t      *timer;
d1681e
-        gf_boolean_t    need_refresh;
d1681e
-} afr_inode_ctx_t;
d1681e
-
d1681e
 typedef struct afr_spbc_timeout {
d1681e
         call_frame_t *frame;
d1681e
         gf_boolean_t d_spb;
d1681e
@@ -1274,6 +1277,9 @@ afr_write_subvol_set (call_frame_t *frame, xlator_t *this);
d1681e
 int
d1681e
 afr_write_subvol_reset (call_frame_t *frame, xlator_t *this);
d1681e
 
d1681e
+int
d1681e
+afr_set_inode_local (xlator_t *this, afr_local_t *local, inode_t *inode);
d1681e
+
d1681e
 gf_boolean_t
d1681e
 afr_is_symmetric_error (call_frame_t *frame, xlator_t *this);
d1681e
 #endif /* __AFR_H__ */
d1681e
-- 
d1681e
1.8.3.1
d1681e