Blob Blame History Raw
From 30fb0e640ae94d9591e9bb64800b0971e52d5416 Mon Sep 17 00:00:00 2001
From: Pranith Kumar K <pkarampu@redhat.com>
Date: Wed, 31 Jan 2018 16:41:14 +0530
Subject: [PATCH 194/201] cluster/afr: Make AFR eager-locking similar to EC

Problem:
1) Afr's eager-lock only works for data transactions.
2) When there are conflicting writes, write with conflicting region initiates
unlock of eager-lock leading to extra pre-ops and post-ops on the file. When
eager-lock goes off, it leads to extra fsyncs for random-write workload in afr.

Solution (that is modeled after EC):
In EC, when there is a conflicting write, it waits for the current write to
complete before it winds the conflicted write. This leads to better utilization
of network and disk, because we will not be doing extra xattrops and FSYNCs and
inodelk/unlock. Moved fd based counters to inode based counters.

I tried to model the solution based on EC's locking, but it is not similar to
AFR because we had to keep backward compatibility.

Lifecycle of lock:
==================
First transaction is added to inode->owners list and an inodelk will be sent on
the wire. All the next transactions will be put in inode->waiters list until
the first transaction completes inodelk and [f]xattrop completely.  Once
[f]xattrop also completes, all the requests in the inode->waiters list are
checked if it conflict with any of the existing locks which are in
inode->owners list and if not are added to inode->owners list and resumed with
doing transaction. When these transactions complete fop phase they will be
moved to inode->post_op list and resume the transactions that were paused
because of conflicts. Post-op and unlock will not be issued on the wire until
that is the last transaction on that inode. Last transaction when it has to
perform post-op can choose to sleep for deyed-post-op-secs value. During that
time if any other transaction comes, it will wake up the sleeping transaction
and takes over the ownership of the lock and the cycle continues. If the
dealyed-post-op-secs expire, then the timer thread will wakeup the sleeping
transaction and it will set lock->release to true and starts doing post-op and
then unlock. During this time if any other transactions come, they will be put
in inode->frozen list. Once the previous unlock comes it will move the frozen
list to waiters list and moves the first element from this waiters-list to
owners-list and attempts the lock and the cycle continues. This is the general
idea.  There is logic at the time of dealying and at the time of new
transaction or in flush fop to wakeup existing sleeping transactions or
choosing whether to delay a transaction etc, which is subjected to change based
on future enhancements etc.

 >Fixes: #418
 >BUG: 1549606

Upstream-patch: https://review.gluster.org/19503
BUG: 1491785
Change-Id: I88b570bbcf332a27c82d2767dfa82472f60055dc
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/131945
Tested-by: RHGS Build Bot <nigelb@redhat.com>
---
 tests/bugs/replicate/bug-966018.t              |  36 -
 xlators/cluster/afr/src/afr-common.c           | 315 ++++-----
 xlators/cluster/afr/src/afr-inode-write.c      |   6 +-
 xlators/cluster/afr/src/afr-lk-common.c        | 348 +++-------
 xlators/cluster/afr/src/afr-self-heal-common.c |  13 +-
 xlators/cluster/afr/src/afr-self-heal-data.c   |  14 +-
 xlators/cluster/afr/src/afr-self-heal.h        |   2 +-
 xlators/cluster/afr/src/afr-transaction.c      | 913 ++++++++++++++-----------
 xlators/cluster/afr/src/afr-transaction.h      |  13 +-
 xlators/cluster/afr/src/afr.h                  |  96 ++-
 10 files changed, 813 insertions(+), 943 deletions(-)
 delete mode 100644 tests/bugs/replicate/bug-966018.t

diff --git a/tests/bugs/replicate/bug-966018.t b/tests/bugs/replicate/bug-966018.t
deleted file mode 100644
index 1b5296b..0000000
--- a/tests/bugs/replicate/bug-966018.t
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash
-
-. $(dirname $0)/../../include.rc
-. $(dirname $0)/../../volume.rc
-. $(dirname $0)/../../nfs.rc
-
-#This tests if cluster.eager-lock blocks metadata operations on nfs/fuse mounts.
-#If it is not woken up, INODELK from the next command waits
-#for post-op-delay secs.
-
-cleanup;
-TEST glusterd
-TEST pidof glusterd
-
-TEST $CLI volume create $V0 replica 2 $H0:$B0/r2_0 $H0:$B0/r2_1
-TEST $CLI volume set $V0 ensure-durability off
-TEST $CLI volume set $V0 cluster.eager-lock on
-TEST $CLI volume set $V0 cluster.post-op-delay-secs 3
-TEST $CLI volume set $V0 nfs.disable false
-
-TEST $CLI volume start $V0
-TEST $CLI volume profile $V0 start
-EXPECT_WITHIN $NFS_EXPORT_TIMEOUT "1" is_nfs_export_available;
-TEST mount_nfs $H0:/$V0 $N0 nolock;
-TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id=$V0 $M0
-echo 1 > $N0/1 && chmod +x $N0/1
-echo 1 > $M0/1 && chmod +x $M0/1
-
-#Check that INODELK MAX latency is not in the order of seconds
-#Test if the MAX INODELK fop latency is of the order of seconds.
-inodelk_max_latency=$($CLI volume profile $V0 info | grep INODELK | awk 'BEGIN {max = 0} {if ($6 > max) max=$6;} END {print max}' | cut -d. -f 1 | egrep "[0-9]{7,}")
-
-TEST [ -z $inodelk_max_latency ]
-EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $N0
-
-cleanup;
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 06863b6..6025a60 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -126,37 +126,77 @@ afr_is_possibly_under_txn (afr_transaction_type type, afr_local_t *local,
         return _gf_false;
 }
 
+static void
+afr_inode_ctx_destroy (afr_inode_ctx_t *ctx)
+{
+        int i = 0;
+
+        if (!ctx)
+                return;
+
+        for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
+                GF_FREE (ctx->pre_op_done[i]);
+        }
+
+        GF_FREE (ctx);
+}
+
 int
 __afr_inode_ctx_get (xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx)
 {
-        uint64_t                ctx_int = 0;
-        int                     ret     = -1;
-        afr_inode_ctx_t        *tmp_ctx = NULL;
+        uint64_t        ctx_int   = 0;
+        int             ret       = -1;
+        int             i         = -1;
+        int             num_locks = -1;
+        afr_inode_ctx_t *ictx     = NULL;
+        afr_lock_t      *lock     = NULL;
+        afr_private_t   *priv     = this->private;
 
         ret = __inode_ctx_get (inode, this, &ctx_int);
-        if (ret) {
-                tmp_ctx = GF_CALLOC (1, sizeof (afr_inode_ctx_t),
-                                     gf_afr_mt_inode_ctx_t);
-                if (!tmp_ctx)
-                        goto out;
+        if (ret == 0) {
+                *ctx = (afr_inode_ctx_t *)ctx_int;
+                return 0;
+        }
 
-                ctx_int = (long) tmp_ctx;
-                ret = __inode_ctx_set (inode, this, &ctx_int);
-                if (ret) {
-                        GF_FREE (tmp_ctx);
+        ictx = GF_CALLOC (1, sizeof (afr_inode_ctx_t), gf_afr_mt_inode_ctx_t);
+        if (!ictx)
+                goto out;
+
+        for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
+                ictx->pre_op_done[i] = GF_CALLOC (sizeof *ictx->pre_op_done[i],
+                                                  priv->child_count,
+                                                  gf_afr_mt_int32_t);
+                if (!ictx->pre_op_done[i]) {
+                        ret = -ENOMEM;
                         goto out;
                 }
-                tmp_ctx->spb_choice = -1;
-                tmp_ctx->read_subvol = 0;
-                tmp_ctx->write_subvol = 0;
-                tmp_ctx->lock_count = 0;
-        } else {
-                tmp_ctx = (afr_inode_ctx_t *) ctx_int;
         }
 
-        *ctx = tmp_ctx;
+        num_locks = sizeof(ictx->lock)/sizeof(afr_lock_t);
+        for (i = 0; i < num_locks; i++) {
+                lock = &ictx->lock[i];
+                INIT_LIST_HEAD (&lock->post_op);
+                INIT_LIST_HEAD (&lock->frozen);
+                INIT_LIST_HEAD (&lock->waiting);
+                INIT_LIST_HEAD (&lock->owners);
+        }
+
+        ctx_int = (uint64_t)ictx;
+        ret = __inode_ctx_set (inode, this, &ctx_int);
+        if (ret) {
+                goto out;
+        }
+
+        ictx->spb_choice = -1;
+        ictx->read_subvol = 0;
+        ictx->write_subvol = 0;
+        ictx->lock_count = 0;
         ret = 0;
+        *ctx = ictx;
 out:
+        if (ret) {
+                afr_inode_ctx_destroy (ictx);
+        }
         return ret;
 }
 
@@ -1745,10 +1785,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
 
         GF_FREE (local->internal_lock.locked_nodes);
 
-        for (i = 0; local->internal_lock.inodelk[i].domain; i++) {
-                GF_FREE (local->internal_lock.inodelk[i].locked_nodes);
-        }
-
         GF_FREE (local->internal_lock.lower_locked_nodes);
 
         afr_entry_lockee_cleanup (&local->internal_lock);
@@ -1765,7 +1801,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
                 GF_FREE (local->transaction.changelog_xdata);
         }
 
-        GF_FREE (local->transaction.eager_lock);
         GF_FREE (local->transaction.failed_subvols);
 
         GF_FREE (local->transaction.basename);
@@ -1812,16 +1847,6 @@ afr_local_replies_wipe (afr_local_t *local, afr_private_t *priv)
 	memset (local->replies, 0, sizeof(*local->replies) * priv->child_count);
 }
 
-void
-afr_remove_eager_lock_stub (afr_local_t *local)
-{
-        LOCK (&local->fd->lock);
-        {
-                list_del_init (&local->transaction.eager_locked);
-        }
-        UNLOCK (&local->fd->lock);
-}
-
 static gf_boolean_t
 afr_fop_lock_is_unlock (call_frame_t *frame)
 {
@@ -1926,10 +1951,6 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this)
 
 	syncbarrier_destroy (&local->barrier);
 
-        if (local->transaction.eager_lock_on &&
-            !list_empty (&local->transaction.eager_locked))
-                afr_remove_eager_lock_stub (local);
-
         afr_local_transaction_cleanup (local, this);
 
         priv = this->private;
@@ -3160,22 +3181,8 @@ out:
 void
 _afr_cleanup_fd_ctx (afr_fd_ctx_t *fd_ctx)
 {
-        int i = 0;
-
-
-	for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++)
-		GF_FREE (fd_ctx->pre_op_done[i]);
-
         GF_FREE (fd_ctx->opened_on);
-
-        GF_FREE (fd_ctx->lock_piggyback);
-
-        GF_FREE (fd_ctx->lock_acquired);
-
-	pthread_mutex_destroy (&fd_ctx->delay_lock);
-
         GF_FREE (fd_ctx);
-
         return;
 }
 
@@ -3193,15 +3200,7 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd)
         fd_ctx = (afr_fd_ctx_t *)(long) ctx;
 
         if (fd_ctx) {
-                /*no need to take any locks*/
-                if (!list_empty (&fd_ctx->eager_locked))
-                        gf_msg (this->name, GF_LOG_WARNING, 0,
-                                AFR_MSG_INVALID_DATA, "%s: Stale "
-                                "Eager-lock stubs found",
-                                uuid_utoa (fd->inode->gfid));
-
                 _afr_cleanup_fd_ctx (fd_ctx);
-
         }
 
 out:
@@ -3282,23 +3281,6 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd)
                 goto out;
         }
 
-        ret = pthread_mutex_init (&fd_ctx->delay_lock, NULL);
-        if (ret) {
-                GF_FREE (fd_ctx);
-                fd_ctx = NULL;
-                goto out;
-        }
-
-	for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
-		fd_ctx->pre_op_done[i] = GF_CALLOC (sizeof (*fd_ctx->pre_op_done[i]),
-						    priv->child_count,
-						    gf_afr_mt_int32_t);
-		if (!fd_ctx->pre_op_done[i]) {
-			ret = -ENOMEM;
-			goto out;
-		}
-	}
-
         fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on),
                                        priv->child_count,
                                        gf_afr_mt_int32_t);
@@ -3314,26 +3296,8 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd)
 			fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED;
 	}
 
-        fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback),
-                                            priv->child_count,
-                                            gf_afr_mt_char);
-        if (!fd_ctx->lock_piggyback) {
-                ret = -ENOMEM;
-                goto out;
-        }
-
-        fd_ctx->lock_acquired = GF_CALLOC (sizeof (*fd_ctx->lock_acquired),
-                                           priv->child_count,
-                                           gf_afr_mt_char);
-        if (!fd_ctx->lock_acquired) {
-                ret = -ENOMEM;
-                goto out;
-        }
-
 	fd_ctx->readdir_subvol = -1;
 
-        INIT_LIST_HEAD (&fd_ctx->eager_locked);
-
         ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx);
         if (ret)
                 gf_msg_debug (this->name, 0,
@@ -3405,12 +3369,70 @@ afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
         return 0;
 }
 
+afr_local_t*
+afr_wakeup_same_fd_delayed_op (xlator_t *this, afr_lock_t *lock, fd_t *fd)
+{
+        afr_local_t *local = NULL;
+
+        if (lock->delay_timer) {
+                local = list_entry(lock->post_op.next, afr_local_t,
+                                   transaction.owner_list);
+                if (fd == local->fd) {
+                        if (gf_timer_call_cancel (this->ctx,
+                                                  lock->delay_timer)) {
+                                local = NULL;
+                        } else {
+                                lock->delay_timer = NULL;
+                        }
+                } else {
+                        local = NULL;
+                }
+        }
+
+        return local;
+}
+
+void
+afr_delayed_changelog_wake_resume (xlator_t *this, inode_t *inode,
+                                   call_stub_t *stub)
+{
+        afr_inode_ctx_t *ctx = NULL;
+        afr_lock_t      *lock = NULL;
+        afr_local_t     *metadata_local = NULL;
+        afr_local_t     *data_local = NULL;
+        LOCK (&inode->lock);
+        {
+                (void)__afr_inode_ctx_get (this, inode, &ctx);
+                lock = &ctx->lock[AFR_DATA_TRANSACTION];
+                data_local = afr_wakeup_same_fd_delayed_op (this, lock,
+                                                            stub->args.fd);
+                lock = &ctx->lock[AFR_METADATA_TRANSACTION];
+                metadata_local = afr_wakeup_same_fd_delayed_op (this, lock,
+                                                                stub->args.fd);
+        }
+        UNLOCK (&inode->lock);
+
+        if (data_local) {
+                data_local->transaction.resume_stub = stub;
+        } else if (metadata_local) {
+                metadata_local->transaction.resume_stub = stub;
+        } else {
+                call_resume (stub);
+        }
+        if (data_local) {
+                afr_delayed_changelog_wake_up_cbk (data_local);
+        }
+        if (metadata_local) {
+                afr_delayed_changelog_wake_up_cbk (metadata_local);
+        }
+}
+
 int
 afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
 {
-        afr_local_t   *local = NULL;
-        call_stub_t   *stub = NULL;
-        int            op_errno   = ENOMEM;
+        afr_local_t *local   = NULL;
+        call_stub_t *stub    = NULL;
+        int         op_errno = ENOMEM;
 
 	local = AFR_FRAME_INIT (frame, op_errno);
 	if (!local)
@@ -3426,7 +3448,7 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
         if (!stub)
                 goto out;
 
-        afr_delayed_changelog_wake_resume (this, fd, stub);
+        afr_delayed_changelog_wake_resume (this, fd->inode, stub);
 
 	return 0;
 out:
@@ -3434,7 +3456,6 @@ out:
         return 0;
 }
 
-
 int
 afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
 		  int32_t op_ret, int32_t op_errno, dict_t *xdata)
@@ -4497,7 +4518,7 @@ afr_forget (xlator_t *this, inode_t *inode)
                 return 0;
 
         ctx = (afr_inode_ctx_t *)ctx_int;
-        GF_FREE (ctx);
+        afr_inode_ctx_destroy (ctx);
         return 0;
 }
 
@@ -5310,21 +5331,6 @@ out:
 }
 
 int
-afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count)
-{
-        int             ret = -ENOMEM;
-
-        lk->domain = dom;
-        lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes),
-                                      child_count, gf_afr_mt_char);
-        if (NULL == lk->locked_nodes)
-                goto out;
-        ret = 0;
-out:
-        return ret;
-}
-
-int
 afr_transaction_local_init (afr_local_t *local, xlator_t *this)
 {
         int            ret = -ENOMEM;
@@ -5335,25 +5341,9 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)
         if (ret < 0)
                 goto out;
 
-        if ((local->transaction.type == AFR_DATA_TRANSACTION) ||
-            (local->transaction.type == AFR_METADATA_TRANSACTION)) {
-                ret = afr_inodelk_init (&local->internal_lock.inodelk[0],
-                                        this->name, priv->child_count);
-                if (ret < 0)
-                        goto out;
-        }
-
         ret = -ENOMEM;
 	local->pre_op_compat = priv->pre_op_compat;
 
-        local->transaction.eager_lock =
-                GF_CALLOC (sizeof (*local->transaction.eager_lock),
-                           priv->child_count,
-                           gf_afr_mt_int32_t);
-
-        if (!local->transaction.eager_lock)
-                goto out;
-
         local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op),
                                                priv->child_count,
                                                gf_afr_mt_char);
@@ -5385,9 +5375,9 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)
         if (!local->pending)
                 goto out;
 
-	INIT_LIST_HEAD (&local->transaction.eager_locked);
-
         ret = 0;
+        INIT_LIST_HEAD (&local->transaction.wait_list);
+        INIT_LIST_HEAD (&local->transaction.owner_list);
 out:
         return ret;
 }
@@ -5422,24 +5412,6 @@ out:
         return;
 }
 
-void
-afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this)
-{
-        afr_local_t     *local = NULL;
-        afr_fd_ctx_t    *fd_ctx   = NULL;
-
-        local = frame->local;
-
-        if (!local->fd)
-		return;
-
-	fd_ctx = afr_fd_ctx_get (local->fd, this);
-	if (!fd_ctx)
-		return;
-
-	fd_ctx->open_fd_count = local->open_fd_count;
-}
-
 int**
 afr_mark_pending_changelog (afr_private_t *priv, unsigned char *pending,
                             dict_t *xattr, ia_type_t iat)
@@ -5548,7 +5520,7 @@ out:
 
 int
 afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,
-                                  inode_t *inode, gf_boolean_t *dsh,
+                                  fd_t *fd, gf_boolean_t *dsh,
                                   gf_boolean_t *pflag)
 {
         int ret = -1;
@@ -5558,8 +5530,8 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,
         unsigned char *healed_sinks = NULL;
         unsigned char *undid_pending = NULL;
         afr_private_t   *priv = NULL;
-        fd_t          *fd = NULL;
         struct afr_reply *locked_replies = NULL;
+        inode_t *inode = fd->inode;
 
         priv = this->private;
         data_lock = alloca0 (priv->child_count);
@@ -5568,18 +5540,6 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,
         healed_sinks = alloca0 (priv->child_count);
         undid_pending = alloca0 (priv->child_count);
 
-        /* Heal-info does an open() on the file being examined so that the
-         * current eager-lock holding client, if present, at some point sees
-         * open-fd count being > 1 and releases the eager-lock so that heal-info
-         * doesn't remain blocked forever until IO completes.
-         */
-        ret = afr_selfheal_data_open (this, inode, &fd);
-        if (ret < 0) {
-                gf_msg_debug (this->name, -ret, "%s: Failed to open",
-                              uuid_utoa (inode->gfid));
-                goto out;
-        }
-
         locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
 
         ret = afr_selfheal_inodelk (frame, this, inode, this->name,
@@ -5602,8 +5562,6 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,
 out:
         if (locked_replies)
                 afr_replies_wipe (locked_replies, priv->child_count);
-        if (fd)
-                fd_unref (fd);
         return ret;
 }
 
@@ -5688,6 +5646,7 @@ afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid,
 
 {
         int ret             = -1;
+        fd_t *fd            = NULL;
         gf_boolean_t    dsh = _gf_false;
         gf_boolean_t    msh = _gf_false;
         gf_boolean_t    esh = _gf_false;
@@ -5699,6 +5658,21 @@ afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid,
 
         /* For every heal type hold locks and check if it indeed needs heal */
 
+
+        /* Heal-info does an open() on the file being examined so that the
+         * current eager-lock holding client, if present, at some point sees
+         * open-fd count being > 1 and releases the eager-lock so that heal-info
+         * doesn't remain blocked forever until IO completes.
+         */
+        if ((*inode)->ia_type == IA_IFREG) {
+                ret = afr_selfheal_data_open (this, *inode, &fd);
+                if (ret < 0) {
+                        gf_msg_debug (this->name, -ret, "%s: Failed to open",
+                                      uuid_utoa ((*inode)->gfid));
+                        goto out;
+                }
+        }
+
         if (msh) {
                 ret = afr_selfheal_locked_metadata_inspect (frame, this,
                                                             *inode, &msh,
@@ -5708,7 +5682,7 @@ afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid,
         }
 
         if (dsh) {
-                ret = afr_selfheal_locked_data_inspect (frame, this, *inode,
+                ret = afr_selfheal_locked_data_inspect (frame, this, fd,
                                                         &dsh, pending);
                 if (ret == -EIO || (ret == -EAGAIN))
                         goto out;
@@ -5723,6 +5697,8 @@ out:
         *data_selfheal = dsh;
         *entry_selfheal = esh;
         *metadata_selfheal = msh;
+        if (fd)
+                fd_unref (fd);
         return ret;
 }
 
@@ -6352,6 +6328,7 @@ afr_write_subvol_reset (call_frame_t *frame, xlator_t *this)
         local = frame->local;
         LOCK(&local->inode->lock);
         {
+                GF_ASSERT (local->inode_ctx->lock_count > 0);
                 local->inode_ctx->lock_count--;
 
                 if (!local->inode_ctx->lock_count)
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index 2402bb2..b52b6ca 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -341,14 +341,14 @@ afr_process_post_writev (call_frame_t *frame, xlator_t *this)
                    the xattrs are not reliably pointing at
                    a stale file.
                 */
-                afr_fd_report_unstable_write (this, local->fd);
+                afr_fd_report_unstable_write (this, local);
 
         __afr_inode_write_finalize (frame, this);
 
         afr_writev_handle_short_writes (frame, this);
 
         if (local->update_open_fd_count)
-                afr_handle_open_fd_count (frame, this);
+                local->inode_ctx->open_fd_count = local->open_fd_count;
 
 }
 
@@ -2590,7 +2590,7 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
         local->op = GF_FOP_FSYNC;
         local->cont.fsync.datasync = datasync;
 
-	if (afr_fd_has_witnessed_unstable_write (this, fd)) {
+	if (afr_fd_has_witnessed_unstable_write (this, fd->inode)) {
 		/* don't care. we only wanted to CLEAR the bit */
 	}
 
diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c
index 260815f..be3de01 100644
--- a/xlators/cluster/afr/src/afr-lk-common.c
+++ b/xlators/cluster/afr/src/afr-lk-common.c
@@ -52,31 +52,6 @@ afr_entry_lockee_cmp (const void *l1, const void *l2)
 
 int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index);
 
-static int
-afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this);
-
-static uint64_t afr_lock_number = 1;
-
-static uint64_t
-get_afr_lock_number ()
-{
-        return (++afr_lock_number);
-}
-
-int
-afr_set_lock_number (call_frame_t *frame, xlator_t *this)
-{
-        afr_local_t         *local    = NULL;
-        afr_internal_lock_t *int_lock = NULL;
-
-        local    = frame->local;
-        int_lock = &local->internal_lock;
-
-        int_lock->lock_number = get_afr_lock_number ();
-
-        return 0;
-}
-
 void
 afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner)
 {
@@ -203,21 +178,16 @@ initialize_inodelk_variables (call_frame_t *frame, xlator_t *this)
         afr_local_t         *local    = NULL;
         afr_internal_lock_t *int_lock = NULL;
         afr_private_t       *priv     = NULL;
-        afr_inodelk_t       *inodelk  = NULL;
 
         priv     = this->private;
         local    = frame->local;
         int_lock = &local->internal_lock;
 
-        inodelk = afr_get_inodelk (int_lock, int_lock->domain);
-
-        inodelk->lock_count    = 0;
+        int_lock->lock_count    = 0;
         int_lock->lk_attempted_count = 0;
         int_lock->lock_op_ret   = -1;
         int_lock->lock_op_errno = 0;
 
-        memset (inodelk->locked_nodes, 0,
-                sizeof (*inodelk->locked_nodes) * priv->child_count);
         memset (int_lock->locked_nodes, 0,
                 sizeof (*int_lock->locked_nodes) * priv->child_count);
 
@@ -286,12 +256,7 @@ void
 afr_update_uninodelk (afr_local_t *local, afr_internal_lock_t *int_lock,
                     int32_t child_index)
 {
-        afr_inodelk_t       *inodelk = NULL;
-
-        inodelk = afr_get_inodelk (int_lock, int_lock->domain);
-        inodelk->locked_nodes[child_index] &= LOCKED_NO;
-        if (local->transaction.eager_lock)
-                local->transaction.eager_lock[child_index] = 0;
+        int_lock->locked_nodes[child_index] &= LOCKED_NO;
 
 }
 
@@ -331,35 +296,27 @@ static int
 afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
 {
         afr_internal_lock_t *int_lock = NULL;
-        afr_inodelk_t       *inodelk  = NULL;
         afr_local_t         *local    = NULL;
         afr_private_t       *priv     = NULL;
         struct gf_flock flock = {0,};
-        struct gf_flock full_flock = {0,};
-        struct gf_flock *flock_use = NULL;
         int call_count = 0;
         int i = 0;
-        int piggyback = 0;
-        afr_fd_ctx_t        *fd_ctx      = NULL;
-
 
         local    = frame->local;
         int_lock = &local->internal_lock;
         priv     = this->private;
 
-        inodelk = afr_get_inodelk (int_lock, int_lock->domain);
-
-        flock.l_start = inodelk->flock.l_start;
-        flock.l_len   = inodelk->flock.l_len;
+        flock.l_start = int_lock->flock.l_start;
+        flock.l_len   = int_lock->flock.l_len;
         flock.l_type  = F_UNLCK;
 
-        full_flock.l_type = F_UNLCK;
-        call_count = afr_locked_nodes_count (inodelk->locked_nodes,
+        call_count = afr_locked_nodes_count (int_lock->locked_nodes,
                                              priv->child_count);
 
         int_lock->lk_call_count = call_count;
 
         if (!call_count) {
+                GF_ASSERT (!local->transaction.do_eager_unlock);
                 gf_msg_trace (this->name, 0,
                               "No internal locks unlocked");
 
@@ -367,64 +324,28 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
                 goto out;
         }
 
-        if (local->fd)
-                fd_ctx = afr_fd_ctx_get (local->fd, this);
-
         for (i = 0; i < priv->child_count; i++) {
-                if ((inodelk->locked_nodes[i] & LOCKED_YES) != LOCKED_YES)
+                if ((int_lock->locked_nodes[i] & LOCKED_YES) != LOCKED_YES)
                         continue;
 
                 if (local->fd) {
-                        flock_use = &flock;
-                        if (!local->transaction.eager_lock[i]) {
-                                goto wind;
-                        }
-
-                        piggyback = 0;
-
-                        LOCK (&local->fd->lock);
-                        {
-                                if (fd_ctx->lock_piggyback[i]) {
-                                        fd_ctx->lock_piggyback[i]--;
-                                        piggyback = 1;
-                                } else {
-                                        fd_ctx->lock_acquired[i]--;
-                                }
-                        }
-                        UNLOCK (&local->fd->lock);
-
-                        if (piggyback) {
-                                afr_unlock_inodelk_cbk (frame, (void *) (long) i,
-                                                        this, 1, 0, NULL);
-                                if (!--call_count)
-                                        break;
-                                continue;
-                        }
-
-                        flock_use = &full_flock;
-                wind:
                         STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk,
                                            (void *) (long)i,
                                            priv->children[i],
                                            priv->children[i]->fops->finodelk,
                                            int_lock->domain, local->fd,
-                                           F_SETLK, flock_use, NULL);
-
-                        if (!--call_count)
-                                break;
-
+                                           F_SETLK, &flock, NULL);
                 } else {
-
                         STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk,
                                            (void *) (long)i,
                                            priv->children[i],
                                            priv->children[i]->fops->inodelk,
                                            int_lock->domain, &local->loc,
                                            F_SETLK, &flock, NULL);
-
-                        if (!--call_count)
-                                break;
                 }
+
+                if (!--call_count)
+                        break;
         }
 out:
         return 0;
@@ -512,6 +433,18 @@ out:
 
 }
 
+int32_t
+afr_unlock_now (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+
+        if (afr_is_inodelk_transaction(local->transaction.type))
+                afr_unlock_inodelk (frame, this);
+        else
+                afr_unlock_entrylk (frame, this);
+        return 0;
+}
+
 static int32_t
 afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
               int32_t op_ret, int32_t op_errno, dict_t *xdata)
@@ -553,7 +486,7 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
 
         if ((op_ret == -1) &&
             (op_errno == ENOSYS)) {
-                afr_unlock (frame, this);
+                afr_unlock_now (frame, this);
         } else {
                 if (op_ret == 0) {
                         if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
@@ -598,38 +531,6 @@ afr_blocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
         return 0;
 }
 
-static int
-afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this)
-{
-        afr_internal_lock_t *int_lock = NULL;
-        afr_inodelk_t       *inodelk  = NULL;
-        afr_local_t         *local    = NULL;
-        afr_private_t       *priv     = NULL;
-
-        priv     = this->private;
-        local    = frame->local;
-        int_lock = &local->internal_lock;
-
-        switch (local->transaction.type) {
-        case AFR_DATA_TRANSACTION:
-        case AFR_METADATA_TRANSACTION:
-                inodelk = afr_get_inodelk (int_lock, int_lock->domain);
-                memcpy (inodelk->locked_nodes, int_lock->locked_nodes,
-                        sizeof (*inodelk->locked_nodes) * priv->child_count);
-                inodelk->lock_count = int_lock->lock_count;
-                break;
-
-        case AFR_ENTRY_RENAME_TRANSACTION:
-        case AFR_ENTRY_TRANSACTION:
-                /*entrylk_count is being used in both non-blocking and blocking
-                 * modes */
-                break;
-        }
-
-        return 0;
-
-}
-
 static gf_boolean_t
 afr_is_entrylk (afr_transaction_type trans_type)
 {
@@ -733,7 +634,6 @@ int
 afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
 {
         afr_internal_lock_t *int_lock    = NULL;
-        afr_inodelk_t       *inodelk     = NULL;
         afr_local_t         *local       = NULL;
         afr_private_t       *priv        = NULL;
         struct gf_flock flock = {0,};
@@ -752,10 +652,9 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
 
 
         if (!is_entrylk) {
-                inodelk = afr_get_inodelk (int_lock, int_lock->domain);
-                flock.l_start = inodelk->flock.l_start;
-                flock.l_len   = inodelk->flock.l_len;
-                flock.l_type  = inodelk->flock.l_type;
+                flock.l_start = int_lock->flock.l_start;
+                flock.l_len   = int_lock->flock.l_len;
+                flock.l_type  = int_lock->flock.l_type;
         }
 
         if (local->fd) {
@@ -770,9 +669,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
                         local->op_ret           = -1;
                         int_lock->lock_op_ret   = -1;
 
-                        afr_copy_locked_nodes (frame, this);
-
-                        afr_unlock (frame, this);
+                        afr_unlock_now (frame, this);
 
                         return 0;
                 }
@@ -784,9 +681,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
                         local->op_ret           = -1;
                         int_lock->lock_op_ret   = -1;
 
-                        afr_copy_locked_nodes (frame, this);
-
-                        afr_unlock(frame, this);
+                        afr_unlock_now(frame, this);
 
                         return 0;
                 }
@@ -798,8 +693,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
                 gf_msg_debug (this->name, 0,
                               "we're done locking");
 
-                afr_copy_locked_nodes (frame, this);
-
                 int_lock->lock_op_ret = 0;
                 int_lock->lock_cbk (frame, this);
                 return 0;
@@ -815,7 +708,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
         case AFR_METADATA_TRANSACTION:
 
                 if (local->fd) {
-
                         STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk,
                                            (void *) (long) child_index,
                                            priv->children[child_index],
@@ -824,7 +716,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
                                            F_SETLKW, &flock, NULL);
 
                 } else {
-
                         STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk,
                                            (void *) (long) child_index,
                                            priv->children[child_index],
@@ -841,7 +732,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
                  *and 'fd-less' children */
 
                 if (local->fd) {
-
                         STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk,
                                            (void *) (long) cookie,
                                            priv->children[child_index],
@@ -850,7 +740,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
                                            int_lock->lockee[lockee_no].basename,
                                            ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
                 } else {
-
                         STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk,
                                            (void *) (long) cookie,
                                            priv->children[child_index],
@@ -922,7 +811,6 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
         local    = frame->local;
         int_lock = &local->internal_lock;
 
-
 	LOCK (&frame->lock);
 	{
 		if (op_ret < 0 ) {
@@ -969,7 +857,7 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                                       "with blocking calls",
                                       int_lock->lock_count);
 
-                        afr_unlock(frame, this);
+                        afr_unlock_now(frame, this);
                 }
         }
 
@@ -1009,7 +897,7 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
                         local->op_errno         = EINVAL;
                         int_lock->lock_op_errno = EINVAL;
 
-			afr_unlock (frame, this);
+			afr_unlock_now (frame, this);
                         return -1;
                 }
 
@@ -1021,7 +909,7 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
                         gf_msg (this->name, GF_LOG_INFO, 0,
                                 AFR_MSG_INFO_COMMON,
                                 "fd not open on any subvolumes. aborting.");
-                        afr_unlock (frame, this);
+                        afr_unlock_now (frame, this);
                         goto out;
                 }
 
@@ -1031,7 +919,6 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
                         index = i%copies;
                         lockee_no = i/copies;
                         if (local->child_up[index]) {
-
                                 STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk,
                                                    (void *) (long) i,
                                                    priv->children[index],
@@ -1053,7 +940,6 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
                         index = i%copies;
                         lockee_no = i/copies;
                         if (local->child_up[index]) {
-
                                 STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk,
                                                    (void *) (long) i,
                                                    priv->children[index],
@@ -1077,18 +963,12 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                              int32_t op_ret, int32_t op_errno, dict_t *xdata)
 {
         afr_internal_lock_t *int_lock    = NULL;
-        afr_inodelk_t       *inodelk     = NULL;
         afr_local_t         *local       = NULL;
-        afr_fd_ctx_t        *fd_ctx      = NULL;
         int                  call_count  = 0;
         int                  child_index = (long) cookie;
 
         local    = frame->local;
         int_lock = &local->internal_lock;
-        inodelk = afr_get_inodelk (int_lock, int_lock->domain);
-
-	if (local->fd)
-		fd_ctx = afr_fd_ctx_get (local->fd, this);
 
         LOCK (&frame->lock);
         {
@@ -1105,43 +985,27 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
 				int_lock->lock_op_errno      = op_errno;
 				local->op_errno              = op_errno;
 			}
-			if (local->transaction.eager_lock)
-				local->transaction.eager_lock[child_index] = 0;
 		} else {
-			inodelk->locked_nodes[child_index] |= LOCKED_YES;
-			inodelk->lock_count++;
-
-			if (local->transaction.eager_lock &&
-			    local->transaction.eager_lock[child_index] &&
-			    local->fd) {
-				/* piggybacked */
-				if (op_ret == 1) {
-					/* piggybacked */
-				} else if (op_ret == 0) {
-					/* lock acquired from server */
-                                        fd_ctx->lock_acquired[child_index]++;
-				}
-			}
-
-                        if (local->transaction.type == AFR_DATA_TRANSACTION &&
-                            op_ret == 0) {
-                                LOCK(&local->inode->lock);
-                                {
-                                        local->inode_ctx->lock_count++;
-                                }
-                                UNLOCK (&local->inode->lock);
-                        }
+			int_lock->locked_nodes[child_index] |= LOCKED_YES;
+			int_lock->lock_count++;
 		}
 
                 call_count = --int_lock->lk_call_count;
         }
         UNLOCK (&frame->lock);
 
+        if (op_ret == 0 && local->transaction.type == AFR_DATA_TRANSACTION) {
+                LOCK (&local->inode->lock);
+                {
+                        local->inode_ctx->lock_count++;
+                }
+                UNLOCK (&local->inode->lock);
+        }
         if (call_count == 0) {
                 gf_msg_trace (this->name, 0,
                               "Last inode locking reply received");
                 /* all locks successful. Proceed to call FOP */
-                if (inodelk->lock_count == int_lock->lk_expected_count) {
+                if (int_lock->lock_count == int_lock->lk_expected_count) {
                         gf_msg_trace (this->name, 0,
                                       "All servers locked. Calling the cbk");
                         int_lock->lock_op_ret = 0;
@@ -1155,7 +1019,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                                       "Trying again with blocking calls",
                                       int_lock->lock_count);
 
-                        afr_unlock(frame, this);
+                        afr_unlock_now(frame, this);
                 }
         }
 
@@ -1166,30 +1030,17 @@ int
 afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
 {
         afr_internal_lock_t *int_lock = NULL;
-        afr_inodelk_t       *inodelk  = NULL;
         afr_local_t         *local    = NULL;
         afr_private_t       *priv     = NULL;
         afr_fd_ctx_t        *fd_ctx   = NULL;
         int32_t             call_count = 0;
         int                 i          = 0;
         int                 ret        = 0;
-        struct              gf_flock flock = {0,};
-        struct              gf_flock full_flock = {0,};
-        struct              gf_flock *flock_use = NULL;
-        int                 piggyback = 0;
 
         local    = frame->local;
         int_lock = &local->internal_lock;
         priv     = this->private;
 
-        inodelk = afr_get_inodelk (int_lock, int_lock->domain);
-
-        flock.l_start = inodelk->flock.l_start;
-        flock.l_len   = inodelk->flock.l_len;
-        flock.l_type  = inodelk->flock.l_type;
-
-        full_flock.l_type = inodelk->flock.l_type;
-
         initialize_inodelk_variables (frame, this);
 
         if (local->fd) {
@@ -1205,88 +1056,48 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
                         local->op_errno         = EINVAL;
                         int_lock->lock_op_errno = EINVAL;
 
-			afr_unlock (frame, this);
+			afr_unlock_now (frame, this);
                         ret = -1;
                         goto out;
                 }
+        }
 
-                call_count = internal_lock_count (frame, this);
-                int_lock->lk_call_count = call_count;
-                int_lock->lk_expected_count = call_count;
-
-                if (!call_count) {
-                        gf_msg (this->name, GF_LOG_INFO, 0,
-                                AFR_MSG_SUBVOLS_DOWN,
-                                "All bricks are down, aborting.");
-                        afr_unlock (frame, this);
-                        goto out;
-                }
-
-                /* Send non-blocking inodelk calls only on up children
-                   and where the fd has been opened */
-                for (i = 0; i < priv->child_count; i++) {
-                        if (!local->child_up[i])
-                                continue;
-
-                        flock_use = &flock;
-                        if (!local->transaction.eager_lock_on) {
-                                goto wind;
-                        }
-
-                        piggyback = 0;
-                        local->transaction.eager_lock[i] = 1;
-
-			afr_set_delayed_post_op (frame, this);
+        call_count = internal_lock_count (frame, this);
+        int_lock->lk_call_count = call_count;
+        int_lock->lk_expected_count = call_count;
 
-                        LOCK (&local->fd->lock);
-                        {
-                                if (fd_ctx->lock_acquired[i]) {
-                                        fd_ctx->lock_piggyback[i]++;
-                                        piggyback = 1;
-                                }
-                        }
-                        UNLOCK (&local->fd->lock);
+        if (!call_count) {
+                gf_msg (this->name, GF_LOG_INFO, 0,
+                        AFR_MSG_SUBVOLS_DOWN,
+                        "All bricks are down, aborting.");
+                afr_unlock_now (frame, this);
+                goto out;
+        }
 
-                        if (piggyback) {
-                                /* (op_ret == 1) => indicate piggybacked lock */
-                                afr_nonblocking_inodelk_cbk (frame, (void *) (long) i,
-                                                             this, 1, 0, NULL);
-                                if (!--call_count)
-                                        break;
-                                continue;
-                        }
-                        flock_use = &full_flock;
-                wind:
+        /* Send non-blocking inodelk calls only on up children
+           and where the fd has been opened */
+        for (i = 0; i < priv->child_count; i++) {
+                if (!local->child_up[i])
+                        continue;
 
+                if (local->fd) {
                         STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk,
                                            (void *) (long) i,
                                            priv->children[i],
                                            priv->children[i]->fops->finodelk,
                                            int_lock->domain, local->fd,
-                                           F_SETLK, flock_use, NULL);
-
-                        if (!--call_count)
-                                break;
-                }
-        } else {
-                call_count = internal_lock_count (frame, this);
-                int_lock->lk_call_count = call_count;
-                int_lock->lk_expected_count = call_count;
-
-                for (i = 0; i < priv->child_count; i++) {
-                        if (!local->child_up[i])
-                                continue;
+                                           F_SETLK, &int_lock->flock, NULL);
+                } else {
 
                         STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk,
                                            (void *) (long) i,
                                            priv->children[i],
                                            priv->children[i]->fops->inodelk,
                                            int_lock->domain, &local->loc,
-                                           F_SETLK, &flock, NULL);
-
-                        if (!--call_count)
-                                break;
+                                           F_SETLK, &int_lock->flock, NULL);
                 }
+                if (!--call_count)
+                        break;
         }
 out:
         return ret;
@@ -1296,13 +1107,32 @@ int32_t
 afr_unlock (call_frame_t *frame, xlator_t *this)
 {
         afr_local_t *local = NULL;
+        afr_lock_t  *lock  = NULL;
 
         local = frame->local;
 
-        if (afr_is_inodelk_transaction(local->transaction.type))
-                afr_unlock_inodelk (frame, this);
-        else
-                afr_unlock_entrylk (frame, this);
+        if (!local->transaction.eager_lock_on)
+                goto out;
+        lock = &local->inode_ctx->lock[local->transaction.type];
+        LOCK (&local->inode->lock);
+        {
+                list_del_init (&local->transaction.owner_list);
+                if (list_empty (&lock->owners) && list_empty (&lock->post_op)) {
+                        local->transaction.do_eager_unlock = _gf_true;
+        /*TODO: Need to get metadata use on_disk and inherit/uninherit
+         *GF_ASSERT (!local->inode_ctx->on_disk[local->transaction.type]);
+         *GF_ASSERT (!local->inode_ctx->inherited[local->transaction.type]);
+        */
+                        GF_ASSERT (lock->release);
+                }
+        }
+        UNLOCK (&local->inode->lock);
+        if (!local->transaction.do_eager_unlock) {
+                local->internal_lock.lock_cbk (frame, this);
+                return 0;
+        }
 
+out:
+        afr_unlock_now (frame, this);
         return 0;
 }
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index f61b237..32fd24a 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -2463,6 +2463,7 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid)
         int           data_ret          = 1;
         int           or_ret            = 0;
         inode_t      *inode             = NULL;
+        fd_t         *fd                = NULL;
 	gf_boolean_t  data_selfheal     = _gf_false;
 	gf_boolean_t  metadata_selfheal = _gf_false;
 	gf_boolean_t  entry_selfheal    = _gf_false;
@@ -2487,8 +2488,16 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid)
                 goto out;
         }
 
+        if (inode->ia_type == IA_IFREG) {
+                ret = afr_selfheal_data_open (this, inode, &fd);
+                if (!fd) {
+                        ret = -EIO;
+                        goto out;
+                }
+        }
+
 	if (data_selfheal && dataheal_enabled)
-                data_ret = afr_selfheal_data (frame, this, inode);
+                data_ret = afr_selfheal_data (frame, this, fd);
 
 	if (metadata_selfheal && priv->metadata_self_heal)
                 metadata_ret = afr_selfheal_metadata (frame, this, inode);
@@ -2510,6 +2519,8 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid)
 out:
         if (inode)
                 inode_unref (inode);
+        if (fd)
+                fd_unref (fd);
         return ret;
 }
 /*
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index bcd0dec..f872a98 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -856,22 +856,15 @@ out:
 }
 
 int
-afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode)
+afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd)
 {
 	afr_private_t *priv = NULL;
 	unsigned char *locked_on = NULL;
 	int ret = 0;
-	fd_t *fd = NULL;
+        inode_t *inode = fd->inode;
 
 	priv = this->private;
 
-	ret = afr_selfheal_data_open (this, inode, &fd);
-	if (!fd) {
-                gf_msg_debug (this->name, -ret, "%s: Failed to open",
-                              uuid_utoa (inode->gfid));
-                return -EIO;
-        }
-
 	locked_on = alloca0 (priv->child_count);
 
 	ret = afr_selfheal_tie_breaker_inodelk (frame, this, inode,
@@ -898,8 +891,5 @@ unlock:
 	afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0,
 	                        locked_on);
 
-	if (fd)
-		fd_unref (fd);
-
 	return ret;
 }
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 188a334..b015976 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -102,7 +102,7 @@ afr_selfheal_name (xlator_t *this, uuid_t gfid, const char *name,
                    void *gfid_req, dict_t *xdata);
 
 int
-afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode);
+afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd);
 
 int
 afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode);
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index acbfe1a..993029d 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -25,6 +25,18 @@ typedef enum {
         AFR_TRANSACTION_POST_OP,
 } afr_xattrop_type_t;
 
+static void
+afr_lock_resume_shared (struct list_head *list);
+
+void
+__afr_transaction_wake_shared (afr_local_t *local, struct list_head *shared);
+
+void
+afr_changelog_post_op (call_frame_t *frame, xlator_t *this);
+
+int
+afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this);
+
 gf_boolean_t
 afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this);
 
@@ -168,13 +180,14 @@ afr_transaction_fop (call_frame_t *frame, xlator_t *this)
         return 0;
 }
 
-
 int
 afr_transaction_done (call_frame_t *frame, xlator_t *this)
 {
-        afr_local_t *local = NULL;
-        afr_private_t *priv = NULL;
-        gf_boolean_t unwind = _gf_false;
+        afr_local_t   *local      = NULL;
+        afr_private_t *priv       = NULL;
+        gf_boolean_t  unwind      = _gf_false;
+        afr_lock_t    *lock       = NULL;
+        afr_local_t   *lock_local = NULL;
 
         priv  = this->private;
         local = frame->local;
@@ -188,6 +201,31 @@ afr_transaction_done (call_frame_t *frame, xlator_t *this)
                 if (unwind)/*It definitely did post-op*/
                         afr_zero_fill_stat (local);
         }
+
+        if (local->transaction.do_eager_unlock) {
+                lock = &local->inode_ctx->lock[local->transaction.type];
+                LOCK (&local->inode->lock);
+                {
+                        lock->acquired = _gf_false;
+                        lock->release = _gf_false;
+                        list_splice_init (&lock->frozen,
+                                          &lock->waiting);
+                        if (list_empty (&lock->waiting))
+                                goto unlock;
+                        lock_local = list_entry (lock->waiting.next,
+                                                 afr_local_t,
+                                                transaction.wait_list);
+                        list_del_init (&lock_local->transaction.wait_list);
+                        list_add (&lock_local->transaction.owner_list,
+                                  &lock->owners);
+                }
+unlock:
+                UNLOCK (&local->inode->lock);
+        }
+        if (lock_local) {
+                afr_lock (lock_local->transaction.frame,
+                          lock_local->transaction.frame->this);
+        }
         local->transaction.unwind (frame, this);
 
         AFR_STACK_DESTROY (frame);
@@ -195,6 +233,52 @@ afr_transaction_done (call_frame_t *frame, xlator_t *this)
         return 0;
 }
 
+static void
+afr_lock_fail_shared (afr_local_t *local, struct list_head *list)
+{
+        afr_local_t *each = NULL;
+
+        while (!list_empty(list)) {
+                each = list_entry (list->next, afr_local_t,
+                                   transaction.wait_list);
+                list_del_init(&each->transaction.wait_list);
+                each->op_ret = -1;
+                each->op_errno = local->op_errno;
+                afr_transaction_done (each->transaction.frame,
+                                      each->transaction.frame->this);
+        }
+}
+
+static void
+afr_handle_lock_acquire_failure (afr_local_t *local, gf_boolean_t locked)
+{
+        struct list_head shared;
+        afr_lock_t *lock = NULL;
+
+        if (!local->transaction.eager_lock_on)
+                goto out;
+
+        lock = &local->inode_ctx->lock[local->transaction.type];
+
+        INIT_LIST_HEAD (&shared);
+        LOCK (&local->inode->lock);
+        {
+                list_splice_init (&lock->waiting, &shared);
+        }
+        UNLOCK (&local->inode->lock);
+
+        afr_lock_fail_shared (local, &shared);
+        local->transaction.do_eager_unlock = _gf_true;
+out:
+        if (locked) {
+                local->internal_lock.lock_cbk = afr_transaction_done;
+                afr_unlock (local->transaction.frame,
+                            local->transaction.frame->this);
+        } else {
+                afr_transaction_done (local->transaction.frame,
+                                      local->transaction.frame->this);
+        }
+}
 
 call_frame_t*
 afr_transaction_detach_fop_frame (call_frame_t *frame)
@@ -334,6 +418,7 @@ afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this)
         afr_local_t *local = NULL;
         afr_private_t *priv = NULL;
         int pre_op_sources_count = 0;
+        int i = 0;
 
         priv = this->private;
         local = frame->local;
@@ -345,11 +430,11 @@ afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this)
         /* If arbiter is the only source, do not proceed. */
         if (pre_op_sources_count < 2 &&
             local->transaction.pre_op_sources[ARBITER_BRICK_INDEX]) {
-                local->internal_lock.lock_cbk = afr_transaction_done;
                 local->op_ret = -1;
                 local->op_errno =  ENOTCONN;
-                afr_restore_lk_owner (frame);
-                afr_unlock (frame, this);
+                for (i = 0; i < priv->child_count; i++)
+                        local->transaction.failed_subvols[i] = 1;
+                afr_changelog_post_op (frame, this);/*uninherit should happen*/
         } else {
                 afr_transaction_fop (frame, this);
         }
@@ -362,14 +447,16 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
 {
         afr_local_t   *local = NULL;
         afr_private_t *priv  = NULL;
-        fd_t          *fd    = NULL;
         int           i      = 0;
         int           ret    = 0;
+        int     failure_count = 0;
+        struct list_head shared;
+        afr_lock_t *lock = NULL;
 
         local = frame->local;
         priv = this->private;
-        fd    = local->fd;
 
+        INIT_LIST_HEAD (&shared);
         if (local->transaction.type == AFR_DATA_TRANSACTION &&
             !local->transaction.inherited) {
                 ret = afr_write_subvol_set (frame, this);
@@ -394,22 +481,31 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
 		   just now, before OP */
 		afr_changelog_pre_op_update (frame, this);
 
-        /* The wake up needs to happen independent of
-           what type of fop arrives here. If it was
-           a write, then it has already inherited the
-           lock and changelog. If it was not a write,
-           then the presumption of the optimization (of
-           optimizing for successive write operations)
-           fails.
-        */
-        if (fd)
-                afr_delayed_changelog_wake_up (this, fd);
+        if (!local->transaction.eager_lock_on ||
+            local->transaction.inherited)
+                goto fop;
+        failure_count = AFR_COUNT (local->transaction.failed_subvols,
+                                   priv->child_count);
+        if (failure_count == priv->child_count) {
+                afr_handle_lock_acquire_failure (local, _gf_true);
+        } else {
+                lock = &local->inode_ctx->lock[local->transaction.type];
+                LOCK (&local->inode->lock);
+                {
+                        lock->acquired = _gf_true;
+                        __afr_transaction_wake_shared (local, &shared);
+                }
+                UNLOCK (&local->inode->lock);
+        }
+
+fop:
         if (priv->arbiter_count == 1) {
                 afr_txn_arbitrate_fop (frame, this);
         } else {
                 afr_transaction_fop (frame, this);
         }
 
+        afr_lock_resume_shared (&shared);
 	return 0;
 }
 
@@ -486,30 +582,14 @@ afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this)
 }
 
 
-afr_inodelk_t*
-afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom)
-{
-        afr_inodelk_t *inodelk = NULL;
-        int           i = 0;
-
-        for (i = 0; int_lock->inodelk[i].domain; i++) {
-                inodelk = &int_lock->inodelk[i];
-                if (strcmp (dom, inodelk->domain) == 0)
-                        return inodelk;
-        }
-        return NULL;
-}
-
 unsigned char*
 afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock)
 {
         unsigned char *locked_nodes = NULL;
-        afr_inodelk_t *inodelk = NULL;
         switch (type) {
         case AFR_DATA_TRANSACTION:
         case AFR_METADATA_TRANSACTION:
-                inodelk = afr_get_inodelk (int_lock, int_lock->domain);
-                locked_nodes = inodelk->locked_nodes;
+                locked_nodes = int_lock->locked_nodes;
         break;
 
         case AFR_ENTRY_TRANSACTION:
@@ -834,27 +914,19 @@ afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this)
 {
 	afr_local_t *local = NULL;
 	afr_private_t *priv = NULL;
-	fd_t *fd = NULL;
+        afr_inode_ctx_t *ctx = NULL;
 	int i = 0;
 	gf_boolean_t ret = _gf_false;
-	afr_fd_ctx_t *fd_ctx = NULL;
 	int type = 0;
 
 	local = frame->local;
 	priv = this->private;
-	fd = local->fd;
+        ctx = local->inode_ctx;
 
 	type = afr_index_for_transaction_type (local->transaction.type);
 	if (type != AFR_DATA_TRANSACTION)
 		return !local->transaction.dirtied;
 
-	if (!fd)
-		return !local->transaction.dirtied;
-
-	fd_ctx = afr_fd_ctx_get (fd, this);
-	if (!fd_ctx)
-		return _gf_false;
-
 	if (local->transaction.no_uninherit)
 		return _gf_false;
 
@@ -868,34 +940,34 @@ afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this)
 	if (local->transaction.uninherit_done)
 		return local->transaction.uninherit_value;
 
-	LOCK(&fd->lock);
+	LOCK(&local->inode->lock);
 	{
 		for (i = 0; i < priv->child_count; i++) {
 			if (local->transaction.pre_op[i] !=
-			    fd_ctx->pre_op_done[type][i]) {
+			    ctx->pre_op_done[type][i]) {
 				ret = !local->transaction.dirtied;
 				goto unlock;
 			}
 		}
 
-		if (fd_ctx->inherited[type]) {
+		if (ctx->inherited[type]) {
 			ret = _gf_true;
-			fd_ctx->inherited[type]--;
-		} else if (fd_ctx->on_disk[type]) {
+			ctx->inherited[type]--;
+		} else if (ctx->on_disk[type]) {
 			ret = _gf_false;
-			fd_ctx->on_disk[type]--;
+			ctx->on_disk[type]--;
 		} else {
 			/* ASSERT */
 			ret = _gf_false;
 		}
 
-		if (!fd_ctx->inherited[type] && !fd_ctx->on_disk[type]) {
+		if (!ctx->inherited[type] && !ctx->on_disk[type]) {
 			for (i = 0; i < priv->child_count; i++)
-				fd_ctx->pre_op_done[type][i] = 0;
+				ctx->pre_op_done[type][i] = 0;
 		}
 	}
 unlock:
-	UNLOCK(&fd->lock);
+	UNLOCK(&local->inode->lock);
 
 	local->transaction.uninherit_done = _gf_true;
 	local->transaction.uninherit_value = ret;
@@ -909,31 +981,21 @@ afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this)
 {
 	afr_local_t *local = NULL;
 	afr_private_t *priv = NULL;
-	fd_t *fd = NULL;
 	int i = 0;
 	gf_boolean_t ret = _gf_false;
-	afr_fd_ctx_t *fd_ctx = NULL;
 	int type = 0;
 
 	local = frame->local;
 	priv = this->private;
-	fd = local->fd;
 
 	if (local->transaction.type != AFR_DATA_TRANSACTION)
 		return _gf_false;
 
 	type = afr_index_for_transaction_type (local->transaction.type);
 
-	if (!fd)
-		return _gf_false;
-
-	fd_ctx = afr_fd_ctx_get (fd, this);
-	if (!fd_ctx)
-		return _gf_false;
-
-	LOCK(&fd->lock);
+	LOCK(&local->inode->lock);
 	{
-		if (!fd_ctx->on_disk[type]) {
+		if (!local->inode_ctx->on_disk[type]) {
 			/* nothing to inherit yet */
 			ret = _gf_false;
 			goto unlock;
@@ -941,21 +1003,21 @@ afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this)
 
 		for (i = 0; i < priv->child_count; i++) {
 			if (local->transaction.pre_op[i] !=
-			    fd_ctx->pre_op_done[type][i]) {
+			    local->inode_ctx->pre_op_done[type][i]) {
 				/* either inherit exactly, or don't */
 				ret = _gf_false;
 				goto unlock;
 			}
 		}
 
-		fd_ctx->inherited[type]++;
+		local->inode_ctx->inherited[type]++;
 
 		ret = _gf_true;
 
 		local->transaction.inherited = _gf_true;
 	}
 unlock:
-	UNLOCK(&fd->lock);
+	UNLOCK(&local->inode->lock);
 
 	return ret;
 }
@@ -966,22 +1028,16 @@ afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this)
 {
 	afr_local_t *local = NULL;
 	afr_private_t *priv = NULL;
-	fd_t *fd = NULL;
-	afr_fd_ctx_t *fd_ctx = NULL;
 	int i = 0;
 	gf_boolean_t ret = _gf_false;
 	int type = 0;
 
 	local = frame->local;
 	priv = this->private;
-	fd = local->fd;
 
-	if (!fd)
-		return _gf_false;
-
-	fd_ctx = afr_fd_ctx_get (fd, this);
-	if (!fd_ctx)
-		return _gf_false;
+        if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
+            local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION)
+                return _gf_false;
 
 	if (local->transaction.inherited)
 		/* was already inherited in afr_changelog_pre_op */
@@ -997,26 +1053,26 @@ afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this)
 
 	ret = _gf_false;
 
-	LOCK(&fd->lock);
+	LOCK(&local->inode->lock);
 	{
-		if (!fd_ctx->on_disk[type]) {
+		if (!local->inode_ctx->on_disk[type]) {
 			for (i = 0; i < priv->child_count; i++)
-				fd_ctx->pre_op_done[type][i] =
+				local->inode_ctx->pre_op_done[type][i] =
                                         (!local->transaction.failed_subvols[i]);
 		} else {
 			for (i = 0; i < priv->child_count; i++)
-				if (fd_ctx->pre_op_done[type][i] !=
+				if (local->inode_ctx->pre_op_done[type][i] !=
 				    (!local->transaction.failed_subvols[i])) {
 					local->transaction.no_uninherit = 1;
 					goto unlock;
 				}
 		}
-		fd_ctx->on_disk[type]++;
+		local->inode_ctx->on_disk[type]++;
 
 		ret = _gf_true;
 	}
 unlock:
-	UNLOCK(&fd->lock);
+	UNLOCK(&local->inode->lock);
 
 	return ret;
 }
@@ -1324,6 +1380,9 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
 
         afr_init_optimistic_changelog_for_txn (this, local);
 
+        if (afr_changelog_pre_op_inherit (frame, this))
+                goto next;
+
         /* This condition should not be met with present code, as
          * transaction.done will be called if locks are not acquired on even a
          * single node.
@@ -1349,9 +1408,6 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
 		goto err;
 	}
 
-	if (afr_changelog_pre_op_inherit (frame, this))
-		goto next;
-
         if (call_count < priv->child_count)
                 pre_nop = _gf_false;
 
@@ -1408,7 +1464,7 @@ err:
 	local->op_ret = -1;
 	local->op_errno = op_errno;
 
-	afr_unlock (frame, this);
+        afr_handle_lock_acquire_failure (local, _gf_true);
 
 	if (xdata_req)
 		dict_unref (xdata_req);
@@ -1418,31 +1474,6 @@ err:
 
 
 int
-afr_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
-{
-        afr_internal_lock_t *int_lock = NULL;
-        afr_local_t         *local    = NULL;
-
-        local    = frame->local;
-        int_lock = &local->internal_lock;
-
-        if (int_lock->lock_op_ret < 0) {
-                gf_msg (this->name, GF_LOG_INFO,
-                        0, AFR_MSG_BLOCKING_LKS_FAILED,
-                        "Blocking inodelks failed.");
-                afr_transaction_done (frame, this);
-        } else {
-
-                gf_msg_debug (this->name, 0,
-                              "Blocking inodelks done. Proceeding to FOP");
-                afr_internal_lock_finish (frame, this);
-        }
-
-        return 0;
-}
-
-
-int
 afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
 {
         afr_internal_lock_t *int_lock = NULL;
@@ -1455,7 +1486,7 @@ afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
         if (int_lock->lock_op_ret < 0) {
                 gf_msg_debug (this->name, 0,
                               "Non blocking inodelks failed. Proceeding to blocking");
-                int_lock->lock_cbk = afr_post_blocking_inodelk_cbk;
+                int_lock->lock_cbk = afr_internal_lock_finish;
                 afr_blocking_lock (frame, this);
         } else {
 
@@ -1469,31 +1500,6 @@ afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
 
 
 int
-afr_post_blocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
-{
-        afr_internal_lock_t *int_lock = NULL;
-        afr_local_t         *local    = NULL;
-
-        local    = frame->local;
-        int_lock = &local->internal_lock;
-
-        if (int_lock->lock_op_ret < 0) {
-                gf_msg (this->name, GF_LOG_INFO, 0,
-                        AFR_MSG_BLOCKING_LKS_FAILED,
-                        "Blocking entrylks failed.");
-                afr_transaction_done (frame, this);
-        } else {
-
-                gf_msg_debug (this->name, 0,
-                             "Blocking entrylks done. Proceeding to FOP");
-                afr_internal_lock_finish (frame, this);
-        }
-
-        return 0;
-}
-
-
-int
 afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
 {
         afr_internal_lock_t *int_lock = NULL;
@@ -1506,7 +1512,7 @@ afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
         if (int_lock->lock_op_ret < 0) {
                 gf_msg_debug (this->name, 0,
                               "Non blocking entrylks failed. Proceeding to blocking");
-                int_lock->lock_cbk = afr_post_blocking_entrylk_cbk;
+                int_lock->lock_cbk = afr_internal_lock_finish;
                 afr_blocking_lock (frame, this);
         } else {
 
@@ -1567,29 +1573,28 @@ int
 afr_set_transaction_flock (xlator_t *this, afr_local_t *local)
 {
         afr_internal_lock_t *int_lock = NULL;
-        afr_inodelk_t       *inodelk  = NULL;
         afr_private_t       *priv     = NULL;
 
         int_lock = &local->internal_lock;
-        inodelk = afr_get_inodelk (int_lock, int_lock->domain);
         priv = this->private;
 
-        if ((priv->arbiter_count || priv->full_lock) &&
+        if ((priv->arbiter_count || local->transaction.eager_lock_on ||
+             priv->full_lock) &&
             local->transaction.type == AFR_DATA_TRANSACTION) {
                 /*Lock entire file to avoid network split brains.*/
-                inodelk->flock.l_len   = 0;
-                inodelk->flock.l_start = 0;
+                int_lock->flock.l_len   = 0;
+                int_lock->flock.l_start = 0;
         } else {
-                inodelk->flock.l_len   = local->transaction.len;
-                inodelk->flock.l_start = local->transaction.start;
+                int_lock->flock.l_len   = local->transaction.len;
+                int_lock->flock.l_start = local->transaction.start;
         }
-        inodelk->flock.l_type  = F_WRLCK;
+        int_lock->flock.l_type  = F_WRLCK;
 
         return 0;
 }
 
 int
-afr_lock_rec (call_frame_t *frame, xlator_t *this)
+afr_lock (call_frame_t *frame, xlator_t *this)
 {
         afr_internal_lock_t *int_lock = NULL;
         afr_local_t         *local    = NULL;
@@ -1630,74 +1635,153 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this)
         return 0;
 }
 
+static gf_boolean_t
+afr_locals_overlap (afr_local_t *local1, afr_local_t *local2)
+{
+        uint64_t start1 = local1->transaction.start;
+        uint64_t start2 = local2->transaction.start;
+        uint64_t end1 = 0;
+        uint64_t end2 = 0;
+
+        if (local1->transaction.len)
+                end1 = start1 + local1->transaction.len - 1;
+        else
+                end1 = ULLONG_MAX;
+
+        if (local2->transaction.len)
+                end2 = start2 + local2->transaction.len - 1;
+        else
+                end2 = ULLONG_MAX;
 
-int
-afr_lock (call_frame_t *frame, xlator_t *this)
+        return ((end1 >= start2) && (end2 >= start1));
+}
+
+gf_boolean_t
+afr_has_lock_conflict (afr_local_t *local, gf_boolean_t waitlist_check)
 {
-        afr_set_lock_number (frame, this);
+        afr_local_t     *each = NULL;
+        afr_lock_t      *lock = NULL;
 
-        return afr_lock_rec (frame, this);
+        lock = &local->inode_ctx->lock[local->transaction.type];
+        /*
+         * Once full file lock is acquired in eager-lock phase, overlapping
+         * writes do not compete for inode-locks, instead are transferred to the
+         * next writes. Because of this overlapping writes are not ordered.
+         * This can cause inconsistencies in replication.
+         * Example:
+         * Two overlapping writes w1, w2 are sent in parallel on same fd
+         * in two threads t1, t2.
+         * Both threads can execute afr_writev_wind in the following manner.
+         * t1 winds w1 on brick-0
+         * t2 winds w2 on brick-0
+         * t2 winds w2 on brick-1
+         * t1 winds w1 on brick-1
+         *
+         * This check makes sure the locks are not transferred for
+         * overlapping writes.
+         */
+        list_for_each_entry (each, &lock->owners, transaction.owner_list) {
+                if (afr_locals_overlap (each, local)) {
+                        return _gf_true;
+                }
+        }
+
+        if (!waitlist_check)
+                return _gf_false;
+        list_for_each_entry (each, &lock->waiting, transaction.wait_list) {
+                if (afr_locals_overlap (each, local)) {
+                        return _gf_true;
+                }
+        }
+        return _gf_false;
 }
 
 
 /* }}} */
-
-int
-afr_internal_lock_finish (call_frame_t *frame, xlator_t *this)
+static void
+afr_copy_inodelk_vars (afr_internal_lock_t *dst, afr_internal_lock_t *src,
+                       xlator_t *this)
 {
-        afr_changelog_pre_op (frame, this);
+        afr_private_t *priv = this->private;
 
-        return 0;
+        dst->domain = src->domain;
+        dst->flock.l_len  = src->flock.l_len;
+        dst->flock.l_start  = src->flock.l_start;
+        dst->flock.l_type  = src->flock.l_type;
+        dst->lock_count = src->lock_count;
+        memcpy (dst->locked_nodes, src->locked_nodes,
+                priv->child_count * sizeof (*dst->locked_nodes));
 }
 
-
 void
-afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this)
+__afr_transaction_wake_shared (afr_local_t *local, struct list_head *shared)
 {
-        afr_local_t    *local = NULL;
-        afr_private_t  *priv = NULL;
+        gf_boolean_t conflict = _gf_false;
+        afr_local_t *each = NULL;
+        afr_lock_t *lock = &local->inode_ctx->lock[local->transaction.type];
 
-        /* call this function from any of the related optimizations
-           which benefit from delaying post op are enabled, namely:
-
-           - changelog piggybacking
-           - eager locking
-        */
+        while (!conflict) {
+                if (list_empty (&lock->waiting))
+                        return;
+                each = list_entry(lock->waiting.next, afr_local_t,
+                                  transaction.wait_list);
+                if (afr_has_lock_conflict (each, _gf_false)) {
+                        conflict = _gf_true;
+                }
+                if (conflict && !list_empty (&lock->owners))
+                        return;
+                afr_copy_inodelk_vars (&each->internal_lock,
+                                       &local->internal_lock,
+                                       each->transaction.frame->this);
+                list_move_tail (&each->transaction.wait_list, shared);
+                list_add_tail(&each->transaction.owner_list, &lock->owners);
+        }
+}
 
-        priv = this->private;
-        if (!priv)
-                return;
+static void
+afr_lock_resume_shared (struct list_head *list)
+{
+        afr_local_t *each = NULL;
 
-        if (!priv->post_op_delay_secs)
-                return;
+        while (!list_empty(list)) {
+                each = list_entry(list->next, afr_local_t,
+                                  transaction.wait_list);
+                list_del_init(&each->transaction.wait_list);
+                afr_changelog_pre_op (each->transaction.frame,
+                                      each->transaction.frame->this);
+        }
+}
 
-        local = frame->local;
-        if (!local)
-                return;
+int
+afr_internal_lock_finish (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t *local = frame->local;
+        afr_lock_t   *lock  = NULL;
 
-        if (!local->transaction.eager_lock_on)
-                return;
 
-        if (!local->fd)
-                return;
+        local->internal_lock.lock_cbk = NULL;
+        if (!local->transaction.eager_lock_on) {
+                if (local->internal_lock.lock_op_ret < 0) {
+                        afr_transaction_done (frame, this);
+                        return 0;
+                }
+                afr_changelog_pre_op (frame, this);
+        } else {
+                lock = &local->inode_ctx->lock[local->transaction.type];
+                if (local->internal_lock.lock_op_ret < 0) {
+                        afr_handle_lock_acquire_failure (local, _gf_false);
+                } else {
+                        lock->event_generation = local->event_generation;
+                        afr_changelog_pre_op (frame, this);
+                }
+        }
 
-        if (local->op == GF_FOP_WRITE)
-                local->delayed_post_op = _gf_true;
+        return 0;
 }
 
 gf_boolean_t
-afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this)
+afr_are_multiple_fds_opened (afr_local_t *local, xlator_t *this)
 {
-        afr_fd_ctx_t *fd_ctx = NULL;
-
-        if (!fd) {
-                /* If false is returned, it may keep on taking eager-lock
-                 * which may lead to starvation, so return true to avoid that.
-                 */
-                gf_msg_callingfn (this->name, GF_LOG_ERROR, EBADF,
-                                  AFR_MSG_INVALID_ARG, "Invalid fd");
-                return _gf_true;
-        }
         /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock
          * is taken mount2 opened the same file, it won't be able to
          * perform any data operations until mount1 releases eager-lock.
@@ -1705,11 +1789,7 @@ afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this)
          * if open-fd-count is > 1
          */
 
-        fd_ctx = afr_fd_ctx_get (fd, this);
-        if (!fd_ctx)
-                return _gf_true;
-
-        if (fd_ctx->open_fd_count > 1)
+        if (local->inode_ctx->open_fd_count > 1)
                 return _gf_true;
 
         return _gf_false;
@@ -1717,24 +1797,45 @@ afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this)
 
 
 gf_boolean_t
-is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this)
+afr_is_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this,
+                                         int delay)
 {
-        afr_local_t      *local = NULL;
-        gf_boolean_t      res = _gf_false;
+        afr_local_t  *local = NULL;
+        afr_lock_t   *lock  = NULL;
+        gf_boolean_t res    = _gf_false;
 
         local = frame->local;
-        if (!local)
+        lock = &local->inode_ctx->lock[local->transaction.type];
+
+        if (!afr_txn_nothing_failed (frame, this)) {
+                lock->release = _gf_true;
                 goto out;
+        }
 
-        if (!local->delayed_post_op)
+        if (afr_are_multiple_fds_opened (local, this)) {
+                lock->release = _gf_true;
                 goto out;
+        }
 
-        //Mark pending changelog ASAP
-        if (!afr_txn_nothing_failed (frame, this))
+        if (!list_empty (&lock->owners))
+                goto out;
+        else
+                GF_ASSERT (list_empty (&lock->waiting));
+
+        if (lock->release) {
+                goto out;
+        }
+
+        if (!delay) {
                 goto out;
+        }
 
-        if (local->fd && afr_are_multiple_fds_opened (local->fd, this))
+        if ((local->op != GF_FOP_WRITE) &&
+            (local->op != GF_FOP_FXATTROP)) {
+                /*Only allow writes but shard does [f]xattrops on writes, so
+                 * they are fine too*/
                 goto out;
+        }
 
         res = _gf_true;
 out:
@@ -1745,50 +1846,61 @@ out:
 void
 afr_delayed_changelog_wake_up_cbk (void *data)
 {
-        fd_t           *fd = NULL;
+        afr_lock_t  *lock  = NULL;
+        afr_local_t *local = data;
+        afr_local_t *timer_local = NULL;
+        struct list_head shared;
 
-        fd = data;
-
-        afr_delayed_changelog_wake_up (THIS, fd);
+        INIT_LIST_HEAD (&shared);
+        lock = &local->inode_ctx->lock[local->transaction.type];
+        LOCK (&local->inode->lock);
+        {
+                timer_local = list_entry(lock->post_op.next,
+                                         afr_local_t,
+                                        transaction.owner_list);
+                if (list_empty (&lock->owners) && (local == timer_local)) {
+                        GF_ASSERT (list_empty (&lock->waiting));
+                        /*Last owner*/
+                        lock->release = _gf_true;
+                        lock->delay_timer = NULL;
+                }
+        }
+        UNLOCK (&local->inode->lock);
+        afr_changelog_post_op_now (local->transaction.frame,
+                                   local->transaction.frame->this);
 }
 
 
 /* SET operation */
 int
-afr_fd_report_unstable_write (xlator_t *this, fd_t *fd)
+afr_fd_report_unstable_write (xlator_t *this, afr_local_t *local)
 {
-        afr_fd_ctx_t *fdctx = NULL;
-
-        fdctx = afr_fd_ctx_get (fd, this);
-
-        LOCK(&fd->lock);
+        LOCK(&local->inode->lock);
         {
-                fdctx->witnessed_unstable_write = _gf_true;
+                local->inode_ctx->witnessed_unstable_write = _gf_true;
         }
-        UNLOCK(&fd->lock);
+        UNLOCK(&local->inode->lock);
 
         return 0;
 }
 
 /* TEST and CLEAR operation */
 gf_boolean_t
-afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd)
+afr_fd_has_witnessed_unstable_write (xlator_t *this, inode_t *inode)
 {
-        afr_fd_ctx_t *fdctx = NULL;
+        afr_inode_ctx_t *ctx = NULL;
         gf_boolean_t witness = _gf_false;
 
-        fdctx = afr_fd_ctx_get (fd, this);
-        if (!fdctx)
-                return _gf_true;
-
-        LOCK(&fd->lock);
+        LOCK(&inode->lock);
         {
-                if (fdctx->witnessed_unstable_write) {
+                (void)__afr_inode_ctx_get (this, inode, &ctx);
+
+                if (ctx->witnessed_unstable_write) {
                         witness = _gf_true;
-                        fdctx->witnessed_unstable_write = _gf_false;
+                        ctx->witnessed_unstable_write = _gf_false;
                 }
         }
-        UNLOCK (&fd->lock);
+        UNLOCK (&inode->lock);
 
         return witness;
 }
@@ -1931,7 +2043,7 @@ afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this)
            mark a flag in the fdctx whenever an unstable write is witnessed.
            */
 
-        if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) {
+        if (!afr_fd_has_witnessed_unstable_write (this, local->inode)) {
                 afr_changelog_post_op_now (frame, this);
                 return 0;
         }
@@ -1949,87 +2061,64 @@ afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this)
         return 0;
 }
 
-
 void
-afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd,
-                               call_stub_t *stub)
+afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
 {
-	afr_fd_ctx_t      *fd_ctx = NULL;
-	call_frame_t      *prev_frame = NULL;
-	struct timespec    delta = {0, };
-	afr_private_t     *priv = NULL;
-	afr_local_t       *local = NULL;
+	struct timespec delta   = {0, };
+	afr_private_t   *priv   = NULL;
+	afr_local_t     *local  = frame->local;
+        afr_lock_t      *lock   = NULL;
+        gf_boolean_t    post_op = _gf_true;
+        struct list_head  shared;
 
 	priv = this->private;
-
-	fd_ctx = afr_fd_ctx_get (fd, this);
-	if (!fd_ctx)
-                goto out;
-
 	delta.tv_sec = priv->post_op_delay_secs;
 	delta.tv_nsec = 0;
 
-	pthread_mutex_lock (&fd_ctx->delay_lock);
-	{
-		prev_frame = fd_ctx->delay_frame;
-		fd_ctx->delay_frame = NULL;
-		if (fd_ctx->delay_timer)
-			gf_timer_call_cancel (this->ctx, fd_ctx->delay_timer);
-		fd_ctx->delay_timer = NULL;
-		if (!frame)
-			goto unlock;
-		fd_ctx->delay_timer = gf_timer_call_after (this->ctx, delta,
-							   afr_delayed_changelog_wake_up_cbk,
-							   fd);
-		fd_ctx->delay_frame = frame;
-	}
-unlock:
-	pthread_mutex_unlock (&fd_ctx->delay_lock);
-
-out:
-	if (prev_frame) {
-		local = prev_frame->local;
-		local->transaction.resume_stub = stub;
-		afr_changelog_post_op_now (prev_frame, this);
-	} else if (stub) {
-		call_resume (stub);
-	}
-}
-
-
-void
-afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
-{
-        afr_local_t  *local = NULL;
-
-        local = frame->local;
-
-        if (is_afr_delayed_changelog_post_op_needed (frame, this))
-                afr_delayed_changelog_post_op (this, frame, local->fd, NULL);
-        else
-                afr_changelog_post_op_safe (frame, this);
-}
-
+        INIT_LIST_HEAD (&shared);
+        if (!local->transaction.eager_lock_on)
+                goto out;
 
+        lock = &local->inode_ctx->lock[local->transaction.type];
+        LOCK (&local->inode->lock);
+	{
+                list_del_init (&local->transaction.owner_list);
+                list_add (&local->transaction.owner_list, &lock->post_op);
+                __afr_transaction_wake_shared (local, &shared);
+
+                if (!afr_is_delayed_changelog_post_op_needed (frame, this,
+                                                              delta.tv_sec)) {
+                        if (list_empty (&lock->owners))
+                                lock->release = _gf_true;
+                        goto unlock;
+                }
 
-/* Wake up the sleeping/delayed post-op, and also register
-   a stub to have it resumed after this transaction
-   completely finishes.
+                GF_ASSERT (lock->delay_timer == NULL);
+		lock->delay_timer = gf_timer_call_after (this->ctx, delta,
+                                              afr_delayed_changelog_wake_up_cbk,
+                                              local);
+                if (!lock->delay_timer) {
+                        lock->release = _gf_true;
+                } else {
+                        post_op = _gf_false;
+                }
 
-   The @stub gets saved in @local and gets resumed in
-   afr_local_cleanup()
-   */
-void
-afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub)
-{
-        afr_delayed_changelog_post_op (this, NULL, fd, stub);
-}
+	}
+unlock:
+        UNLOCK (&local->inode->lock);
 
+        if (!list_empty (&shared)) {
+                afr_lock_resume_shared (&shared);
+        }
 
-void
-afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd)
-{
-        afr_delayed_changelog_post_op (this, NULL, fd, NULL);
+out:
+        if (post_op) {
+                if (!local->transaction.eager_lock_on || lock->release) {
+                        afr_changelog_post_op_safe (frame, this);
+                } else {
+                        afr_changelog_post_op_now (frame, this);
+                }
+        }
 }
 
 int
@@ -2039,13 +2128,6 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this)
 
         local    = frame->local;
 
-        if (local->transaction.eager_lock_on) {
-                /* We don't need to retain "local" in the
-                   fd list anymore, writes to all subvols
-                   are finished by now */
-                afr_remove_eager_lock_stub (local);
-        }
-
         afr_restore_lk_owner (frame);
 
         afr_handle_symmetric_errors (frame, this);
@@ -2076,114 +2158,149 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
 	local->transaction.failed_subvols[child_index] = 1;
 }
 
-
-
 static gf_boolean_t
-afr_locals_overlap (afr_local_t *local1, afr_local_t *local2)
+__need_previous_lock_unlocked (afr_local_t *local)
 {
-        uint64_t start1 = local1->transaction.start;
-        uint64_t start2 = local2->transaction.start;
-        uint64_t end1 = 0;
-        uint64_t end2 = 0;
-
-        if (local1->transaction.len)
-                end1 = start1 + local1->transaction.len - 1;
-        else
-                end1 = ULLONG_MAX;
+        afr_lock_t      *lock = NULL;
 
-        if (local2->transaction.len)
-                end2 = start2 + local2->transaction.len - 1;
-        else
-                end2 = ULLONG_MAX;
+        if (!local->transaction.eager_lock_on)
+                return _gf_true;
 
-        return ((end1 >= start2) && (end2 >= start1));
+        lock = &local->inode_ctx->lock[local->transaction.type];
+        if (!lock->acquired)
+                return _gf_false;
+        if (lock->acquired && lock->event_generation != local->event_generation)
+                return _gf_true;
+        return _gf_false;
 }
 
 void
-afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this)
+__afr_eager_lock_handle (afr_local_t *local, gf_boolean_t *take_lock,
+                         gf_boolean_t *do_pre_op, afr_local_t **timer_local)
 {
-        afr_private_t *priv = NULL;
-        afr_fd_ctx_t  *fdctx = NULL;
-        afr_local_t   *each = NULL;
+        afr_lock_t      *lock = NULL;
+        afr_local_t     *owner_local = NULL;
+        xlator_t        *this = local->transaction.frame->this;
 
-        priv = this->private;
-
-        if (!local->fd)
-                return;
-
-        if (local->transaction.type != AFR_DATA_TRANSACTION)
-                return;
+        if (local->fd && !afr_are_multiple_fds_opened (local, this)) {
+                local->transaction.eager_lock_on = _gf_true;
+        }
 
-        if (!priv->eager_lock)
-                return;
+        lock = &local->inode_ctx->lock[local->transaction.type];
+        if (__need_previous_lock_unlocked (local)) {
+                if (!list_empty (&lock->owners)) {
+                        lock->release = _gf_true;
+                } else if (lock->delay_timer) {
+                        lock->release = _gf_true;
+                        if (gf_timer_call_cancel (this->ctx,
+                                                  lock->delay_timer)) {
+                                /* It will be put in frozen list
+                                 * in the code flow below*/
+                        } else {
+                                *timer_local = list_entry(lock->post_op.next,
+                                                          afr_local_t,
+                                                        transaction.owner_list);
+                                lock->delay_timer = NULL;
+                        }
+                }
+                if (!local->transaction.eager_lock_on)
+                        goto out;
+        }
 
-        fdctx = afr_fd_ctx_get (local->fd, this);
-        if (!fdctx)
-                return;
+        if (lock->release) {
+                list_add_tail (&local->transaction.wait_list,
+                               &lock->frozen);
+                *take_lock = _gf_false;
+                goto out;
+        }
 
-        if (afr_are_multiple_fds_opened (local->fd, this))
-                return;
-        /*
-         * Once full file lock is acquired in eager-lock phase, overlapping
-         * writes do not compete for inode-locks, instead are transferred to the
-         * next writes. Because of this overlapping writes are not ordered.
-         * This can cause inconsistencies in replication.
-         * Example:
-         * Two overlapping writes w1, w2 are sent in parallel on same fd
-         * in two threads t1, t2.
-         * Both threads can execute afr_writev_wind in the following manner.
-         * t1 winds w1 on brick-0
-         * t2 winds w2 on brick-0
-         * t2 winds w2 on brick-1
-         * t1 winds w1 on brick-1
-         *
-         * This check makes sure the locks are not transferred for
-         * overlapping writes.
-         */
-        LOCK (&local->fd->lock);
-        {
-                list_for_each_entry (each, &fdctx->eager_locked,
-                                     transaction.eager_locked) {
-                        if (afr_locals_overlap (each, local)) {
-                                local->transaction.eager_lock_on = _gf_false;
-                                goto unlock;
-                        }
+        if (lock->delay_timer) {
+                *take_lock = _gf_false;
+                if (gf_timer_call_cancel (this->ctx,
+                                          lock->delay_timer)) {
+                        list_add_tail (&local->transaction.wait_list,
+                                       &lock->frozen);
+                } else {
+                        *timer_local = list_entry(lock->post_op.next,
+                                                  afr_local_t,
+                                                  transaction.owner_list);
+                        afr_copy_inodelk_vars (&local->internal_lock,
+                                               &(*timer_local)->internal_lock,
+                                               this);
+                        lock->delay_timer = NULL;
+                        *do_pre_op = _gf_true;
+                        list_add_tail (&local->transaction.owner_list,
+                                       &lock->owners);
                 }
+                goto out;
+        }
 
-                local->transaction.eager_lock_on = _gf_true;
-                list_add_tail (&local->transaction.eager_locked,
-                               &fdctx->eager_locked);
+        if (!list_empty (&lock->owners)) {
+                if (!lock->acquired ||
+                    afr_has_lock_conflict (local, _gf_true)) {
+                        list_add_tail (&local->transaction.wait_list,
+                                       &lock->waiting);
+                        *take_lock = _gf_false;
+                        goto out;
+                }
+                owner_local = list_entry (lock->owners.next,
+                                          afr_local_t,
+                                          transaction.owner_list);
+                afr_copy_inodelk_vars (&local->internal_lock,
+                                       &owner_local->internal_lock,
+                                       this);
+                *take_lock = _gf_false;
+                *do_pre_op = _gf_true;
         }
-unlock:
-        UNLOCK (&local->fd->lock);
+
+        if (lock->acquired)
+                GF_ASSERT (!(*take_lock));
+        list_add_tail (&local->transaction.owner_list, &lock->owners);
+out:
+        return;
 }
 
 void
-afr_transaction_start (call_frame_t *frame, xlator_t *this)
+afr_transaction_start (afr_local_t *local, xlator_t *this)
 {
-        afr_local_t   *local = frame->local;
-        fd_t          *fd    = NULL;
+        afr_private_t   *priv = NULL;
+        gf_boolean_t    take_lock  = _gf_true;
+        gf_boolean_t    do_pre_op  = _gf_false;
+        afr_local_t     *timer_local = NULL;
 
-        afr_transaction_eager_lock_init (local, this);
+        priv = this->private;
 
-        if (local->fd && local->transaction.eager_lock_on)
-                afr_set_lk_owner (frame, this, local->fd);
-        else
-                afr_set_lk_owner (frame, this, frame->root);
+        if (local->transaction.type != AFR_DATA_TRANSACTION &&
+            local->transaction.type != AFR_METADATA_TRANSACTION)
+                goto lock_phase;
 
-        if (!local->transaction.eager_lock_on && local->loc.inode) {
-                fd = fd_lookup (local->loc.inode, frame->root->pid);
-                if (fd == NULL)
-                        fd = fd_lookup_anonymous (local->loc.inode,
-                                                  GF_ANON_FD_FLAGS);
+        if (!priv->eager_lock)
+                goto lock_phase;
 
-                if (fd) {
-                        afr_delayed_changelog_wake_up (this, fd);
-                        fd_unref (fd);
-                }
+        LOCK (&local->inode->lock);
+        {
+                __afr_eager_lock_handle (local, &take_lock, &do_pre_op,
+                                         &timer_local);
         }
+        UNLOCK (&local->inode->lock);
+lock_phase:
+        if (!local->transaction.eager_lock_on) {
+                afr_set_lk_owner (local->transaction.frame, this,
+                                  local->transaction.frame->root);
+        } else {
+                afr_set_lk_owner (local->transaction.frame, this, local->inode);
+        }
+
 
-        afr_lock (frame, this);
+        if (take_lock) {
+                afr_lock (local->transaction.frame, this);
+        } else if (do_pre_op) {
+                afr_changelog_pre_op (local->transaction.frame, this);
+        }
+        /*Always call delayed_changelog_wake_up_cbk after calling pre-op above
+         * so that any inheriting can happen*/
+        if (timer_local)
+                afr_delayed_changelog_wake_up_cbk (timer_local);
 }
 
 int
@@ -2196,7 +2313,7 @@ afr_write_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
                 goto fail;
         }
 
-        afr_transaction_start (frame, this);
+        afr_transaction_start (local, this);
         return 0;
 fail:
         local->transaction.unwind (frame, this);
@@ -2214,6 +2331,7 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
 
         local = frame->local;
         priv  = this->private;
+        local->transaction.frame = frame;
 
         local->transaction.type   = type;
 
@@ -2226,11 +2344,9 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
         if (ret < 0)
                 goto out;
 
-        if (type == AFR_ENTRY_TRANSACTION ||
-            type == AFR_ENTRY_RENAME_TRANSACTION) {
-                afr_transaction_start (frame, this);
-                ret = 0;
-                goto out;
+
+        if (type != AFR_METADATA_TRANSACTION) {
+                goto txn_start;
         }
 
         ret = afr_inode_get_readable (frame, local->inode, this,
@@ -2240,10 +2356,13 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
                                                   event_generation)) {
                 afr_inode_refresh (frame, this, local->inode, local->loc.gfid,
                                    afr_write_txn_refresh_done);
-        } else {
-                afr_transaction_start (frame, this);
+                ret = 0;
+                goto out;
         }
+
+txn_start:
         ret = 0;
+        afr_transaction_start (local, this);
 out:
         return ret;
 }
diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h
index ddcb1eb..a27e9a3 100644
--- a/xlators/cluster/afr/src/afr-transaction.h
+++ b/xlators/cluster/afr/src/afr-transaction.h
@@ -17,12 +17,6 @@ void
 afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
 			    int child_index);
 
-int
-afr_lock_server_count (afr_private_t *priv, afr_transaction_type type);
-
-afr_inodelk_t*
-afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom);
-
 int32_t
 afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type);
 
@@ -30,9 +24,6 @@ int
 afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending);
 
 void
-afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this);
-
-void
 afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd);
 
 void
@@ -57,4 +48,8 @@ afr_pick_error_xdata (afr_local_t *local, afr_private_t *priv,
                       inode_t *inode2, unsigned char *readable2);
 int
 afr_transaction_resume (call_frame_t *frame, xlator_t *this);
+int
+afr_lock (call_frame_t *frame, xlator_t *this);
+void
+afr_delayed_changelog_wake_up_cbk (void *data);
 #endif /* __TRANSACTION_H__ */
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 5ff57c0..6be59dc 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -230,19 +230,12 @@ int
 afr_entry_lockee_cmp (const void *l1, const void *l2);
 
 typedef struct {
-        char    *domain; /* Domain on which inodelk is taken */
-        struct gf_flock flock;
-        unsigned char *locked_nodes;
-        int32_t lock_count;
-} afr_inodelk_t;
-
-typedef struct {
         loc_t *lk_loc;
 
         int                     lockee_count;
         afr_entry_lockee_t      lockee[AFR_LOCKEE_COUNT_MAX];
 
-        afr_inodelk_t       inodelk[AFR_DOM_COUNT_MAX];
+        struct gf_flock flock;
         const char *lk_basename;
         const char *lower_basename;
         const char *higher_basename;
@@ -255,7 +248,6 @@ typedef struct {
         int32_t lock_count;
         int32_t entrylk_lock_count;
 
-        uint64_t lock_number;
         int32_t lk_call_count;
         int32_t lk_expected_count;
         int32_t lk_attempted_count;
@@ -292,37 +284,9 @@ typedef enum {
 } afr_fd_open_status_t;
 
 typedef struct {
-        unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS];
-	int inherited[AFR_NUM_CHANGE_LOGS];
-	int on_disk[AFR_NUM_CHANGE_LOGS];
         afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */
-
-        unsigned int *lock_piggyback;
-        unsigned int *lock_acquired;
-
         int flags;
 
-	/* used for delayed-post-op optimization */
-	pthread_mutex_t    delay_lock;
-	gf_timer_t        *delay_timer;
-	call_frame_t      *delay_frame;
-
-	/* set if any write on this fd was a non stable write
-	   (i.e, without O_SYNC or O_DSYNC)
-	*/
-	gf_boolean_t      witnessed_unstable_write;
-
-	/* @open_fd_count:
-	   Number of open FDs queried from the server, as queried through
-	   xdata in FOPs. Currently, used to decide if eager-locking must be
-	   temporarily disabled.
-	*/
-        uint32_t        open_fd_count;
-
-
-	/* list of frames currently in progress */
-	struct list_head  eager_locked;
-
 	/* the subvolume on which the latest sequence of readdirs (starting
 	   at offset 0) has begun. Till the next readdir request with 0 offset
 	   arrives, we continue to read off this subvol.
@@ -336,6 +300,20 @@ typedef enum {
         AFR_FOP_LOCK_QUORUM_FAILED,
 } afr_fop_lock_state_t;
 
+typedef struct _afr_inode_lock_t {
+        unsigned int event_generation;
+        gf_boolean_t    release;
+        gf_boolean_t    acquired;
+        gf_timer_t        *delay_timer;
+        struct list_head  owners; /*Transactions that are performing fop*/
+        struct list_head  post_op;/*Transactions that are done with the fop
+                                   *So can not conflict with the fops*/
+        struct list_head waiting;/*Transaction that are waiting for
+                                   *conflicting transactions to complete*/
+        struct list_head frozen;/*Transactions that need to go as part of
+                                 * next batch of eager-lock*/
+} afr_lock_t;
+
 typedef struct _afr_inode_ctx {
         uint64_t        read_subvol;
         uint64_t        write_subvol;
@@ -343,6 +321,23 @@ typedef struct _afr_inode_ctx {
         int             spb_choice;
         gf_timer_t      *timer;
         gf_boolean_t    need_refresh;
+        unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS];
+        int inherited[AFR_NUM_CHANGE_LOGS];
+        int on_disk[AFR_NUM_CHANGE_LOGS];
+
+        /* set if any write on this fd was a non stable write
+           (i.e, without O_SYNC or O_DSYNC)
+        */
+        gf_boolean_t      witnessed_unstable_write;
+
+        /* @open_fd_count:
+           Number of open FDs queried from the server, as queried through
+           xdata in FOPs. Currently, used to decide if eager-locking must be
+           temporarily disabled.
+        */
+        uint32_t        open_fd_count;
+        /*Only 2 types of transactions support eager-locks now. DATA/METADATA*/
+        afr_lock_t lock[2];
 } afr_inode_ctx_t;
 
 
@@ -457,7 +452,6 @@ typedef struct _afr_local {
         dict_t  *dict;
 
         int      optimistic_change_log;
-	gf_boolean_t      delayed_post_op;
 
 	/* Is the current writev() going to perform a stable write?
 	   i.e, is fd->flags or @flags writev param have O_SYNC or
@@ -693,7 +687,7 @@ typedef struct _afr_local {
                 off_t start, len;
 
                 gf_boolean_t    eager_lock_on;
-                int *eager_lock;
+                gf_boolean_t    do_eager_unlock;
 
                 char *basename;
                 char *new_basename;
@@ -707,7 +701,8 @@ typedef struct _afr_local {
 		   of the transaction frame */
 		call_stub_t      *resume_stub;
 
-		struct list_head  eager_locked;
+		struct list_head  owner_list;
+                struct list_head  wait_list;
 
                 unsigned char   *pre_op;
 
@@ -768,7 +763,8 @@ typedef struct _afr_local {
 		*/
 		afr_changelog_resume_t changelog_resume;
 
-                call_frame_t *main_frame;
+                call_frame_t *main_frame; /*Fop frame*/
+                call_frame_t *frame; /*Transaction frame*/
 
                 int (*wind) (call_frame_t *frame, xlator_t *this, int subvol);
 
@@ -1009,7 +1005,7 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd);
 		afr_local_cleanup (frame->local, THIS);		       \
 		mem_put (frame->local);				       \
 		frame->local = NULL; };				       \
-	frame->local;})
+	frame->local; })
 
 #define AFR_STACK_RESET(frame)                                         \
         do {                                                           \
@@ -1096,22 +1092,10 @@ afr_filter_xattrs (dict_t *xattr);
 #define AFR_QUORUM_AUTO INT_MAX
 
 int
-afr_fd_report_unstable_write (xlator_t *this, fd_t *fd);
+afr_fd_report_unstable_write (xlator_t *this, afr_local_t *local);
 
 gf_boolean_t
-afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd);
-
-void
-afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub);
-
-int
-afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count);
-
-void
-afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this);
-
-void
-afr_remove_eager_lock_stub (afr_local_t *local);
+afr_fd_has_witnessed_unstable_write (xlator_t *this, inode_t *inode);
 
 void
 afr_reply_wipe (struct afr_reply *reply);
-- 
1.8.3.1