e7a346
From 30fb0e640ae94d9591e9bb64800b0971e52d5416 Mon Sep 17 00:00:00 2001
e7a346
From: Pranith Kumar K <pkarampu@redhat.com>
e7a346
Date: Wed, 31 Jan 2018 16:41:14 +0530
e7a346
Subject: [PATCH 194/201] cluster/afr: Make AFR eager-locking similar to EC
e7a346
e7a346
Problem:
e7a346
1) Afr's eager-lock only works for data transactions.
e7a346
2) When there are conflicting writes, write with conflicting region initiates
e7a346
unlock of eager-lock leading to extra pre-ops and post-ops on the file. When
e7a346
eager-lock goes off, it leads to extra fsyncs for random-write workload in afr.
e7a346
e7a346
Solution (that is modeled after EC):
e7a346
In EC, when there is a conflicting write, it waits for the current write to
e7a346
complete before it winds the conflicted write. This leads to better utilization
e7a346
of network and disk, because we will not be doing extra xattrops and FSYNCs and
e7a346
inodelk/unlock. Moved fd based counters to inode based counters.
e7a346
e7a346
I tried to model the solution based on EC's locking, but it is not similar to
e7a346
AFR because we had to keep backward compatibility.
e7a346
e7a346
Lifecycle of lock:
e7a346
==================
e7a346
First transaction is added to inode->owners list and an inodelk will be sent on
e7a346
the wire. All the next transactions will be put in inode->waiters list until
e7a346
the first transaction completes inodelk and [f]xattrop completely.  Once
e7a346
[f]xattrop also completes, all the requests in the inode->waiters list are
e7a346
checked if they conflict with any of the existing locks which are in
e7a346
inode->owners list and if not are added to inode->owners list and resumed with
e7a346
doing transaction. When these transactions complete fop phase they will be
e7a346
moved to inode->post_op list and resume the transactions that were paused
e7a346
because of conflicts. Post-op and unlock will not be issued on the wire until
e7a346
that is the last transaction on that inode. Last transaction when it has to
e7a346
perform post-op can choose to sleep for post-op-delay-secs value. During that
e7a346
time if any other transaction comes, it will wake up the sleeping transaction
e7a346
and takes over the ownership of the lock and the cycle continues. If the
e7a346
post-op-delay-secs expires, then the timer thread will wake up the sleeping
e7a346
transaction and it will set lock->release to true and starts doing post-op and
e7a346
then unlock. During this time if any other transactions come, they will be put
e7a346
in inode->frozen list. Once the previous unlock comes it will move the frozen
e7a346
list to waiters list and moves the first element from this waiters-list to
e7a346
owners-list and attempts the lock and the cycle continues. This is the general
e7a346
idea.  There is logic at the time of delaying and at the time of new
e7a346
transaction or in flush fop to wakeup existing sleeping transactions or
e7a346
choosing whether to delay a transaction etc, which is subjected to change based
e7a346
on future enhancements etc.
e7a346
e7a346
 >Fixes: #418
e7a346
 >BUG: 1549606
e7a346
e7a346
Upstream-patch: https://review.gluster.org/19503
e7a346
BUG: 1491785
e7a346
Change-Id: I88b570bbcf332a27c82d2767dfa82472f60055dc
e7a346
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
e7a346
Reviewed-on: https://code.engineering.redhat.com/gerrit/131945
e7a346
Tested-by: RHGS Build Bot <nigelb@redhat.com>
e7a346
---
e7a346
 tests/bugs/replicate/bug-966018.t              |  36 -
e7a346
 xlators/cluster/afr/src/afr-common.c           | 315 ++++-----
e7a346
 xlators/cluster/afr/src/afr-inode-write.c      |   6 +-
e7a346
 xlators/cluster/afr/src/afr-lk-common.c        | 348 +++-------
e7a346
 xlators/cluster/afr/src/afr-self-heal-common.c |  13 +-
e7a346
 xlators/cluster/afr/src/afr-self-heal-data.c   |  14 +-
e7a346
 xlators/cluster/afr/src/afr-self-heal.h        |   2 +-
e7a346
 xlators/cluster/afr/src/afr-transaction.c      | 913 ++++++++++++++-----------
e7a346
 xlators/cluster/afr/src/afr-transaction.h      |  13 +-
e7a346
 xlators/cluster/afr/src/afr.h                  |  96 ++-
e7a346
 10 files changed, 813 insertions(+), 943 deletions(-)
e7a346
 delete mode 100644 tests/bugs/replicate/bug-966018.t
e7a346
e7a346
diff --git a/tests/bugs/replicate/bug-966018.t b/tests/bugs/replicate/bug-966018.t
e7a346
deleted file mode 100644
e7a346
index 1b5296b..0000000
e7a346
--- a/tests/bugs/replicate/bug-966018.t
e7a346
+++ /dev/null
e7a346
@@ -1,36 +0,0 @@
e7a346
-#!/bin/bash
e7a346
-
e7a346
-. $(dirname $0)/../../include.rc
e7a346
-. $(dirname $0)/../../volume.rc
e7a346
-. $(dirname $0)/../../nfs.rc
e7a346
-
e7a346
-#This tests if cluster.eager-lock blocks metadata operations on nfs/fuse mounts.
e7a346
-#If it is not woken up, INODELK from the next command waits
e7a346
-#for post-op-delay secs.
e7a346
-
e7a346
-cleanup;
e7a346
-TEST glusterd
e7a346
-TEST pidof glusterd
e7a346
-
e7a346
-TEST $CLI volume create $V0 replica 2 $H0:$B0/r2_0 $H0:$B0/r2_1
e7a346
-TEST $CLI volume set $V0 ensure-durability off
e7a346
-TEST $CLI volume set $V0 cluster.eager-lock on
e7a346
-TEST $CLI volume set $V0 cluster.post-op-delay-secs 3
e7a346
-TEST $CLI volume set $V0 nfs.disable false
e7a346
-
e7a346
-TEST $CLI volume start $V0
e7a346
-TEST $CLI volume profile $V0 start
e7a346
-EXPECT_WITHIN $NFS_EXPORT_TIMEOUT "1" is_nfs_export_available;
e7a346
-TEST mount_nfs $H0:/$V0 $N0 nolock;
e7a346
-TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id=$V0 $M0
e7a346
-echo 1 > $N0/1 && chmod +x $N0/1
e7a346
-echo 1 > $M0/1 && chmod +x $M0/1
e7a346
-
e7a346
-#Check that INODELK MAX latency is not in the order of seconds
e7a346
-#Test if the MAX INODELK fop latency is of the order of seconds.
e7a346
-inodelk_max_latency=$($CLI volume profile $V0 info | grep INODELK | awk 'BEGIN {max = 0} {if ($6 > max) max=$6;} END {print max}' | cut -d. -f 1 | egrep "[0-9]{7,}")
e7a346
-
e7a346
-TEST [ -z $inodelk_max_latency ]
e7a346
-EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $N0
e7a346
-
e7a346
-cleanup;
e7a346
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
e7a346
index 06863b6..6025a60 100644
e7a346
--- a/xlators/cluster/afr/src/afr-common.c
e7a346
+++ b/xlators/cluster/afr/src/afr-common.c
e7a346
@@ -126,37 +126,77 @@ afr_is_possibly_under_txn (afr_transaction_type type, afr_local_t *local,
e7a346
         return _gf_false;
e7a346
 }
e7a346
 
e7a346
+static void
e7a346
+afr_inode_ctx_destroy (afr_inode_ctx_t *ctx)
e7a346
+{
e7a346
+        int i = 0;
e7a346
+
e7a346
+        if (!ctx)
e7a346
+                return;
e7a346
+
e7a346
+        for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
e7a346
+                GF_FREE (ctx->pre_op_done[i]);
e7a346
+        }
e7a346
+
e7a346
+        GF_FREE (ctx);
e7a346
+}
e7a346
+
e7a346
 int
e7a346
 __afr_inode_ctx_get (xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx)
e7a346
 {
e7a346
-        uint64_t                ctx_int = 0;
e7a346
-        int                     ret     = -1;
e7a346
-        afr_inode_ctx_t        *tmp_ctx = NULL;
e7a346
+        uint64_t        ctx_int   = 0;
e7a346
+        int             ret       = -1;
e7a346
+        int             i         = -1;
e7a346
+        int             num_locks = -1;
e7a346
+        afr_inode_ctx_t *ictx     = NULL;
e7a346
+        afr_lock_t      *lock     = NULL;
e7a346
+        afr_private_t   *priv     = this->private;
e7a346
 
e7a346
         ret = __inode_ctx_get (inode, this, &ctx_int);
e7a346
-        if (ret) {
e7a346
-                tmp_ctx = GF_CALLOC (1, sizeof (afr_inode_ctx_t),
e7a346
-                                     gf_afr_mt_inode_ctx_t);
e7a346
-                if (!tmp_ctx)
e7a346
-                        goto out;
e7a346
+        if (ret == 0) {
e7a346
+                *ctx = (afr_inode_ctx_t *)ctx_int;
e7a346
+                return 0;
e7a346
+        }
e7a346
 
e7a346
-                ctx_int = (long) tmp_ctx;
e7a346
-                ret = __inode_ctx_set (inode, this, &ctx_int);
e7a346
-                if (ret) {
e7a346
-                        GF_FREE (tmp_ctx);
e7a346
+        ictx = GF_CALLOC (1, sizeof (afr_inode_ctx_t), gf_afr_mt_inode_ctx_t);
e7a346
+        if (!ictx)
e7a346
+                goto out;
e7a346
+
e7a346
+        for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
e7a346
+                ictx->pre_op_done[i] = GF_CALLOC (sizeof *ictx->pre_op_done[i],
e7a346
+                                                  priv->child_count,
e7a346
+                                                  gf_afr_mt_int32_t);
e7a346
+                if (!ictx->pre_op_done[i]) {
e7a346
+                        ret = -ENOMEM;
e7a346
                         goto out;
e7a346
                 }
e7a346
-                tmp_ctx->spb_choice = -1;
e7a346
-                tmp_ctx->read_subvol = 0;
e7a346
-                tmp_ctx->write_subvol = 0;
e7a346
-                tmp_ctx->lock_count = 0;
e7a346
-        } else {
e7a346
-                tmp_ctx = (afr_inode_ctx_t *) ctx_int;
e7a346
         }
e7a346
 
e7a346
-        *ctx = tmp_ctx;
e7a346
+        num_locks = sizeof(ictx->lock)/sizeof(afr_lock_t);
e7a346
+        for (i = 0; i < num_locks; i++) {
e7a346
+                lock = &ictx->lock[i];
e7a346
+                INIT_LIST_HEAD (&lock->post_op);
e7a346
+                INIT_LIST_HEAD (&lock->frozen);
e7a346
+                INIT_LIST_HEAD (&lock->waiting);
e7a346
+                INIT_LIST_HEAD (&lock->owners);
e7a346
+        }
e7a346
+
e7a346
+        ctx_int = (uint64_t)ictx;
e7a346
+        ret = __inode_ctx_set (inode, this, &ctx_int);
e7a346
+        if (ret) {
e7a346
+                goto out;
e7a346
+        }
e7a346
+
e7a346
+        ictx->spb_choice = -1;
e7a346
+        ictx->read_subvol = 0;
e7a346
+        ictx->write_subvol = 0;
e7a346
+        ictx->lock_count = 0;
e7a346
         ret = 0;
e7a346
+        *ctx = ictx;
e7a346
 out:
e7a346
+        if (ret) {
e7a346
+                afr_inode_ctx_destroy (ictx);
e7a346
+        }
e7a346
         return ret;
e7a346
 }
e7a346
 
e7a346
@@ -1745,10 +1785,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
e7a346
 
e7a346
         GF_FREE (local->internal_lock.locked_nodes);
e7a346
 
e7a346
-        for (i = 0; local->internal_lock.inodelk[i].domain; i++) {
e7a346
-                GF_FREE (local->internal_lock.inodelk[i].locked_nodes);
e7a346
-        }
e7a346
-
e7a346
         GF_FREE (local->internal_lock.lower_locked_nodes);
e7a346
 
e7a346
         afr_entry_lockee_cleanup (&local->internal_lock);
e7a346
@@ -1765,7 +1801,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
e7a346
                 GF_FREE (local->transaction.changelog_xdata);
e7a346
         }
e7a346
 
e7a346
-        GF_FREE (local->transaction.eager_lock);
e7a346
         GF_FREE (local->transaction.failed_subvols);
e7a346
 
e7a346
         GF_FREE (local->transaction.basename);
e7a346
@@ -1812,16 +1847,6 @@ afr_local_replies_wipe (afr_local_t *local, afr_private_t *priv)
e7a346
 	memset (local->replies, 0, sizeof(*local->replies) * priv->child_count);
e7a346
 }
e7a346
 
e7a346
-void
e7a346
-afr_remove_eager_lock_stub (afr_local_t *local)
e7a346
-{
e7a346
-        LOCK (&local->fd->lock);
e7a346
-        {
e7a346
-                list_del_init (&local->transaction.eager_locked);
e7a346
-        }
e7a346
-        UNLOCK (&local->fd->lock);
e7a346
-}
e7a346
-
e7a346
 static gf_boolean_t
e7a346
 afr_fop_lock_is_unlock (call_frame_t *frame)
e7a346
 {
e7a346
@@ -1926,10 +1951,6 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this)
e7a346
 
e7a346
 	syncbarrier_destroy (&local->barrier);
e7a346
 
e7a346
-        if (local->transaction.eager_lock_on &&
e7a346
-            !list_empty (&local->transaction.eager_locked))
e7a346
-                afr_remove_eager_lock_stub (local);
e7a346
-
e7a346
         afr_local_transaction_cleanup (local, this);
e7a346
 
e7a346
         priv = this->private;
e7a346
@@ -3160,22 +3181,8 @@ out:
e7a346
 void
e7a346
 _afr_cleanup_fd_ctx (afr_fd_ctx_t *fd_ctx)
e7a346
 {
e7a346
-        int i = 0;
e7a346
-
e7a346
-
e7a346
-	for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++)
e7a346
-		GF_FREE (fd_ctx->pre_op_done[i]);
e7a346
-
e7a346
         GF_FREE (fd_ctx->opened_on);
e7a346
-
e7a346
-        GF_FREE (fd_ctx->lock_piggyback);
e7a346
-
e7a346
-        GF_FREE (fd_ctx->lock_acquired);
e7a346
-
e7a346
-	pthread_mutex_destroy (&fd_ctx->delay_lock);
e7a346
-
e7a346
         GF_FREE (fd_ctx);
e7a346
-
e7a346
         return;
e7a346
 }
e7a346
 
e7a346
@@ -3193,15 +3200,7 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd)
e7a346
         fd_ctx = (afr_fd_ctx_t *)(long) ctx;
e7a346
 
e7a346
         if (fd_ctx) {
e7a346
-                /*no need to take any locks*/
e7a346
-                if (!list_empty (&fd_ctx->eager_locked))
e7a346
-                        gf_msg (this->name, GF_LOG_WARNING, 0,
e7a346
-                                AFR_MSG_INVALID_DATA, "%s: Stale "
e7a346
-                                "Eager-lock stubs found",
e7a346
-                                uuid_utoa (fd->inode->gfid));
e7a346
-
e7a346
                 _afr_cleanup_fd_ctx (fd_ctx);
e7a346
-
e7a346
         }
e7a346
 
e7a346
 out:
e7a346
@@ -3282,23 +3281,6 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd)
e7a346
                 goto out;
e7a346
         }
e7a346
 
e7a346
-        ret = pthread_mutex_init (&fd_ctx->delay_lock, NULL);
e7a346
-        if (ret) {
e7a346
-                GF_FREE (fd_ctx);
e7a346
-                fd_ctx = NULL;
e7a346
-                goto out;
e7a346
-        }
e7a346
-
e7a346
-	for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
e7a346
-		fd_ctx->pre_op_done[i] = GF_CALLOC (sizeof (*fd_ctx->pre_op_done[i]),
e7a346
-						    priv->child_count,
e7a346
-						    gf_afr_mt_int32_t);
e7a346
-		if (!fd_ctx->pre_op_done[i]) {
e7a346
-			ret = -ENOMEM;
e7a346
-			goto out;
e7a346
-		}
e7a346
-	}
e7a346
-
e7a346
         fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on),
e7a346
                                        priv->child_count,
e7a346
                                        gf_afr_mt_int32_t);
e7a346
@@ -3314,26 +3296,8 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd)
e7a346
 			fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED;
e7a346
 	}
e7a346
 
e7a346
-        fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback),
e7a346
-                                            priv->child_count,
e7a346
-                                            gf_afr_mt_char);
e7a346
-        if (!fd_ctx->lock_piggyback) {
e7a346
-                ret = -ENOMEM;
e7a346
-                goto out;
e7a346
-        }
e7a346
-
e7a346
-        fd_ctx->lock_acquired = GF_CALLOC (sizeof (*fd_ctx->lock_acquired),
e7a346
-                                           priv->child_count,
e7a346
-                                           gf_afr_mt_char);
e7a346
-        if (!fd_ctx->lock_acquired) {
e7a346
-                ret = -ENOMEM;
e7a346
-                goto out;
e7a346
-        }
e7a346
-
e7a346
 	fd_ctx->readdir_subvol = -1;
e7a346
 
e7a346
-        INIT_LIST_HEAD (&fd_ctx->eager_locked);
e7a346
-
e7a346
         ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx);
e7a346
         if (ret)
e7a346
                 gf_msg_debug (this->name, 0,
e7a346
@@ -3405,12 +3369,70 @@ afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
e7a346
         return 0;
e7a346
 }
e7a346
 
e7a346
+afr_local_t*
e7a346
+afr_wakeup_same_fd_delayed_op (xlator_t *this, afr_lock_t *lock, fd_t *fd)
e7a346
+{
e7a346
+        afr_local_t *local = NULL;
e7a346
+
e7a346
+        if (lock->delay_timer) {
e7a346
+                local = list_entry(lock->post_op.next, afr_local_t,
e7a346
+                                   transaction.owner_list);
e7a346
+                if (fd == local->fd) {
e7a346
+                        if (gf_timer_call_cancel (this->ctx,
e7a346
+                                                  lock->delay_timer)) {
e7a346
+                                local = NULL;
e7a346
+                        } else {
e7a346
+                                lock->delay_timer = NULL;
e7a346
+                        }
e7a346
+                } else {
e7a346
+                        local = NULL;
e7a346
+                }
e7a346
+        }
e7a346
+
e7a346
+        return local;
e7a346
+}
e7a346
+
e7a346
+void
e7a346
+afr_delayed_changelog_wake_resume (xlator_t *this, inode_t *inode,
e7a346
+                                   call_stub_t *stub)
e7a346
+{
e7a346
+        afr_inode_ctx_t *ctx = NULL;
e7a346
+        afr_lock_t      *lock = NULL;
e7a346
+        afr_local_t     *metadata_local = NULL;
e7a346
+        afr_local_t     *data_local = NULL;
e7a346
+        LOCK (&inode->lock);
e7a346
+        {
e7a346
+                (void)__afr_inode_ctx_get (this, inode, &ctx);
e7a346
+                lock = &ctx->lock[AFR_DATA_TRANSACTION];
e7a346
+                data_local = afr_wakeup_same_fd_delayed_op (this, lock,
e7a346
+                                                            stub->args.fd);
e7a346
+                lock = &ctx->lock[AFR_METADATA_TRANSACTION];
e7a346
+                metadata_local = afr_wakeup_same_fd_delayed_op (this, lock,
e7a346
+                                                                stub->args.fd);
e7a346
+        }
e7a346
+        UNLOCK (&inode->lock);
e7a346
+
e7a346
+        if (data_local) {
e7a346
+                data_local->transaction.resume_stub = stub;
e7a346
+        } else if (metadata_local) {
e7a346
+                metadata_local->transaction.resume_stub = stub;
e7a346
+        } else {
e7a346
+                call_resume (stub);
e7a346
+        }
e7a346
+        if (data_local) {
e7a346
+                afr_delayed_changelog_wake_up_cbk (data_local);
e7a346
+        }
e7a346
+        if (metadata_local) {
e7a346
+                afr_delayed_changelog_wake_up_cbk (metadata_local);
e7a346
+        }
e7a346
+}
e7a346
+
e7a346
 int
e7a346
 afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
e7a346
 {
e7a346
-        afr_local_t   *local = NULL;
e7a346
-        call_stub_t   *stub = NULL;
e7a346
-        int            op_errno   = ENOMEM;
e7a346
+        afr_local_t *local   = NULL;
e7a346
+        call_stub_t *stub    = NULL;
e7a346
+        int         op_errno = ENOMEM;
e7a346
 
e7a346
 	local = AFR_FRAME_INIT (frame, op_errno);
e7a346
 	if (!local)
e7a346
@@ -3426,7 +3448,7 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
e7a346
         if (!stub)
e7a346
                 goto out;
e7a346
 
e7a346
-        afr_delayed_changelog_wake_resume (this, fd, stub);
e7a346
+        afr_delayed_changelog_wake_resume (this, fd->inode, stub);
e7a346
 
e7a346
 	return 0;
e7a346
 out:
e7a346
@@ -3434,7 +3456,6 @@ out:
e7a346
         return 0;
e7a346
 }
e7a346
 
e7a346
-
e7a346
 int
e7a346
 afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
 		  int32_t op_ret, int32_t op_errno, dict_t *xdata)
e7a346
@@ -4497,7 +4518,7 @@ afr_forget (xlator_t *this, inode_t *inode)
e7a346
                 return 0;
e7a346
 
e7a346
         ctx = (afr_inode_ctx_t *)ctx_int;
e7a346
-        GF_FREE (ctx);
e7a346
+        afr_inode_ctx_destroy (ctx);
e7a346
         return 0;
e7a346
 }
e7a346
 
e7a346
@@ -5310,21 +5331,6 @@ out:
e7a346
 }
e7a346
 
e7a346
 int
e7a346
-afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count)
e7a346
-{
e7a346
-        int             ret = -ENOMEM;
e7a346
-
e7a346
-        lk->domain = dom;
e7a346
-        lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes),
e7a346
-                                      child_count, gf_afr_mt_char);
e7a346
-        if (NULL == lk->locked_nodes)
e7a346
-                goto out;
e7a346
-        ret = 0;
e7a346
-out:
e7a346
-        return ret;
e7a346
-}
e7a346
-
e7a346
-int
e7a346
 afr_transaction_local_init (afr_local_t *local, xlator_t *this)
e7a346
 {
e7a346
         int            ret = -ENOMEM;
e7a346
@@ -5335,25 +5341,9 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)
e7a346
         if (ret < 0)
e7a346
                 goto out;
e7a346
 
e7a346
-        if ((local->transaction.type == AFR_DATA_TRANSACTION) ||
e7a346
-            (local->transaction.type == AFR_METADATA_TRANSACTION)) {
e7a346
-                ret = afr_inodelk_init (&local->internal_lock.inodelk[0],
e7a346
-                                        this->name, priv->child_count);
e7a346
-                if (ret < 0)
e7a346
-                        goto out;
e7a346
-        }
e7a346
-
e7a346
         ret = -ENOMEM;
e7a346
 	local->pre_op_compat = priv->pre_op_compat;
e7a346
 
e7a346
-        local->transaction.eager_lock =
e7a346
-                GF_CALLOC (sizeof (*local->transaction.eager_lock),
e7a346
-                           priv->child_count,
e7a346
-                           gf_afr_mt_int32_t);
e7a346
-
e7a346
-        if (!local->transaction.eager_lock)
e7a346
-                goto out;
e7a346
-
e7a346
         local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op),
e7a346
                                                priv->child_count,
e7a346
                                                gf_afr_mt_char);
e7a346
@@ -5385,9 +5375,9 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)
e7a346
         if (!local->pending)
e7a346
                 goto out;
e7a346
 
e7a346
-	INIT_LIST_HEAD (&local->transaction.eager_locked);
e7a346
-
e7a346
         ret = 0;
e7a346
+        INIT_LIST_HEAD (&local->transaction.wait_list);
e7a346
+        INIT_LIST_HEAD (&local->transaction.owner_list);
e7a346
 out:
e7a346
         return ret;
e7a346
 }
e7a346
@@ -5422,24 +5412,6 @@ out:
e7a346
         return;
e7a346
 }
e7a346
 
e7a346
-void
e7a346
-afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this)
e7a346
-{
e7a346
-        afr_local_t     *local = NULL;
e7a346
-        afr_fd_ctx_t    *fd_ctx   = NULL;
e7a346
-
e7a346
-        local = frame->local;
e7a346
-
e7a346
-        if (!local->fd)
e7a346
-		return;
e7a346
-
e7a346
-	fd_ctx = afr_fd_ctx_get (local->fd, this);
e7a346
-	if (!fd_ctx)
e7a346
-		return;
e7a346
-
e7a346
-	fd_ctx->open_fd_count = local->open_fd_count;
e7a346
-}
e7a346
-
e7a346
 int**
e7a346
 afr_mark_pending_changelog (afr_private_t *priv, unsigned char *pending,
e7a346
                             dict_t *xattr, ia_type_t iat)
e7a346
@@ -5548,7 +5520,7 @@ out:
e7a346
 
e7a346
 int
e7a346
 afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,
e7a346
-                                  inode_t *inode, gf_boolean_t *dsh,
e7a346
+                                  fd_t *fd, gf_boolean_t *dsh,
e7a346
                                   gf_boolean_t *pflag)
e7a346
 {
e7a346
         int ret = -1;
e7a346
@@ -5558,8 +5530,8 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,
e7a346
         unsigned char *healed_sinks = NULL;
e7a346
         unsigned char *undid_pending = NULL;
e7a346
         afr_private_t   *priv = NULL;
e7a346
-        fd_t          *fd = NULL;
e7a346
         struct afr_reply *locked_replies = NULL;
e7a346
+        inode_t *inode = fd->inode;
e7a346
 
e7a346
         priv = this->private;
e7a346
         data_lock = alloca0 (priv->child_count);
e7a346
@@ -5568,18 +5540,6 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,
e7a346
         healed_sinks = alloca0 (priv->child_count);
e7a346
         undid_pending = alloca0 (priv->child_count);
e7a346
 
e7a346
-        /* Heal-info does an open() on the file being examined so that the
e7a346
-         * current eager-lock holding client, if present, at some point sees
e7a346
-         * open-fd count being > 1 and releases the eager-lock so that heal-info
e7a346
-         * doesn't remain blocked forever until IO completes.
e7a346
-         */
e7a346
-        ret = afr_selfheal_data_open (this, inode, &fd);
e7a346
-        if (ret < 0) {
e7a346
-                gf_msg_debug (this->name, -ret, "%s: Failed to open",
e7a346
-                              uuid_utoa (inode->gfid));
e7a346
-                goto out;
e7a346
-        }
e7a346
-
e7a346
         locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
e7a346
 
e7a346
         ret = afr_selfheal_inodelk (frame, this, inode, this->name,
e7a346
@@ -5602,8 +5562,6 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,
e7a346
 out:
e7a346
         if (locked_replies)
e7a346
                 afr_replies_wipe (locked_replies, priv->child_count);
e7a346
-        if (fd)
e7a346
-                fd_unref (fd);
e7a346
         return ret;
e7a346
 }
e7a346
 
e7a346
@@ -5688,6 +5646,7 @@ afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid,
e7a346
 
e7a346
 {
e7a346
         int ret             = -1;
e7a346
+        fd_t *fd            = NULL;
e7a346
         gf_boolean_t    dsh = _gf_false;
e7a346
         gf_boolean_t    msh = _gf_false;
e7a346
         gf_boolean_t    esh = _gf_false;
e7a346
@@ -5699,6 +5658,21 @@ afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid,
e7a346
 
e7a346
         /* For every heal type hold locks and check if it indeed needs heal */
e7a346
 
e7a346
+
e7a346
+        /* Heal-info does an open() on the file being examined so that the
e7a346
+         * current eager-lock holding client, if present, at some point sees
e7a346
+         * open-fd count being > 1 and releases the eager-lock so that heal-info
e7a346
+         * doesn't remain blocked forever until IO completes.
e7a346
+         */
e7a346
+        if ((*inode)->ia_type == IA_IFREG) {
e7a346
+                ret = afr_selfheal_data_open (this, *inode, &fd);
e7a346
+                if (ret < 0) {
e7a346
+                        gf_msg_debug (this->name, -ret, "%s: Failed to open",
e7a346
+                                      uuid_utoa ((*inode)->gfid));
e7a346
+                        goto out;
e7a346
+                }
e7a346
+        }
e7a346
+
e7a346
         if (msh) {
e7a346
                 ret = afr_selfheal_locked_metadata_inspect (frame, this,
e7a346
                                                             *inode, &msh,
e7a346
@@ -5708,7 +5682,7 @@ afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid,
e7a346
         }
e7a346
 
e7a346
         if (dsh) {
e7a346
-                ret = afr_selfheal_locked_data_inspect (frame, this, *inode,
e7a346
+                ret = afr_selfheal_locked_data_inspect (frame, this, fd,
e7a346
                                                         &dsh, pending);
e7a346
                 if (ret == -EIO || (ret == -EAGAIN))
e7a346
                         goto out;
e7a346
@@ -5723,6 +5697,8 @@ out:
e7a346
         *data_selfheal = dsh;
e7a346
         *entry_selfheal = esh;
e7a346
         *metadata_selfheal = msh;
e7a346
+        if (fd)
e7a346
+                fd_unref (fd);
e7a346
         return ret;
e7a346
 }
e7a346
 
e7a346
@@ -6352,6 +6328,7 @@ afr_write_subvol_reset (call_frame_t *frame, xlator_t *this)
e7a346
         local = frame->local;
e7a346
         LOCK(&local->inode->lock);
e7a346
         {
e7a346
+                GF_ASSERT (local->inode_ctx->lock_count > 0);
e7a346
                 local->inode_ctx->lock_count--;
e7a346
 
e7a346
                 if (!local->inode_ctx->lock_count)
e7a346
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
e7a346
index 2402bb2..b52b6ca 100644
e7a346
--- a/xlators/cluster/afr/src/afr-inode-write.c
e7a346
+++ b/xlators/cluster/afr/src/afr-inode-write.c
e7a346
@@ -341,14 +341,14 @@ afr_process_post_writev (call_frame_t *frame, xlator_t *this)
e7a346
                    the xattrs are not reliably pointing at
e7a346
                    a stale file.
e7a346
                 */
e7a346
-                afr_fd_report_unstable_write (this, local->fd);
e7a346
+                afr_fd_report_unstable_write (this, local);
e7a346
 
e7a346
         __afr_inode_write_finalize (frame, this);
e7a346
 
e7a346
         afr_writev_handle_short_writes (frame, this);
e7a346
 
e7a346
         if (local->update_open_fd_count)
e7a346
-                afr_handle_open_fd_count (frame, this);
e7a346
+                local->inode_ctx->open_fd_count = local->open_fd_count;
e7a346
 
e7a346
 }
e7a346
 
e7a346
@@ -2590,7 +2590,7 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
e7a346
         local->op = GF_FOP_FSYNC;
e7a346
         local->cont.fsync.datasync = datasync;
e7a346
 
e7a346
-	if (afr_fd_has_witnessed_unstable_write (this, fd)) {
e7a346
+	if (afr_fd_has_witnessed_unstable_write (this, fd->inode)) {
e7a346
 		/* don't care. we only wanted to CLEAR the bit */
e7a346
 	}
e7a346
 
e7a346
diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c
e7a346
index 260815f..be3de01 100644
e7a346
--- a/xlators/cluster/afr/src/afr-lk-common.c
e7a346
+++ b/xlators/cluster/afr/src/afr-lk-common.c
e7a346
@@ -52,31 +52,6 @@ afr_entry_lockee_cmp (const void *l1, const void *l2)
e7a346
 
e7a346
 int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index);
e7a346
 
e7a346
-static int
e7a346
-afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this);
e7a346
-
e7a346
-static uint64_t afr_lock_number = 1;
e7a346
-
e7a346
-static uint64_t
e7a346
-get_afr_lock_number ()
e7a346
-{
e7a346
-        return (++afr_lock_number);
e7a346
-}
e7a346
-
e7a346
-int
e7a346
-afr_set_lock_number (call_frame_t *frame, xlator_t *this)
e7a346
-{
e7a346
-        afr_local_t         *local    = NULL;
e7a346
-        afr_internal_lock_t *int_lock = NULL;
e7a346
-
e7a346
-        local    = frame->local;
e7a346
-        int_lock = &local->internal_lock;
e7a346
-
e7a346
-        int_lock->lock_number = get_afr_lock_number ();
e7a346
-
e7a346
-        return 0;
e7a346
-}
e7a346
-
e7a346
 void
e7a346
 afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner)
e7a346
 {
e7a346
@@ -203,21 +178,16 @@ initialize_inodelk_variables (call_frame_t *frame, xlator_t *this)
e7a346
         afr_local_t         *local    = NULL;
e7a346
         afr_internal_lock_t *int_lock = NULL;
e7a346
         afr_private_t       *priv     = NULL;
e7a346
-        afr_inodelk_t       *inodelk  = NULL;
e7a346
 
e7a346
         priv     = this->private;
e7a346
         local    = frame->local;
e7a346
         int_lock = &local->internal_lock;
e7a346
 
e7a346
-        inodelk = afr_get_inodelk (int_lock, int_lock->domain);
e7a346
-
e7a346
-        inodelk->lock_count    = 0;
e7a346
+        int_lock->lock_count    = 0;
e7a346
         int_lock->lk_attempted_count = 0;
e7a346
         int_lock->lock_op_ret   = -1;
e7a346
         int_lock->lock_op_errno = 0;
e7a346
 
e7a346
-        memset (inodelk->locked_nodes, 0,
e7a346
-                sizeof (*inodelk->locked_nodes) * priv->child_count);
e7a346
         memset (int_lock->locked_nodes, 0,
e7a346
                 sizeof (*int_lock->locked_nodes) * priv->child_count);
e7a346
 
e7a346
@@ -286,12 +256,7 @@ void
e7a346
 afr_update_uninodelk (afr_local_t *local, afr_internal_lock_t *int_lock,
e7a346
                     int32_t child_index)
e7a346
 {
e7a346
-        afr_inodelk_t       *inodelk = NULL;
e7a346
-
e7a346
-        inodelk = afr_get_inodelk (int_lock, int_lock->domain);
e7a346
-        inodelk->locked_nodes[child_index] &= LOCKED_NO;
e7a346
-        if (local->transaction.eager_lock)
e7a346
-                local->transaction.eager_lock[child_index] = 0;
e7a346
+        int_lock->locked_nodes[child_index] &= LOCKED_NO;
e7a346
 
e7a346
 }
e7a346
 
e7a346
@@ -331,35 +296,27 @@ static int
e7a346
 afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
e7a346
 {
e7a346
         afr_internal_lock_t *int_lock = NULL;
e7a346
-        afr_inodelk_t       *inodelk  = NULL;
e7a346
         afr_local_t         *local    = NULL;
e7a346
         afr_private_t       *priv     = NULL;
e7a346
         struct gf_flock flock = {0,};
e7a346
-        struct gf_flock full_flock = {0,};
e7a346
-        struct gf_flock *flock_use = NULL;
e7a346
         int call_count = 0;
e7a346
         int i = 0;
e7a346
-        int piggyback = 0;
e7a346
-        afr_fd_ctx_t        *fd_ctx      = NULL;
e7a346
-
e7a346
 
e7a346
         local    = frame->local;
e7a346
         int_lock = &local->internal_lock;
e7a346
         priv     = this->private;
e7a346
 
e7a346
-        inodelk = afr_get_inodelk (int_lock, int_lock->domain);
e7a346
-
e7a346
-        flock.l_start = inodelk->flock.l_start;
e7a346
-        flock.l_len   = inodelk->flock.l_len;
e7a346
+        flock.l_start = int_lock->flock.l_start;
e7a346
+        flock.l_len   = int_lock->flock.l_len;
e7a346
         flock.l_type  = F_UNLCK;
e7a346
 
e7a346
-        full_flock.l_type = F_UNLCK;
e7a346
-        call_count = afr_locked_nodes_count (inodelk->locked_nodes,
e7a346
+        call_count = afr_locked_nodes_count (int_lock->locked_nodes,
e7a346
                                              priv->child_count);
e7a346
 
e7a346
         int_lock->lk_call_count = call_count;
e7a346
 
e7a346
         if (!call_count) {
e7a346
+                GF_ASSERT (!local->transaction.do_eager_unlock);
e7a346
                 gf_msg_trace (this->name, 0,
e7a346
                               "No internal locks unlocked");
e7a346
 
e7a346
@@ -367,64 +324,28 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
e7a346
                 goto out;
e7a346
         }
e7a346
 
e7a346
-        if (local->fd)
e7a346
-                fd_ctx = afr_fd_ctx_get (local->fd, this);
e7a346
-
e7a346
         for (i = 0; i < priv->child_count; i++) {
e7a346
-                if ((inodelk->locked_nodes[i] & LOCKED_YES) != LOCKED_YES)
e7a346
+                if ((int_lock->locked_nodes[i] & LOCKED_YES) != LOCKED_YES)
e7a346
                         continue;
e7a346
 
e7a346
                 if (local->fd) {
e7a346
-                        flock_use = &flock;
e7a346
-                        if (!local->transaction.eager_lock[i]) {
e7a346
-                                goto wind;
e7a346
-                        }
e7a346
-
e7a346
-                        piggyback = 0;
e7a346
-
e7a346
-                        LOCK (&local->fd->lock);
e7a346
-                        {
e7a346
-                                if (fd_ctx->lock_piggyback[i]) {
e7a346
-                                        fd_ctx->lock_piggyback[i]--;
e7a346
-                                        piggyback = 1;
e7a346
-                                } else {
e7a346
-                                        fd_ctx->lock_acquired[i]--;
e7a346
-                                }
e7a346
-                        }
e7a346
-                        UNLOCK (&local->fd->lock);
e7a346
-
e7a346
-                        if (piggyback) {
e7a346
-                                afr_unlock_inodelk_cbk (frame, (void *) (long) i,
e7a346
-                                                        this, 1, 0, NULL);
e7a346
-                                if (!--call_count)
e7a346
-                                        break;
e7a346
-                                continue;
e7a346
-                        }
e7a346
-
e7a346
-                        flock_use = &full_flock;
e7a346
-                wind:
e7a346
                         STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk,
e7a346
                                            (void *) (long)i,
e7a346
                                            priv->children[i],
e7a346
                                            priv->children[i]->fops->finodelk,
e7a346
                                            int_lock->domain, local->fd,
e7a346
-                                           F_SETLK, flock_use, NULL);
e7a346
-
e7a346
-                        if (!--call_count)
e7a346
-                                break;
e7a346
-
e7a346
+                                           F_SETLK, &flock, NULL);
e7a346
                 } else {
e7a346
-
e7a346
                         STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk,
e7a346
                                            (void *) (long)i,
e7a346
                                            priv->children[i],
e7a346
                                            priv->children[i]->fops->inodelk,
e7a346
                                            int_lock->domain, &local->loc,
e7a346
                                            F_SETLK, &flock, NULL);
e7a346
-
e7a346
-                        if (!--call_count)
e7a346
-                                break;
e7a346
                 }
e7a346
+
e7a346
+                if (!--call_count)
e7a346
+                        break;
e7a346
         }
e7a346
 out:
e7a346
         return 0;
e7a346
@@ -512,6 +433,18 @@ out:
e7a346
 
e7a346
 }
e7a346
 
e7a346
+int32_t
e7a346
+afr_unlock_now (call_frame_t *frame, xlator_t *this)
e7a346
+{
e7a346
+        afr_local_t *local = frame->local;
e7a346
+
e7a346
+        if (afr_is_inodelk_transaction(local->transaction.type))
e7a346
+                afr_unlock_inodelk (frame, this);
e7a346
+        else
e7a346
+                afr_unlock_entrylk (frame, this);
e7a346
+        return 0;
e7a346
+}
e7a346
+
e7a346
 static int32_t
e7a346
 afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
               int32_t op_ret, int32_t op_errno, dict_t *xdata)
e7a346
@@ -553,7 +486,7 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
 
e7a346
         if ((op_ret == -1) &&
e7a346
             (op_errno == ENOSYS)) {
e7a346
-                afr_unlock (frame, this);
e7a346
+                afr_unlock_now (frame, this);
e7a346
         } else {
e7a346
                 if (op_ret == 0) {
e7a346
                         if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
e7a346
@@ -598,38 +531,6 @@ afr_blocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
         return 0;
e7a346
 }
e7a346
 
e7a346
-static int
e7a346
-afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this)
e7a346
-{
e7a346
-        afr_internal_lock_t *int_lock = NULL;
e7a346
-        afr_inodelk_t       *inodelk  = NULL;
e7a346
-        afr_local_t         *local    = NULL;
e7a346
-        afr_private_t       *priv     = NULL;
e7a346
-
e7a346
-        priv     = this->private;
e7a346
-        local    = frame->local;
e7a346
-        int_lock = &local->internal_lock;
e7a346
-
e7a346
-        switch (local->transaction.type) {
e7a346
-        case AFR_DATA_TRANSACTION:
e7a346
-        case AFR_METADATA_TRANSACTION:
e7a346
-                inodelk = afr_get_inodelk (int_lock, int_lock->domain);
e7a346
-                memcpy (inodelk->locked_nodes, int_lock->locked_nodes,
e7a346
-                        sizeof (*inodelk->locked_nodes) * priv->child_count);
e7a346
-                inodelk->lock_count = int_lock->lock_count;
e7a346
-                break;
e7a346
-
e7a346
-        case AFR_ENTRY_RENAME_TRANSACTION:
e7a346
-        case AFR_ENTRY_TRANSACTION:
e7a346
-                /*entrylk_count is being used in both non-blocking and blocking
e7a346
-                 * modes */
e7a346
-                break;
e7a346
-        }
e7a346
-
e7a346
-        return 0;
e7a346
-
e7a346
-}
e7a346
-
e7a346
 static gf_boolean_t
e7a346
 afr_is_entrylk (afr_transaction_type trans_type)
e7a346
 {
e7a346
@@ -733,7 +634,6 @@ int
e7a346
 afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
e7a346
 {
e7a346
         afr_internal_lock_t *int_lock    = NULL;
e7a346
-        afr_inodelk_t       *inodelk     = NULL;
e7a346
         afr_local_t         *local       = NULL;
e7a346
         afr_private_t       *priv        = NULL;
e7a346
         struct gf_flock flock = {0,};
e7a346
@@ -752,10 +652,9 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
e7a346
 
e7a346
 
e7a346
         if (!is_entrylk) {
e7a346
-                inodelk = afr_get_inodelk (int_lock, int_lock->domain);
e7a346
-                flock.l_start = inodelk->flock.l_start;
e7a346
-                flock.l_len   = inodelk->flock.l_len;
e7a346
-                flock.l_type  = inodelk->flock.l_type;
e7a346
+                flock.l_start = int_lock->flock.l_start;
e7a346
+                flock.l_len   = int_lock->flock.l_len;
e7a346
+                flock.l_type  = int_lock->flock.l_type;
e7a346
         }
e7a346
 
e7a346
         if (local->fd) {
e7a346
@@ -770,9 +669,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
e7a346
                         local->op_ret           = -1;
e7a346
                         int_lock->lock_op_ret   = -1;
e7a346
 
e7a346
-                        afr_copy_locked_nodes (frame, this);
e7a346
-
e7a346
-                        afr_unlock (frame, this);
e7a346
+                        afr_unlock_now (frame, this);
e7a346
 
e7a346
                         return 0;
e7a346
                 }
e7a346
@@ -784,9 +681,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
e7a346
                         local->op_ret           = -1;
e7a346
                         int_lock->lock_op_ret   = -1;
e7a346
 
e7a346
-                        afr_copy_locked_nodes (frame, this);
e7a346
-
e7a346
-                        afr_unlock(frame, this);
e7a346
+                        afr_unlock_now(frame, this);
e7a346
 
e7a346
                         return 0;
e7a346
                 }
e7a346
@@ -798,8 +693,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
e7a346
                 gf_msg_debug (this->name, 0,
e7a346
                               "we're done locking");
e7a346
 
e7a346
-                afr_copy_locked_nodes (frame, this);
e7a346
-
e7a346
                 int_lock->lock_op_ret = 0;
e7a346
                 int_lock->lock_cbk (frame, this);
e7a346
                 return 0;
e7a346
@@ -815,7 +708,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
e7a346
         case AFR_METADATA_TRANSACTION:
e7a346
 
e7a346
                 if (local->fd) {
e7a346
-
e7a346
                         STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk,
e7a346
                                            (void *) (long) child_index,
e7a346
                                            priv->children[child_index],
e7a346
@@ -824,7 +716,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
e7a346
                                            F_SETLKW, &flock, NULL);
e7a346
 
e7a346
                 } else {
e7a346
-
e7a346
                         STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk,
e7a346
                                            (void *) (long) child_index,
e7a346
                                            priv->children[child_index],
e7a346
@@ -841,7 +732,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
e7a346
                  *and 'fd-less' children */
e7a346
 
e7a346
                 if (local->fd) {
e7a346
-
e7a346
                         STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk,
e7a346
                                            (void *) (long) cookie,
e7a346
                                            priv->children[child_index],
e7a346
@@ -850,7 +740,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
e7a346
                                            int_lock->lockee[lockee_no].basename,
e7a346
                                            ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
e7a346
                 } else {
e7a346
-
e7a346
                         STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk,
e7a346
                                            (void *) (long) cookie,
e7a346
                                            priv->children[child_index],
e7a346
@@ -922,7 +811,6 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
         local    = frame->local;
e7a346
         int_lock = &local->internal_lock;
e7a346
 
e7a346
-
e7a346
 	LOCK (&frame->lock);
e7a346
 	{
e7a346
 		if (op_ret < 0 ) {
e7a346
@@ -969,7 +857,7 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
                                       "with blocking calls",
e7a346
                                       int_lock->lock_count);
e7a346
 
e7a346
-                        afr_unlock(frame, this);
e7a346
+                        afr_unlock_now(frame, this);
e7a346
                 }
e7a346
         }
e7a346
 
e7a346
@@ -1009,7 +897,7 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
e7a346
                         local->op_errno         = EINVAL;
e7a346
                         int_lock->lock_op_errno = EINVAL;
e7a346
 
e7a346
-			afr_unlock (frame, this);
e7a346
+			afr_unlock_now (frame, this);
e7a346
                         return -1;
e7a346
                 }
e7a346
 
e7a346
@@ -1021,7 +909,7 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
e7a346
                         gf_msg (this->name, GF_LOG_INFO, 0,
e7a346
                                 AFR_MSG_INFO_COMMON,
e7a346
                                 "fd not open on any subvolumes. aborting.");
e7a346
-                        afr_unlock (frame, this);
e7a346
+                        afr_unlock_now (frame, this);
e7a346
                         goto out;
e7a346
                 }
e7a346
 
e7a346
@@ -1031,7 +919,6 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
e7a346
                         index = i%copies;
e7a346
                         lockee_no = i/copies;
e7a346
                         if (local->child_up[index]) {
e7a346
-
e7a346
                                 STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk,
e7a346
                                                    (void *) (long) i,
e7a346
                                                    priv->children[index],
e7a346
@@ -1053,7 +940,6 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
e7a346
                         index = i%copies;
e7a346
                         lockee_no = i/copies;
e7a346
                         if (local->child_up[index]) {
e7a346
-
e7a346
                                 STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk,
e7a346
                                                    (void *) (long) i,
e7a346
                                                    priv->children[index],
e7a346
@@ -1077,18 +963,12 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
                              int32_t op_ret, int32_t op_errno, dict_t *xdata)
e7a346
 {
e7a346
         afr_internal_lock_t *int_lock    = NULL;
e7a346
-        afr_inodelk_t       *inodelk     = NULL;
e7a346
         afr_local_t         *local       = NULL;
e7a346
-        afr_fd_ctx_t        *fd_ctx      = NULL;
e7a346
         int                  call_count  = 0;
e7a346
         int                  child_index = (long) cookie;
e7a346
 
e7a346
         local    = frame->local;
e7a346
         int_lock = &local->internal_lock;
e7a346
-        inodelk = afr_get_inodelk (int_lock, int_lock->domain);
e7a346
-
e7a346
-	if (local->fd)
e7a346
-		fd_ctx = afr_fd_ctx_get (local->fd, this);
e7a346
 
e7a346
         LOCK (&frame->lock);
e7a346
         {
e7a346
@@ -1105,43 +985,27 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
 				int_lock->lock_op_errno      = op_errno;
e7a346
 				local->op_errno              = op_errno;
e7a346
 			}
e7a346
-			if (local->transaction.eager_lock)
e7a346
-				local->transaction.eager_lock[child_index] = 0;
e7a346
 		} else {
e7a346
-			inodelk->locked_nodes[child_index] |= LOCKED_YES;
e7a346
-			inodelk->lock_count++;
e7a346
-
e7a346
-			if (local->transaction.eager_lock &&
e7a346
-			    local->transaction.eager_lock[child_index] &&
e7a346
-			    local->fd) {
e7a346
-				/* piggybacked */
e7a346
-				if (op_ret == 1) {
e7a346
-					/* piggybacked */
e7a346
-				} else if (op_ret == 0) {
e7a346
-					/* lock acquired from server */
e7a346
-                                        fd_ctx->lock_acquired[child_index]++;
e7a346
-				}
e7a346
-			}
e7a346
-
e7a346
-                        if (local->transaction.type == AFR_DATA_TRANSACTION &&
e7a346
-                            op_ret == 0) {
e7a346
-                                LOCK(&local->inode->lock);
e7a346
-                                {
e7a346
-                                        local->inode_ctx->lock_count++;
e7a346
-                                }
e7a346
-                                UNLOCK (&local->inode->lock);
e7a346
-                        }
e7a346
+			int_lock->locked_nodes[child_index] |= LOCKED_YES;
e7a346
+			int_lock->lock_count++;
e7a346
 		}
e7a346
 
e7a346
                 call_count = --int_lock->lk_call_count;
e7a346
         }
e7a346
         UNLOCK (&frame->lock);
e7a346
 
e7a346
+        if (op_ret == 0 && local->transaction.type == AFR_DATA_TRANSACTION) {
e7a346
+                LOCK (&local->inode->lock);
e7a346
+                {
e7a346
+                        local->inode_ctx->lock_count++;
e7a346
+                }
e7a346
+                UNLOCK (&local->inode->lock);
e7a346
+        }
e7a346
         if (call_count == 0) {
e7a346
                 gf_msg_trace (this->name, 0,
e7a346
                               "Last inode locking reply received");
e7a346
                 /* all locks successful. Proceed to call FOP */
e7a346
-                if (inodelk->lock_count == int_lock->lk_expected_count) {
e7a346
+                if (int_lock->lock_count == int_lock->lk_expected_count) {
e7a346
                         gf_msg_trace (this->name, 0,
e7a346
                                       "All servers locked. Calling the cbk");
e7a346
                         int_lock->lock_op_ret = 0;
e7a346
@@ -1155,7 +1019,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
                                       "Trying again with blocking calls",
e7a346
                                       int_lock->lock_count);
e7a346
 
e7a346
-                        afr_unlock(frame, this);
e7a346
+                        afr_unlock_now(frame, this);
e7a346
                 }
e7a346
         }
e7a346
 
e7a346
@@ -1166,30 +1030,17 @@ int
e7a346
 afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
e7a346
 {
e7a346
         afr_internal_lock_t *int_lock = NULL;
e7a346
-        afr_inodelk_t       *inodelk  = NULL;
e7a346
         afr_local_t         *local    = NULL;
e7a346
         afr_private_t       *priv     = NULL;
e7a346
         afr_fd_ctx_t        *fd_ctx   = NULL;
e7a346
         int32_t             call_count = 0;
e7a346
         int                 i          = 0;
e7a346
         int                 ret        = 0;
e7a346
-        struct              gf_flock flock = {0,};
e7a346
-        struct              gf_flock full_flock = {0,};
e7a346
-        struct              gf_flock *flock_use = NULL;
e7a346
-        int                 piggyback = 0;
e7a346
 
e7a346
         local    = frame->local;
e7a346
         int_lock = &local->internal_lock;
e7a346
         priv     = this->private;
e7a346
 
e7a346
-        inodelk = afr_get_inodelk (int_lock, int_lock->domain);
e7a346
-
e7a346
-        flock.l_start = inodelk->flock.l_start;
e7a346
-        flock.l_len   = inodelk->flock.l_len;
e7a346
-        flock.l_type  = inodelk->flock.l_type;
e7a346
-
e7a346
-        full_flock.l_type = inodelk->flock.l_type;
e7a346
-
e7a346
         initialize_inodelk_variables (frame, this);
e7a346
 
e7a346
         if (local->fd) {
e7a346
@@ -1205,88 +1056,48 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
e7a346
                         local->op_errno         = EINVAL;
e7a346
                         int_lock->lock_op_errno = EINVAL;
e7a346
 
e7a346
-			afr_unlock (frame, this);
e7a346
+			afr_unlock_now (frame, this);
e7a346
                         ret = -1;
e7a346
                         goto out;
e7a346
                 }
e7a346
+        }
e7a346
 
e7a346
-                call_count = internal_lock_count (frame, this);
e7a346
-                int_lock->lk_call_count = call_count;
e7a346
-                int_lock->lk_expected_count = call_count;
e7a346
-
e7a346
-                if (!call_count) {
e7a346
-                        gf_msg (this->name, GF_LOG_INFO, 0,
e7a346
-                                AFR_MSG_SUBVOLS_DOWN,
e7a346
-                                "All bricks are down, aborting.");
e7a346
-                        afr_unlock (frame, this);
e7a346
-                        goto out;
e7a346
-                }
e7a346
-
e7a346
-                /* Send non-blocking inodelk calls only on up children
e7a346
-                   and where the fd has been opened */
e7a346
-                for (i = 0; i < priv->child_count; i++) {
e7a346
-                        if (!local->child_up[i])
e7a346
-                                continue;
e7a346
-
e7a346
-                        flock_use = &flock;
e7a346
-                        if (!local->transaction.eager_lock_on) {
e7a346
-                                goto wind;
e7a346
-                        }
e7a346
-
e7a346
-                        piggyback = 0;
e7a346
-                        local->transaction.eager_lock[i] = 1;
e7a346
-
e7a346
-			afr_set_delayed_post_op (frame, this);
e7a346
+        call_count = internal_lock_count (frame, this);
e7a346
+        int_lock->lk_call_count = call_count;
e7a346
+        int_lock->lk_expected_count = call_count;
e7a346
 
e7a346
-                        LOCK (&local->fd->lock);
e7a346
-                        {
e7a346
-                                if (fd_ctx->lock_acquired[i]) {
e7a346
-                                        fd_ctx->lock_piggyback[i]++;
e7a346
-                                        piggyback = 1;
e7a346
-                                }
e7a346
-                        }
e7a346
-                        UNLOCK (&local->fd->lock);
e7a346
+        if (!call_count) {
e7a346
+                gf_msg (this->name, GF_LOG_INFO, 0,
e7a346
+                        AFR_MSG_SUBVOLS_DOWN,
e7a346
+                        "All bricks are down, aborting.");
e7a346
+                afr_unlock_now (frame, this);
e7a346
+                goto out;
e7a346
+        }
e7a346
 
e7a346
-                        if (piggyback) {
e7a346
-                                /* (op_ret == 1) => indicate piggybacked lock */
e7a346
-                                afr_nonblocking_inodelk_cbk (frame, (void *) (long) i,
e7a346
-                                                             this, 1, 0, NULL);
e7a346
-                                if (!--call_count)
e7a346
-                                        break;
e7a346
-                                continue;
e7a346
-                        }
e7a346
-                        flock_use = &full_flock;
e7a346
-                wind:
e7a346
+        /* Send non-blocking inodelk calls only on up children
e7a346
+           and where the fd has been opened */
e7a346
+        for (i = 0; i < priv->child_count; i++) {
e7a346
+                if (!local->child_up[i])
e7a346
+                        continue;
e7a346
 
e7a346
+                if (local->fd) {
e7a346
                         STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk,
e7a346
                                            (void *) (long) i,
e7a346
                                            priv->children[i],
e7a346
                                            priv->children[i]->fops->finodelk,
e7a346
                                            int_lock->domain, local->fd,
e7a346
-                                           F_SETLK, flock_use, NULL);
e7a346
-
e7a346
-                        if (!--call_count)
e7a346
-                                break;
e7a346
-                }
e7a346
-        } else {
e7a346
-                call_count = internal_lock_count (frame, this);
e7a346
-                int_lock->lk_call_count = call_count;
e7a346
-                int_lock->lk_expected_count = call_count;
e7a346
-
e7a346
-                for (i = 0; i < priv->child_count; i++) {
e7a346
-                        if (!local->child_up[i])
e7a346
-                                continue;
e7a346
+                                           F_SETLK, &int_lock->flock, NULL);
e7a346
+                } else {
e7a346
 
e7a346
                         STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk,
e7a346
                                            (void *) (long) i,
e7a346
                                            priv->children[i],
e7a346
                                            priv->children[i]->fops->inodelk,
e7a346
                                            int_lock->domain, &local->loc,
e7a346
-                                           F_SETLK, &flock, NULL);
e7a346
-
e7a346
-                        if (!--call_count)
e7a346
-                                break;
e7a346
+                                           F_SETLK, &int_lock->flock, NULL);
e7a346
                 }
e7a346
+                if (!--call_count)
e7a346
+                        break;
e7a346
         }
e7a346
 out:
e7a346
         return ret;
e7a346
@@ -1296,13 +1107,32 @@ int32_t
e7a346
 afr_unlock (call_frame_t *frame, xlator_t *this)
e7a346
 {
e7a346
         afr_local_t *local = NULL;
e7a346
+        afr_lock_t  *lock  = NULL;
e7a346
 
e7a346
         local = frame->local;
e7a346
 
e7a346
-        if (afr_is_inodelk_transaction(local->transaction.type))
e7a346
-                afr_unlock_inodelk (frame, this);
e7a346
-        else
e7a346
-                afr_unlock_entrylk (frame, this);
e7a346
+        if (!local->transaction.eager_lock_on)
e7a346
+                goto out;
e7a346
+        lock = &local->inode_ctx->lock[local->transaction.type];
e7a346
+        LOCK (&local->inode->lock);
e7a346
+        {
e7a346
+                list_del_init (&local->transaction.owner_list);
e7a346
+                if (list_empty (&lock->owners) && list_empty (&lock->post_op)) {
e7a346
+                        local->transaction.do_eager_unlock = _gf_true;
e7a346
+        /*TODO: Need to get metadata use on_disk and inherit/uninherit
e7a346
+         *GF_ASSERT (!local->inode_ctx->on_disk[local->transaction.type]);
e7a346
+         *GF_ASSERT (!local->inode_ctx->inherited[local->transaction.type]);
e7a346
+        */
e7a346
+                        GF_ASSERT (lock->release);
e7a346
+                }
e7a346
+        }
e7a346
+        UNLOCK (&local->inode->lock);
e7a346
+        if (!local->transaction.do_eager_unlock) {
e7a346
+                local->internal_lock.lock_cbk (frame, this);
e7a346
+                return 0;
e7a346
+        }
e7a346
 
e7a346
+out:
e7a346
+        afr_unlock_now (frame, this);
e7a346
         return 0;
e7a346
 }
e7a346
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
e7a346
index f61b237..32fd24a 100644
e7a346
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
e7a346
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
e7a346
@@ -2463,6 +2463,7 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid)
e7a346
         int           data_ret          = 1;
e7a346
         int           or_ret            = 0;
e7a346
         inode_t      *inode             = NULL;
e7a346
+        fd_t         *fd                = NULL;
e7a346
 	gf_boolean_t  data_selfheal     = _gf_false;
e7a346
 	gf_boolean_t  metadata_selfheal = _gf_false;
e7a346
 	gf_boolean_t  entry_selfheal    = _gf_false;
e7a346
@@ -2487,8 +2488,16 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid)
e7a346
                 goto out;
e7a346
         }
e7a346
 
e7a346
+        if (inode->ia_type == IA_IFREG) {
e7a346
+                ret = afr_selfheal_data_open (this, inode, &fd);
e7a346
+                if (!fd) {
e7a346
+                        ret = -EIO;
e7a346
+                        goto out;
e7a346
+                }
e7a346
+        }
e7a346
+
e7a346
 	if (data_selfheal && dataheal_enabled)
e7a346
-                data_ret = afr_selfheal_data (frame, this, inode);
e7a346
+                data_ret = afr_selfheal_data (frame, this, fd);
e7a346
 
e7a346
 	if (metadata_selfheal && priv->metadata_self_heal)
e7a346
                 metadata_ret = afr_selfheal_metadata (frame, this, inode);
e7a346
@@ -2510,6 +2519,8 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid)
e7a346
 out:
e7a346
         if (inode)
e7a346
                 inode_unref (inode);
e7a346
+        if (fd)
e7a346
+                fd_unref (fd);
e7a346
         return ret;
e7a346
 }
e7a346
 /*
e7a346
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
e7a346
index bcd0dec..f872a98 100644
e7a346
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
e7a346
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
e7a346
@@ -856,22 +856,15 @@ out:
e7a346
 }
e7a346
 
e7a346
 int
e7a346
-afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode)
e7a346
+afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd)
e7a346
 {
e7a346
 	afr_private_t *priv = NULL;
e7a346
 	unsigned char *locked_on = NULL;
e7a346
 	int ret = 0;
e7a346
-	fd_t *fd = NULL;
e7a346
+        inode_t *inode = fd->inode;
e7a346
 
e7a346
 	priv = this->private;
e7a346
 
e7a346
-	ret = afr_selfheal_data_open (this, inode, &fd);
e7a346
-	if (!fd) {
e7a346
-                gf_msg_debug (this->name, -ret, "%s: Failed to open",
e7a346
-                              uuid_utoa (inode->gfid));
e7a346
-                return -EIO;
e7a346
-        }
e7a346
-
e7a346
 	locked_on = alloca0 (priv->child_count);
e7a346
 
e7a346
 	ret = afr_selfheal_tie_breaker_inodelk (frame, this, inode,
e7a346
@@ -898,8 +891,5 @@ unlock:
e7a346
 	afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0,
e7a346
 	                        locked_on);
e7a346
 
e7a346
-	if (fd)
e7a346
-		fd_unref (fd);
e7a346
-
e7a346
 	return ret;
e7a346
 }
e7a346
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
e7a346
index 188a334..b015976 100644
e7a346
--- a/xlators/cluster/afr/src/afr-self-heal.h
e7a346
+++ b/xlators/cluster/afr/src/afr-self-heal.h
e7a346
@@ -102,7 +102,7 @@ afr_selfheal_name (xlator_t *this, uuid_t gfid, const char *name,
e7a346
                    void *gfid_req, dict_t *xdata);
e7a346
 
e7a346
 int
e7a346
-afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode);
e7a346
+afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd);
e7a346
 
e7a346
 int
e7a346
 afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode);
e7a346
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
e7a346
index acbfe1a..993029d 100644
e7a346
--- a/xlators/cluster/afr/src/afr-transaction.c
e7a346
+++ b/xlators/cluster/afr/src/afr-transaction.c
e7a346
@@ -25,6 +25,18 @@ typedef enum {
e7a346
         AFR_TRANSACTION_POST_OP,
e7a346
 } afr_xattrop_type_t;
e7a346
 
e7a346
+static void
e7a346
+afr_lock_resume_shared (struct list_head *list);
e7a346
+
e7a346
+void
e7a346
+__afr_transaction_wake_shared (afr_local_t *local, struct list_head *shared);
e7a346
+
e7a346
+void
e7a346
+afr_changelog_post_op (call_frame_t *frame, xlator_t *this);
e7a346
+
e7a346
+int
e7a346
+afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this);
e7a346
+
e7a346
 gf_boolean_t
e7a346
 afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this);
e7a346
 
e7a346
@@ -168,13 +180,14 @@ afr_transaction_fop (call_frame_t *frame, xlator_t *this)
e7a346
         return 0;
e7a346
 }
e7a346
 
e7a346
-
e7a346
 int
e7a346
 afr_transaction_done (call_frame_t *frame, xlator_t *this)
e7a346
 {
e7a346
-        afr_local_t *local = NULL;
e7a346
-        afr_private_t *priv = NULL;
e7a346
-        gf_boolean_t unwind = _gf_false;
e7a346
+        afr_local_t   *local      = NULL;
e7a346
+        afr_private_t *priv       = NULL;
e7a346
+        gf_boolean_t  unwind      = _gf_false;
e7a346
+        afr_lock_t    *lock       = NULL;
e7a346
+        afr_local_t   *lock_local = NULL;
e7a346
 
e7a346
         priv  = this->private;
e7a346
         local = frame->local;
e7a346
@@ -188,6 +201,31 @@ afr_transaction_done (call_frame_t *frame, xlator_t *this)
e7a346
                 if (unwind)/*It definitely did post-op*/
e7a346
                         afr_zero_fill_stat (local);
e7a346
         }
e7a346
+
e7a346
+        if (local->transaction.do_eager_unlock) {
e7a346
+                lock = &local->inode_ctx->lock[local->transaction.type];
e7a346
+                LOCK (&local->inode->lock);
e7a346
+                {
e7a346
+                        lock->acquired = _gf_false;
e7a346
+                        lock->release = _gf_false;
e7a346
+                        list_splice_init (&lock->frozen,
e7a346
+                                          &lock->waiting);
e7a346
+                        if (list_empty (&lock->waiting))
e7a346
+                                goto unlock;
e7a346
+                        lock_local = list_entry (lock->waiting.next,
e7a346
+                                                 afr_local_t,
e7a346
+                                                transaction.wait_list);
e7a346
+                        list_del_init (&lock_local->transaction.wait_list);
e7a346
+                        list_add (&lock_local->transaction.owner_list,
e7a346
+                                  &lock->owners);
e7a346
+                }
e7a346
+unlock:
e7a346
+                UNLOCK (&local->inode->lock);
e7a346
+        }
e7a346
+        if (lock_local) {
e7a346
+                afr_lock (lock_local->transaction.frame,
e7a346
+                          lock_local->transaction.frame->this);
e7a346
+        }
e7a346
         local->transaction.unwind (frame, this);
e7a346
 
e7a346
         AFR_STACK_DESTROY (frame);
e7a346
@@ -195,6 +233,52 @@ afr_transaction_done (call_frame_t *frame, xlator_t *this)
e7a346
         return 0;
e7a346
 }
e7a346
 
e7a346
+static void
e7a346
+afr_lock_fail_shared (afr_local_t *local, struct list_head *list)
e7a346
+{
e7a346
+        afr_local_t *each = NULL;
e7a346
+
e7a346
+        while (!list_empty(list)) {
e7a346
+                each = list_entry (list->next, afr_local_t,
e7a346
+                                   transaction.wait_list);
e7a346
+                list_del_init(&each->transaction.wait_list);
e7a346
+                each->op_ret = -1;
e7a346
+                each->op_errno = local->op_errno;
e7a346
+                afr_transaction_done (each->transaction.frame,
e7a346
+                                      each->transaction.frame->this);
e7a346
+        }
e7a346
+}
e7a346
+
e7a346
+static void
e7a346
+afr_handle_lock_acquire_failure (afr_local_t *local, gf_boolean_t locked)
e7a346
+{
e7a346
+        struct list_head shared;
e7a346
+        afr_lock_t *lock = NULL;
e7a346
+
e7a346
+        if (!local->transaction.eager_lock_on)
e7a346
+                goto out;
e7a346
+
e7a346
+        lock = &local->inode_ctx->lock[local->transaction.type];
e7a346
+
e7a346
+        INIT_LIST_HEAD (&shared);
e7a346
+        LOCK (&local->inode->lock);
e7a346
+        {
e7a346
+                list_splice_init (&lock->waiting, &shared);
e7a346
+        }
e7a346
+        UNLOCK (&local->inode->lock);
e7a346
+
e7a346
+        afr_lock_fail_shared (local, &shared);
e7a346
+        local->transaction.do_eager_unlock = _gf_true;
e7a346
+out:
e7a346
+        if (locked) {
e7a346
+                local->internal_lock.lock_cbk = afr_transaction_done;
e7a346
+                afr_unlock (local->transaction.frame,
e7a346
+                            local->transaction.frame->this);
e7a346
+        } else {
e7a346
+                afr_transaction_done (local->transaction.frame,
e7a346
+                                      local->transaction.frame->this);
e7a346
+        }
e7a346
+}
e7a346
 
e7a346
 call_frame_t*
e7a346
 afr_transaction_detach_fop_frame (call_frame_t *frame)
e7a346
@@ -334,6 +418,7 @@ afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this)
e7a346
         afr_local_t *local = NULL;
e7a346
         afr_private_t *priv = NULL;
e7a346
         int pre_op_sources_count = 0;
e7a346
+        int i = 0;
e7a346
 
e7a346
         priv = this->private;
e7a346
         local = frame->local;
e7a346
@@ -345,11 +430,11 @@ afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this)
e7a346
         /* If arbiter is the only source, do not proceed. */
e7a346
         if (pre_op_sources_count < 2 &&
e7a346
             local->transaction.pre_op_sources[ARBITER_BRICK_INDEX]) {
e7a346
-                local->internal_lock.lock_cbk = afr_transaction_done;
e7a346
                 local->op_ret = -1;
e7a346
                 local->op_errno =  ENOTCONN;
e7a346
-                afr_restore_lk_owner (frame);
e7a346
-                afr_unlock (frame, this);
e7a346
+                for (i = 0; i < priv->child_count; i++)
e7a346
+                        local->transaction.failed_subvols[i] = 1;
e7a346
+                afr_changelog_post_op (frame, this);/*uninherit should happen*/
e7a346
         } else {
e7a346
                 afr_transaction_fop (frame, this);
e7a346
         }
e7a346
@@ -362,14 +447,16 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
e7a346
 {
e7a346
         afr_local_t   *local = NULL;
e7a346
         afr_private_t *priv  = NULL;
e7a346
-        fd_t          *fd    = NULL;
e7a346
         int           i      = 0;
e7a346
         int           ret    = 0;
e7a346
+        int     failure_count = 0;
e7a346
+        struct list_head shared;
e7a346
+        afr_lock_t *lock = NULL;
e7a346
 
e7a346
         local = frame->local;
e7a346
         priv = this->private;
e7a346
-        fd    = local->fd;
e7a346
 
e7a346
+        INIT_LIST_HEAD (&shared);
e7a346
         if (local->transaction.type == AFR_DATA_TRANSACTION &&
e7a346
             !local->transaction.inherited) {
e7a346
                 ret = afr_write_subvol_set (frame, this);
e7a346
@@ -394,22 +481,31 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
e7a346
 		   just now, before OP */
e7a346
 		afr_changelog_pre_op_update (frame, this);
e7a346
 
e7a346
-        /* The wake up needs to happen independent of
e7a346
-           what type of fop arrives here. If it was
e7a346
-           a write, then it has already inherited the
e7a346
-           lock and changelog. If it was not a write,
e7a346
-           then the presumption of the optimization (of
e7a346
-           optimizing for successive write operations)
e7a346
-           fails.
e7a346
-        */
e7a346
-        if (fd)
e7a346
-                afr_delayed_changelog_wake_up (this, fd);
e7a346
+        if (!local->transaction.eager_lock_on ||
e7a346
+            local->transaction.inherited)
e7a346
+                goto fop;
e7a346
+        failure_count = AFR_COUNT (local->transaction.failed_subvols,
e7a346
+                                   priv->child_count);
e7a346
+        if (failure_count == priv->child_count) {
e7a346
+                afr_handle_lock_acquire_failure (local, _gf_true);
e7a346
+        } else {
e7a346
+                lock = &local->inode_ctx->lock[local->transaction.type];
e7a346
+                LOCK (&local->inode->lock);
e7a346
+                {
e7a346
+                        lock->acquired = _gf_true;
e7a346
+                        __afr_transaction_wake_shared (local, &shared);
e7a346
+                }
e7a346
+                UNLOCK (&local->inode->lock);
e7a346
+        }
e7a346
+
e7a346
+fop:
e7a346
         if (priv->arbiter_count == 1) {
e7a346
                 afr_txn_arbitrate_fop (frame, this);
e7a346
         } else {
e7a346
                 afr_transaction_fop (frame, this);
e7a346
         }
e7a346
 
e7a346
+        afr_lock_resume_shared (&shared);
e7a346
 	return 0;
e7a346
 }
e7a346
 
e7a346
@@ -486,30 +582,14 @@ afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this)
e7a346
 }
e7a346
 
e7a346
 
e7a346
-afr_inodelk_t*
e7a346
-afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom)
e7a346
-{
e7a346
-        afr_inodelk_t *inodelk = NULL;
e7a346
-        int           i = 0;
e7a346
-
e7a346
-        for (i = 0; int_lock->inodelk[i].domain; i++) {
e7a346
-                inodelk = &int_lock->inodelk[i];
e7a346
-                if (strcmp (dom, inodelk->domain) == 0)
e7a346
-                        return inodelk;
e7a346
-        }
e7a346
-        return NULL;
e7a346
-}
e7a346
-
e7a346
 unsigned char*
e7a346
 afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock)
e7a346
 {
e7a346
         unsigned char *locked_nodes = NULL;
e7a346
-        afr_inodelk_t *inodelk = NULL;
e7a346
         switch (type) {
e7a346
         case AFR_DATA_TRANSACTION:
e7a346
         case AFR_METADATA_TRANSACTION:
e7a346
-                inodelk = afr_get_inodelk (int_lock, int_lock->domain);
e7a346
-                locked_nodes = inodelk->locked_nodes;
e7a346
+                locked_nodes = int_lock->locked_nodes;
e7a346
         break;
e7a346
 
e7a346
         case AFR_ENTRY_TRANSACTION:
e7a346
@@ -834,27 +914,19 @@ afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this)
e7a346
 {
e7a346
 	afr_local_t *local = NULL;
e7a346
 	afr_private_t *priv = NULL;
e7a346
-	fd_t *fd = NULL;
e7a346
+        afr_inode_ctx_t *ctx = NULL;
e7a346
 	int i = 0;
e7a346
 	gf_boolean_t ret = _gf_false;
e7a346
-	afr_fd_ctx_t *fd_ctx = NULL;
e7a346
 	int type = 0;
e7a346
 
e7a346
 	local = frame->local;
e7a346
 	priv = this->private;
e7a346
-	fd = local->fd;
e7a346
+        ctx = local->inode_ctx;
e7a346
 
e7a346
 	type = afr_index_for_transaction_type (local->transaction.type);
e7a346
 	if (type != AFR_DATA_TRANSACTION)
e7a346
 		return !local->transaction.dirtied;
e7a346
 
e7a346
-	if (!fd)
e7a346
-		return !local->transaction.dirtied;
e7a346
-
e7a346
-	fd_ctx = afr_fd_ctx_get (fd, this);
e7a346
-	if (!fd_ctx)
e7a346
-		return _gf_false;
e7a346
-
e7a346
 	if (local->transaction.no_uninherit)
e7a346
 		return _gf_false;
e7a346
 
e7a346
@@ -868,34 +940,34 @@ afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this)
e7a346
 	if (local->transaction.uninherit_done)
e7a346
 		return local->transaction.uninherit_value;
e7a346
 
e7a346
-	LOCK(&fd->lock);
e7a346
+	LOCK(&local->inode->lock);
e7a346
 	{
e7a346
 		for (i = 0; i < priv->child_count; i++) {
e7a346
 			if (local->transaction.pre_op[i] !=
e7a346
-			    fd_ctx->pre_op_done[type][i]) {
e7a346
+			    ctx->pre_op_done[type][i]) {
e7a346
 				ret = !local->transaction.dirtied;
e7a346
 				goto unlock;
e7a346
 			}
e7a346
 		}
e7a346
 
e7a346
-		if (fd_ctx->inherited[type]) {
e7a346
+		if (ctx->inherited[type]) {
e7a346
 			ret = _gf_true;
e7a346
-			fd_ctx->inherited[type]--;
e7a346
-		} else if (fd_ctx->on_disk[type]) {
e7a346
+			ctx->inherited[type]--;
e7a346
+		} else if (ctx->on_disk[type]) {
e7a346
 			ret = _gf_false;
e7a346
-			fd_ctx->on_disk[type]--;
e7a346
+			ctx->on_disk[type]--;
e7a346
 		} else {
e7a346
 			/* ASSERT */
e7a346
 			ret = _gf_false;
e7a346
 		}
e7a346
 
e7a346
-		if (!fd_ctx->inherited[type] && !fd_ctx->on_disk[type]) {
e7a346
+		if (!ctx->inherited[type] && !ctx->on_disk[type]) {
e7a346
 			for (i = 0; i < priv->child_count; i++)
e7a346
-				fd_ctx->pre_op_done[type][i] = 0;
e7a346
+				ctx->pre_op_done[type][i] = 0;
e7a346
 		}
e7a346
 	}
e7a346
 unlock:
e7a346
-	UNLOCK(&fd->lock);
e7a346
+	UNLOCK(&local->inode->lock);
e7a346
 
e7a346
 	local->transaction.uninherit_done = _gf_true;
e7a346
 	local->transaction.uninherit_value = ret;
e7a346
@@ -909,31 +981,21 @@ afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this)
e7a346
 {
e7a346
 	afr_local_t *local = NULL;
e7a346
 	afr_private_t *priv = NULL;
e7a346
-	fd_t *fd = NULL;
e7a346
 	int i = 0;
e7a346
 	gf_boolean_t ret = _gf_false;
e7a346
-	afr_fd_ctx_t *fd_ctx = NULL;
e7a346
 	int type = 0;
e7a346
 
e7a346
 	local = frame->local;
e7a346
 	priv = this->private;
e7a346
-	fd = local->fd;
e7a346
 
e7a346
 	if (local->transaction.type != AFR_DATA_TRANSACTION)
e7a346
 		return _gf_false;
e7a346
 
e7a346
 	type = afr_index_for_transaction_type (local->transaction.type);
e7a346
 
e7a346
-	if (!fd)
e7a346
-		return _gf_false;
e7a346
-
e7a346
-	fd_ctx = afr_fd_ctx_get (fd, this);
e7a346
-	if (!fd_ctx)
e7a346
-		return _gf_false;
e7a346
-
e7a346
-	LOCK(&fd->lock);
e7a346
+	LOCK(&local->inode->lock);
e7a346
 	{
e7a346
-		if (!fd_ctx->on_disk[type]) {
e7a346
+		if (!local->inode_ctx->on_disk[type]) {
e7a346
 			/* nothing to inherit yet */
e7a346
 			ret = _gf_false;
e7a346
 			goto unlock;
e7a346
@@ -941,21 +1003,21 @@ afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this)
e7a346
 
e7a346
 		for (i = 0; i < priv->child_count; i++) {
e7a346
 			if (local->transaction.pre_op[i] !=
e7a346
-			    fd_ctx->pre_op_done[type][i]) {
e7a346
+			    local->inode_ctx->pre_op_done[type][i]) {
e7a346
 				/* either inherit exactly, or don't */
e7a346
 				ret = _gf_false;
e7a346
 				goto unlock;
e7a346
 			}
e7a346
 		}
e7a346
 
e7a346
-		fd_ctx->inherited[type]++;
e7a346
+		local->inode_ctx->inherited[type]++;
e7a346
 
e7a346
 		ret = _gf_true;
e7a346
 
e7a346
 		local->transaction.inherited = _gf_true;
e7a346
 	}
e7a346
 unlock:
e7a346
-	UNLOCK(&fd->lock);
e7a346
+	UNLOCK(&local->inode->lock);
e7a346
 
e7a346
 	return ret;
e7a346
 }
e7a346
@@ -966,22 +1028,16 @@ afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this)
e7a346
 {
e7a346
 	afr_local_t *local = NULL;
e7a346
 	afr_private_t *priv = NULL;
e7a346
-	fd_t *fd = NULL;
e7a346
-	afr_fd_ctx_t *fd_ctx = NULL;
e7a346
 	int i = 0;
e7a346
 	gf_boolean_t ret = _gf_false;
e7a346
 	int type = 0;
e7a346
 
e7a346
 	local = frame->local;
e7a346
 	priv = this->private;
e7a346
-	fd = local->fd;
e7a346
 
e7a346
-	if (!fd)
e7a346
-		return _gf_false;
e7a346
-
e7a346
-	fd_ctx = afr_fd_ctx_get (fd, this);
e7a346
-	if (!fd_ctx)
e7a346
-		return _gf_false;
e7a346
+        if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
e7a346
+            local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION)
e7a346
+                return _gf_false;
e7a346
 
e7a346
 	if (local->transaction.inherited)
e7a346
 		/* was already inherited in afr_changelog_pre_op */
e7a346
@@ -997,26 +1053,26 @@ afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this)
e7a346
 
e7a346
 	ret = _gf_false;
e7a346
 
e7a346
-	LOCK(&fd->lock);
e7a346
+	LOCK(&local->inode->lock);
e7a346
 	{
e7a346
-		if (!fd_ctx->on_disk[type]) {
e7a346
+		if (!local->inode_ctx->on_disk[type]) {
e7a346
 			for (i = 0; i < priv->child_count; i++)
e7a346
-				fd_ctx->pre_op_done[type][i] =
e7a346
+				local->inode_ctx->pre_op_done[type][i] =
e7a346
                                         (!local->transaction.failed_subvols[i]);
e7a346
 		} else {
e7a346
 			for (i = 0; i < priv->child_count; i++)
e7a346
-				if (fd_ctx->pre_op_done[type][i] !=
e7a346
+				if (local->inode_ctx->pre_op_done[type][i] !=
e7a346
 				    (!local->transaction.failed_subvols[i])) {
e7a346
 					local->transaction.no_uninherit = 1;
e7a346
 					goto unlock;
e7a346
 				}
e7a346
 		}
e7a346
-		fd_ctx->on_disk[type]++;
e7a346
+		local->inode_ctx->on_disk[type]++;
e7a346
 
e7a346
 		ret = _gf_true;
e7a346
 	}
e7a346
 unlock:
e7a346
-	UNLOCK(&fd->lock);
e7a346
+	UNLOCK(&local->inode->lock);
e7a346
 
e7a346
 	return ret;
e7a346
 }
e7a346
@@ -1324,6 +1380,9 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
e7a346
 
e7a346
         afr_init_optimistic_changelog_for_txn (this, local);
e7a346
 
e7a346
+        if (afr_changelog_pre_op_inherit (frame, this))
e7a346
+                goto next;
e7a346
+
e7a346
         /* This condition should not be met with present code, as
e7a346
          * transaction.done will be called if locks are not acquired on even a
e7a346
          * single node.
e7a346
@@ -1349,9 +1408,6 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
e7a346
 		goto err;
e7a346
 	}
e7a346
 
e7a346
-	if (afr_changelog_pre_op_inherit (frame, this))
e7a346
-		goto next;
e7a346
-
e7a346
         if (call_count < priv->child_count)
e7a346
                 pre_nop = _gf_false;
e7a346
 
e7a346
@@ -1408,7 +1464,7 @@ err:
e7a346
 	local->op_ret = -1;
e7a346
 	local->op_errno = op_errno;
e7a346
 
e7a346
-	afr_unlock (frame, this);
e7a346
+        afr_handle_lock_acquire_failure (local, _gf_true);
e7a346
 
e7a346
 	if (xdata_req)
e7a346
 		dict_unref (xdata_req);
e7a346
@@ -1418,31 +1474,6 @@ err:
e7a346
 
e7a346
 
e7a346
 int
e7a346
-afr_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
e7a346
-{
e7a346
-        afr_internal_lock_t *int_lock = NULL;
e7a346
-        afr_local_t         *local    = NULL;
e7a346
-
e7a346
-        local    = frame->local;
e7a346
-        int_lock = &local->internal_lock;
e7a346
-
e7a346
-        if (int_lock->lock_op_ret < 0) {
e7a346
-                gf_msg (this->name, GF_LOG_INFO,
e7a346
-                        0, AFR_MSG_BLOCKING_LKS_FAILED,
e7a346
-                        "Blocking inodelks failed.");
e7a346
-                afr_transaction_done (frame, this);
e7a346
-        } else {
e7a346
-
e7a346
-                gf_msg_debug (this->name, 0,
e7a346
-                              "Blocking inodelks done. Proceeding to FOP");
e7a346
-                afr_internal_lock_finish (frame, this);
e7a346
-        }
e7a346
-
e7a346
-        return 0;
e7a346
-}
e7a346
-
e7a346
-
e7a346
-int
e7a346
 afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
e7a346
 {
e7a346
         afr_internal_lock_t *int_lock = NULL;
e7a346
@@ -1455,7 +1486,7 @@ afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
e7a346
         if (int_lock->lock_op_ret < 0) {
e7a346
                 gf_msg_debug (this->name, 0,
e7a346
                               "Non blocking inodelks failed. Proceeding to blocking");
e7a346
-                int_lock->lock_cbk = afr_post_blocking_inodelk_cbk;
e7a346
+                int_lock->lock_cbk = afr_internal_lock_finish;
e7a346
                 afr_blocking_lock (frame, this);
e7a346
         } else {
e7a346
 
e7a346
@@ -1469,31 +1500,6 @@ afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
e7a346
 
e7a346
 
e7a346
 int
e7a346
-afr_post_blocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
e7a346
-{
e7a346
-        afr_internal_lock_t *int_lock = NULL;
e7a346
-        afr_local_t         *local    = NULL;
e7a346
-
e7a346
-        local    = frame->local;
e7a346
-        int_lock = &local->internal_lock;
e7a346
-
e7a346
-        if (int_lock->lock_op_ret < 0) {
e7a346
-                gf_msg (this->name, GF_LOG_INFO, 0,
e7a346
-                        AFR_MSG_BLOCKING_LKS_FAILED,
e7a346
-                        "Blocking entrylks failed.");
e7a346
-                afr_transaction_done (frame, this);
e7a346
-        } else {
e7a346
-
e7a346
-                gf_msg_debug (this->name, 0,
e7a346
-                             "Blocking entrylks done. Proceeding to FOP");
e7a346
-                afr_internal_lock_finish (frame, this);
e7a346
-        }
e7a346
-
e7a346
-        return 0;
e7a346
-}
e7a346
-
e7a346
-
e7a346
-int
e7a346
 afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
e7a346
 {
e7a346
         afr_internal_lock_t *int_lock = NULL;
e7a346
@@ -1506,7 +1512,7 @@ afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
e7a346
         if (int_lock->lock_op_ret < 0) {
e7a346
                 gf_msg_debug (this->name, 0,
e7a346
                               "Non blocking entrylks failed. Proceeding to blocking");
e7a346
-                int_lock->lock_cbk = afr_post_blocking_entrylk_cbk;
e7a346
+                int_lock->lock_cbk = afr_internal_lock_finish;
e7a346
                 afr_blocking_lock (frame, this);
e7a346
         } else {
e7a346
 
e7a346
@@ -1567,29 +1573,28 @@ int
e7a346
 afr_set_transaction_flock (xlator_t *this, afr_local_t *local)
e7a346
 {
e7a346
         afr_internal_lock_t *int_lock = NULL;
e7a346
-        afr_inodelk_t       *inodelk  = NULL;
e7a346
         afr_private_t       *priv     = NULL;
e7a346
 
e7a346
         int_lock = &local->internal_lock;
e7a346
-        inodelk = afr_get_inodelk (int_lock, int_lock->domain);
e7a346
         priv = this->private;
e7a346
 
e7a346
-        if ((priv->arbiter_count || priv->full_lock) &&
e7a346
+        if ((priv->arbiter_count || local->transaction.eager_lock_on ||
e7a346
+             priv->full_lock) &&
e7a346
             local->transaction.type == AFR_DATA_TRANSACTION) {
e7a346
                 /*Lock entire file to avoid network split brains.*/
e7a346
-                inodelk->flock.l_len   = 0;
e7a346
-                inodelk->flock.l_start = 0;
e7a346
+                int_lock->flock.l_len   = 0;
e7a346
+                int_lock->flock.l_start = 0;
e7a346
         } else {
e7a346
-                inodelk->flock.l_len   = local->transaction.len;
e7a346
-                inodelk->flock.l_start = local->transaction.start;
e7a346
+                int_lock->flock.l_len   = local->transaction.len;
e7a346
+                int_lock->flock.l_start = local->transaction.start;
e7a346
         }
e7a346
-        inodelk->flock.l_type  = F_WRLCK;
e7a346
+        int_lock->flock.l_type  = F_WRLCK;
e7a346
 
e7a346
         return 0;
e7a346
 }
e7a346
 
e7a346
 int
e7a346
-afr_lock_rec (call_frame_t *frame, xlator_t *this)
e7a346
+afr_lock (call_frame_t *frame, xlator_t *this)
e7a346
 {
e7a346
         afr_internal_lock_t *int_lock = NULL;
e7a346
         afr_local_t         *local    = NULL;
e7a346
@@ -1630,74 +1635,153 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this)
e7a346
         return 0;
e7a346
 }
e7a346
 
e7a346
+static gf_boolean_t
e7a346
+afr_locals_overlap (afr_local_t *local1, afr_local_t *local2)
e7a346
+{
e7a346
+        uint64_t start1 = local1->transaction.start;
e7a346
+        uint64_t start2 = local2->transaction.start;
e7a346
+        uint64_t end1 = 0;
e7a346
+        uint64_t end2 = 0;
e7a346
+
e7a346
+        if (local1->transaction.len)
e7a346
+                end1 = start1 + local1->transaction.len - 1;
e7a346
+        else
e7a346
+                end1 = ULLONG_MAX;
e7a346
+
e7a346
+        if (local2->transaction.len)
e7a346
+                end2 = start2 + local2->transaction.len - 1;
e7a346
+        else
e7a346
+                end2 = ULLONG_MAX;
e7a346
 
e7a346
-int
e7a346
-afr_lock (call_frame_t *frame, xlator_t *this)
e7a346
+        return ((end1 >= start2) && (end2 >= start1));
e7a346
+}
e7a346
+
e7a346
+gf_boolean_t
e7a346
+afr_has_lock_conflict (afr_local_t *local, gf_boolean_t waitlist_check)
e7a346
 {
e7a346
-        afr_set_lock_number (frame, this);
e7a346
+        afr_local_t     *each = NULL;
e7a346
+        afr_lock_t      *lock = NULL;
e7a346
 
e7a346
-        return afr_lock_rec (frame, this);
e7a346
+        lock = &local->inode_ctx->lock[local->transaction.type];
e7a346
+        /*
e7a346
+         * Once full file lock is acquired in eager-lock phase, overlapping
e7a346
+         * writes do not compete for inode-locks, instead are transferred to the
e7a346
+         * next writes. Because of this overlapping writes are not ordered.
e7a346
+         * This can cause inconsistencies in replication.
e7a346
+         * Example:
e7a346
+         * Two overlapping writes w1, w2 are sent in parallel on same fd
e7a346
+         * in two threads t1, t2.
e7a346
+         * Both threads can execute afr_writev_wind in the following manner.
e7a346
+         * t1 winds w1 on brick-0
e7a346
+         * t2 winds w2 on brick-0
e7a346
+         * t2 winds w2 on brick-1
e7a346
+         * t1 winds w1 on brick-1
e7a346
+         *
e7a346
+         * This check makes sure the locks are not transferred for
e7a346
+         * overlapping writes.
e7a346
+         */
e7a346
+        list_for_each_entry (each, &lock->owners, transaction.owner_list) {
e7a346
+                if (afr_locals_overlap (each, local)) {
e7a346
+                        return _gf_true;
e7a346
+                }
e7a346
+        }
e7a346
+
e7a346
+        if (!waitlist_check)
e7a346
+                return _gf_false;
e7a346
+        list_for_each_entry (each, &lock->waiting, transaction.wait_list) {
e7a346
+                if (afr_locals_overlap (each, local)) {
e7a346
+                        return _gf_true;
e7a346
+                }
e7a346
+        }
e7a346
+        return _gf_false;
e7a346
 }
e7a346
 
e7a346
 
e7a346
 /* }}} */
e7a346
-
e7a346
-int
e7a346
-afr_internal_lock_finish (call_frame_t *frame, xlator_t *this)
e7a346
+static void
e7a346
+afr_copy_inodelk_vars (afr_internal_lock_t *dst, afr_internal_lock_t *src,
e7a346
+                       xlator_t *this)
e7a346
 {
e7a346
-        afr_changelog_pre_op (frame, this);
e7a346
+        afr_private_t *priv = this->private;
e7a346
 
e7a346
-        return 0;
e7a346
+        dst->domain = src->domain;
e7a346
+        dst->flock.l_len  = src->flock.l_len;
e7a346
+        dst->flock.l_start  = src->flock.l_start;
e7a346
+        dst->flock.l_type  = src->flock.l_type;
e7a346
+        dst->lock_count = src->lock_count;
e7a346
+        memcpy (dst->locked_nodes, src->locked_nodes,
e7a346
+                priv->child_count * sizeof (*dst->locked_nodes));
e7a346
 }
e7a346
 
e7a346
-
e7a346
 void
e7a346
-afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this)
e7a346
+__afr_transaction_wake_shared (afr_local_t *local, struct list_head *shared)
e7a346
 {
e7a346
-        afr_local_t    *local = NULL;
e7a346
-        afr_private_t  *priv = NULL;
e7a346
+        gf_boolean_t conflict = _gf_false;
e7a346
+        afr_local_t *each = NULL;
e7a346
+        afr_lock_t *lock = &local->inode_ctx->lock[local->transaction.type];
e7a346
 
e7a346
-        /* call this function from any of the related optimizations
e7a346
-           which benefit from delaying post op are enabled, namely:
e7a346
-
e7a346
-           - changelog piggybacking
e7a346
-           - eager locking
e7a346
-        */
e7a346
+        while (!conflict) {
e7a346
+                if (list_empty (&lock->waiting))
e7a346
+                        return;
e7a346
+                each = list_entry(lock->waiting.next, afr_local_t,
e7a346
+                                  transaction.wait_list);
e7a346
+                if (afr_has_lock_conflict (each, _gf_false)) {
e7a346
+                        conflict = _gf_true;
e7a346
+                }
e7a346
+                if (conflict && !list_empty (&lock->owners))
e7a346
+                        return;
e7a346
+                afr_copy_inodelk_vars (&each->internal_lock,
e7a346
+                                       &local->internal_lock,
e7a346
+                                       each->transaction.frame->this);
e7a346
+                list_move_tail (&each->transaction.wait_list, shared);
e7a346
+                list_add_tail(&each->transaction.owner_list, &lock->owners);
e7a346
+        }
e7a346
+}
e7a346
 
e7a346
-        priv = this->private;
e7a346
-        if (!priv)
e7a346
-                return;
e7a346
+static void
e7a346
+afr_lock_resume_shared (struct list_head *list)
e7a346
+{
e7a346
+        afr_local_t *each = NULL;
e7a346
 
e7a346
-        if (!priv->post_op_delay_secs)
e7a346
-                return;
e7a346
+        while (!list_empty(list)) {
e7a346
+                each = list_entry(list->next, afr_local_t,
e7a346
+                                  transaction.wait_list);
e7a346
+                list_del_init(&each->transaction.wait_list);
e7a346
+                afr_changelog_pre_op (each->transaction.frame,
e7a346
+                                      each->transaction.frame->this);
e7a346
+        }
e7a346
+}
e7a346
 
e7a346
-        local = frame->local;
e7a346
-        if (!local)
e7a346
-                return;
e7a346
+int
e7a346
+afr_internal_lock_finish (call_frame_t *frame, xlator_t *this)
e7a346
+{
e7a346
+        afr_local_t *local = frame->local;
e7a346
+        afr_lock_t   *lock  = NULL;
e7a346
 
e7a346
-        if (!local->transaction.eager_lock_on)
e7a346
-                return;
e7a346
 
e7a346
-        if (!local->fd)
e7a346
-                return;
e7a346
+        local->internal_lock.lock_cbk = NULL;
e7a346
+        if (!local->transaction.eager_lock_on) {
e7a346
+                if (local->internal_lock.lock_op_ret < 0) {
e7a346
+                        afr_transaction_done (frame, this);
e7a346
+                        return 0;
e7a346
+                }
e7a346
+                afr_changelog_pre_op (frame, this);
e7a346
+        } else {
e7a346
+                lock = &local->inode_ctx->lock[local->transaction.type];
e7a346
+                if (local->internal_lock.lock_op_ret < 0) {
e7a346
+                        afr_handle_lock_acquire_failure (local, _gf_false);
e7a346
+                } else {
e7a346
+                        lock->event_generation = local->event_generation;
e7a346
+                        afr_changelog_pre_op (frame, this);
e7a346
+                }
e7a346
+        }
e7a346
 
e7a346
-        if (local->op == GF_FOP_WRITE)
e7a346
-                local->delayed_post_op = _gf_true;
e7a346
+        return 0;
e7a346
 }
e7a346
 
e7a346
 gf_boolean_t
e7a346
-afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this)
e7a346
+afr_are_multiple_fds_opened (afr_local_t *local, xlator_t *this)
e7a346
 {
e7a346
-        afr_fd_ctx_t *fd_ctx = NULL;
e7a346
-
e7a346
-        if (!fd) {
e7a346
-                /* If false is returned, it may keep on taking eager-lock
e7a346
-                 * which may lead to starvation, so return true to avoid that.
e7a346
-                 */
e7a346
-                gf_msg_callingfn (this->name, GF_LOG_ERROR, EBADF,
e7a346
-                                  AFR_MSG_INVALID_ARG, "Invalid fd");
e7a346
-                return _gf_true;
e7a346
-        }
e7a346
         /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock
e7a346
          * is taken mount2 opened the same file, it won't be able to
e7a346
          * perform any data operations until mount1 releases eager-lock.
e7a346
@@ -1705,11 +1789,7 @@ afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this)
e7a346
          * if open-fd-count is > 1
e7a346
          */
e7a346
 
e7a346
-        fd_ctx = afr_fd_ctx_get (fd, this);
e7a346
-        if (!fd_ctx)
e7a346
-                return _gf_true;
e7a346
-
e7a346
-        if (fd_ctx->open_fd_count > 1)
e7a346
+        if (local->inode_ctx->open_fd_count > 1)
e7a346
                 return _gf_true;
e7a346
 
e7a346
         return _gf_false;
e7a346
@@ -1717,24 +1797,45 @@ afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this)
e7a346
 
e7a346
 
e7a346
 gf_boolean_t
e7a346
-is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this)
e7a346
+afr_is_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this,
e7a346
+                                         int delay)
e7a346
 {
e7a346
-        afr_local_t      *local = NULL;
e7a346
-        gf_boolean_t      res = _gf_false;
e7a346
+        afr_local_t  *local = NULL;
e7a346
+        afr_lock_t   *lock  = NULL;
e7a346
+        gf_boolean_t res    = _gf_false;
e7a346
 
e7a346
         local = frame->local;
e7a346
-        if (!local)
e7a346
+        lock = &local->inode_ctx->lock[local->transaction.type];
e7a346
+
e7a346
+        if (!afr_txn_nothing_failed (frame, this)) {
e7a346
+                lock->release = _gf_true;
e7a346
                 goto out;
e7a346
+        }
e7a346
 
e7a346
-        if (!local->delayed_post_op)
e7a346
+        if (afr_are_multiple_fds_opened (local, this)) {
e7a346
+                lock->release = _gf_true;
e7a346
                 goto out;
e7a346
+        }
e7a346
 
e7a346
-        //Mark pending changelog ASAP
e7a346
-        if (!afr_txn_nothing_failed (frame, this))
e7a346
+        if (!list_empty (&lock->owners))
e7a346
+                goto out;
e7a346
+        else
e7a346
+                GF_ASSERT (list_empty (&lock->waiting));
e7a346
+
e7a346
+        if (lock->release) {
e7a346
+                goto out;
e7a346
+        }
e7a346
+
e7a346
+        if (!delay) {
e7a346
                 goto out;
e7a346
+        }
e7a346
 
e7a346
-        if (local->fd && afr_are_multiple_fds_opened (local->fd, this))
e7a346
+        if ((local->op != GF_FOP_WRITE) &&
e7a346
+            (local->op != GF_FOP_FXATTROP)) {
e7a346
+                /*Only allow writes but shard does [f]xattrops on writes, so
e7a346
+                 * they are fine too*/
e7a346
                 goto out;
e7a346
+        }
e7a346
 
e7a346
         res = _gf_true;
e7a346
 out:
e7a346
@@ -1745,50 +1846,61 @@ out:
e7a346
 void
e7a346
 afr_delayed_changelog_wake_up_cbk (void *data)
e7a346
 {
e7a346
-        fd_t           *fd = NULL;
e7a346
+        afr_lock_t  *lock  = NULL;
e7a346
+        afr_local_t *local = data;
e7a346
+        afr_local_t *timer_local = NULL;
e7a346
+        struct list_head shared;
e7a346
 
e7a346
-        fd = data;
e7a346
-
e7a346
-        afr_delayed_changelog_wake_up (THIS, fd);
e7a346
+        INIT_LIST_HEAD (&shared);
e7a346
+        lock = &local->inode_ctx->lock[local->transaction.type];
e7a346
+        LOCK (&local->inode->lock);
e7a346
+        {
e7a346
+                timer_local = list_entry(lock->post_op.next,
e7a346
+                                         afr_local_t,
e7a346
+                                        transaction.owner_list);
e7a346
+                if (list_empty (&lock->owners) && (local == timer_local)) {
e7a346
+                        GF_ASSERT (list_empty (&lock->waiting));
e7a346
+                        /*Last owner*/
e7a346
+                        lock->release = _gf_true;
e7a346
+                        lock->delay_timer = NULL;
e7a346
+                }
e7a346
+        }
e7a346
+        UNLOCK (&local->inode->lock);
e7a346
+        afr_changelog_post_op_now (local->transaction.frame,
e7a346
+                                   local->transaction.frame->this);
e7a346
 }
e7a346
 
e7a346
 
e7a346
 /* SET operation */
e7a346
 int
e7a346
-afr_fd_report_unstable_write (xlator_t *this, fd_t *fd)
e7a346
+afr_fd_report_unstable_write (xlator_t *this, afr_local_t *local)
e7a346
 {
e7a346
-        afr_fd_ctx_t *fdctx = NULL;
e7a346
-
e7a346
-        fdctx = afr_fd_ctx_get (fd, this);
e7a346
-
e7a346
-        LOCK(&fd->lock);
e7a346
+        LOCK(&local->inode->lock);
e7a346
         {
e7a346
-                fdctx->witnessed_unstable_write = _gf_true;
e7a346
+                local->inode_ctx->witnessed_unstable_write = _gf_true;
e7a346
         }
e7a346
-        UNLOCK(&fd->lock);
e7a346
+        UNLOCK(&local->inode->lock);
e7a346
 
e7a346
         return 0;
e7a346
 }
e7a346
 
e7a346
 /* TEST and CLEAR operation */
e7a346
 gf_boolean_t
e7a346
-afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd)
e7a346
+afr_fd_has_witnessed_unstable_write (xlator_t *this, inode_t *inode)
e7a346
 {
e7a346
-        afr_fd_ctx_t *fdctx = NULL;
e7a346
+        afr_inode_ctx_t *ctx = NULL;
e7a346
         gf_boolean_t witness = _gf_false;
e7a346
 
e7a346
-        fdctx = afr_fd_ctx_get (fd, this);
e7a346
-        if (!fdctx)
e7a346
-                return _gf_true;
e7a346
-
e7a346
-        LOCK(&fd->lock);
e7a346
+        LOCK(&inode->lock);
e7a346
         {
e7a346
-                if (fdctx->witnessed_unstable_write) {
e7a346
+                (void)__afr_inode_ctx_get (this, inode, &ctx);
e7a346
+
e7a346
+                if (ctx->witnessed_unstable_write) {
e7a346
                         witness = _gf_true;
e7a346
-                        fdctx->witnessed_unstable_write = _gf_false;
e7a346
+                        ctx->witnessed_unstable_write = _gf_false;
e7a346
                 }
e7a346
         }
e7a346
-        UNLOCK (&fd->lock);
e7a346
+        UNLOCK (&inode->lock);
e7a346
 
e7a346
         return witness;
e7a346
 }
e7a346
@@ -1931,7 +2043,7 @@ afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this)
e7a346
            mark a flag in the fdctx whenever an unstable write is witnessed.
e7a346
            */
e7a346
 
e7a346
-        if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) {
e7a346
+        if (!afr_fd_has_witnessed_unstable_write (this, local->inode)) {
e7a346
                 afr_changelog_post_op_now (frame, this);
e7a346
                 return 0;
e7a346
         }
e7a346
@@ -1949,87 +2061,64 @@ afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this)
e7a346
         return 0;
e7a346
 }
e7a346
 
e7a346
-
e7a346
 void
e7a346
-afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd,
e7a346
-                               call_stub_t *stub)
e7a346
+afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
e7a346
 {
e7a346
-	afr_fd_ctx_t      *fd_ctx = NULL;
e7a346
-	call_frame_t      *prev_frame = NULL;
e7a346
-	struct timespec    delta = {0, };
e7a346
-	afr_private_t     *priv = NULL;
e7a346
-	afr_local_t       *local = NULL;
e7a346
+	struct timespec delta   = {0, };
e7a346
+	afr_private_t   *priv   = NULL;
e7a346
+	afr_local_t     *local  = frame->local;
e7a346
+        afr_lock_t      *lock   = NULL;
e7a346
+        gf_boolean_t    post_op = _gf_true;
e7a346
+        struct list_head  shared;
e7a346
 
e7a346
 	priv = this->private;
e7a346
-
e7a346
-	fd_ctx = afr_fd_ctx_get (fd, this);
e7a346
-	if (!fd_ctx)
e7a346
-                goto out;
e7a346
-
e7a346
 	delta.tv_sec = priv->post_op_delay_secs;
e7a346
 	delta.tv_nsec = 0;
e7a346
 
e7a346
-	pthread_mutex_lock (&fd_ctx->delay_lock);
e7a346
-	{
e7a346
-		prev_frame = fd_ctx->delay_frame;
e7a346
-		fd_ctx->delay_frame = NULL;
e7a346
-		if (fd_ctx->delay_timer)
e7a346
-			gf_timer_call_cancel (this->ctx, fd_ctx->delay_timer);
e7a346
-		fd_ctx->delay_timer = NULL;
e7a346
-		if (!frame)
e7a346
-			goto unlock;
e7a346
-		fd_ctx->delay_timer = gf_timer_call_after (this->ctx, delta,
e7a346
-							   afr_delayed_changelog_wake_up_cbk,
e7a346
-							   fd);
e7a346
-		fd_ctx->delay_frame = frame;
e7a346
-	}
e7a346
-unlock:
e7a346
-	pthread_mutex_unlock (&fd_ctx->delay_lock);
e7a346
-
e7a346
-out:
e7a346
-	if (prev_frame) {
e7a346
-		local = prev_frame->local;
e7a346
-		local->transaction.resume_stub = stub;
e7a346
-		afr_changelog_post_op_now (prev_frame, this);
e7a346
-	} else if (stub) {
e7a346
-		call_resume (stub);
e7a346
-	}
e7a346
-}
e7a346
-
e7a346
-
e7a346
-void
e7a346
-afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
e7a346
-{
e7a346
-        afr_local_t  *local = NULL;
e7a346
-
e7a346
-        local = frame->local;
e7a346
-
e7a346
-        if (is_afr_delayed_changelog_post_op_needed (frame, this))
e7a346
-                afr_delayed_changelog_post_op (this, frame, local->fd, NULL);
e7a346
-        else
e7a346
-                afr_changelog_post_op_safe (frame, this);
e7a346
-}
e7a346
-
e7a346
+        INIT_LIST_HEAD (&shared);
e7a346
+        if (!local->transaction.eager_lock_on)
e7a346
+                goto out;
e7a346
 
e7a346
+        lock = &local->inode_ctx->lock[local->transaction.type];
e7a346
+        LOCK (&local->inode->lock);
e7a346
+	{
e7a346
+                list_del_init (&local->transaction.owner_list);
e7a346
+                list_add (&local->transaction.owner_list, &lock->post_op);
e7a346
+                __afr_transaction_wake_shared (local, &shared);
e7a346
+
e7a346
+                if (!afr_is_delayed_changelog_post_op_needed (frame, this,
e7a346
+                                                              delta.tv_sec)) {
e7a346
+                        if (list_empty (&lock->owners))
e7a346
+                                lock->release = _gf_true;
e7a346
+                        goto unlock;
e7a346
+                }
e7a346
 
e7a346
-/* Wake up the sleeping/delayed post-op, and also register
e7a346
-   a stub to have it resumed after this transaction
e7a346
-   completely finishes.
e7a346
+                GF_ASSERT (lock->delay_timer == NULL);
e7a346
+		lock->delay_timer = gf_timer_call_after (this->ctx, delta,
e7a346
+                                              afr_delayed_changelog_wake_up_cbk,
e7a346
+                                              local);
e7a346
+                if (!lock->delay_timer) {
e7a346
+                        lock->release = _gf_true;
e7a346
+                } else {
e7a346
+                        post_op = _gf_false;
e7a346
+                }
e7a346
 
e7a346
-   The @stub gets saved in @local and gets resumed in
e7a346
-   afr_local_cleanup()
e7a346
-   */
e7a346
-void
e7a346
-afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub)
e7a346
-{
e7a346
-        afr_delayed_changelog_post_op (this, NULL, fd, stub);
e7a346
-}
e7a346
+	}
e7a346
+unlock:
e7a346
+        UNLOCK (&local->inode->lock);
e7a346
 
e7a346
+        if (!list_empty (&shared)) {
e7a346
+                afr_lock_resume_shared (&shared);
e7a346
+        }
e7a346
 
e7a346
-void
e7a346
-afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd)
e7a346
-{
e7a346
-        afr_delayed_changelog_post_op (this, NULL, fd, NULL);
e7a346
+out:
e7a346
+        if (post_op) {
e7a346
+                if (!local->transaction.eager_lock_on || lock->release) {
e7a346
+                        afr_changelog_post_op_safe (frame, this);
e7a346
+                } else {
e7a346
+                        afr_changelog_post_op_now (frame, this);
e7a346
+                }
e7a346
+        }
e7a346
 }
e7a346
 
e7a346
 int
e7a346
@@ -2039,13 +2128,6 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this)
e7a346
 
e7a346
         local    = frame->local;
e7a346
 
e7a346
-        if (local->transaction.eager_lock_on) {
e7a346
-                /* We don't need to retain "local" in the
e7a346
-                   fd list anymore, writes to all subvols
e7a346
-                   are finished by now */
e7a346
-                afr_remove_eager_lock_stub (local);
e7a346
-        }
e7a346
-
e7a346
         afr_restore_lk_owner (frame);
e7a346
 
e7a346
         afr_handle_symmetric_errors (frame, this);
e7a346
@@ -2076,114 +2158,149 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
e7a346
 	local->transaction.failed_subvols[child_index] = 1;
e7a346
 }
e7a346
 
e7a346
-
e7a346
-
e7a346
 static gf_boolean_t
e7a346
-afr_locals_overlap (afr_local_t *local1, afr_local_t *local2)
e7a346
+__need_previous_lock_unlocked (afr_local_t *local)
e7a346
 {
e7a346
-        uint64_t start1 = local1->transaction.start;
e7a346
-        uint64_t start2 = local2->transaction.start;
e7a346
-        uint64_t end1 = 0;
e7a346
-        uint64_t end2 = 0;
e7a346
-
e7a346
-        if (local1->transaction.len)
e7a346
-                end1 = start1 + local1->transaction.len - 1;
e7a346
-        else
e7a346
-                end1 = ULLONG_MAX;
e7a346
+        afr_lock_t      *lock = NULL;
e7a346
 
e7a346
-        if (local2->transaction.len)
e7a346
-                end2 = start2 + local2->transaction.len - 1;
e7a346
-        else
e7a346
-                end2 = ULLONG_MAX;
e7a346
+        if (!local->transaction.eager_lock_on)
e7a346
+                return _gf_true;
e7a346
 
e7a346
-        return ((end1 >= start2) && (end2 >= start1));
e7a346
+        lock = &local->inode_ctx->lock[local->transaction.type];
e7a346
+        if (!lock->acquired)
e7a346
+                return _gf_false;
e7a346
+        if (lock->acquired && lock->event_generation != local->event_generation)
e7a346
+                return _gf_true;
e7a346
+        return _gf_false;
e7a346
 }
e7a346
 
e7a346
 void
e7a346
-afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this)
e7a346
+__afr_eager_lock_handle (afr_local_t *local, gf_boolean_t *take_lock,
e7a346
+                         gf_boolean_t *do_pre_op, afr_local_t **timer_local)
e7a346
 {
e7a346
-        afr_private_t *priv = NULL;
e7a346
-        afr_fd_ctx_t  *fdctx = NULL;
e7a346
-        afr_local_t   *each = NULL;
e7a346
+        afr_lock_t      *lock = NULL;
e7a346
+        afr_local_t     *owner_local = NULL;
e7a346
+        xlator_t        *this = local->transaction.frame->this;
e7a346
 
e7a346
-        priv = this->private;
e7a346
-
e7a346
-        if (!local->fd)
e7a346
-                return;
e7a346
-
e7a346
-        if (local->transaction.type != AFR_DATA_TRANSACTION)
e7a346
-                return;
e7a346
+        if (local->fd && !afr_are_multiple_fds_opened (local, this)) {
e7a346
+                local->transaction.eager_lock_on = _gf_true;
e7a346
+        }
e7a346
 
e7a346
-        if (!priv->eager_lock)
e7a346
-                return;
e7a346
+        lock = &local->inode_ctx->lock[local->transaction.type];
e7a346
+        if (__need_previous_lock_unlocked (local)) {
e7a346
+                if (!list_empty (&lock->owners)) {
e7a346
+                        lock->release = _gf_true;
e7a346
+                } else if (lock->delay_timer) {
e7a346
+                        lock->release = _gf_true;
e7a346
+                        if (gf_timer_call_cancel (this->ctx,
e7a346
+                                                  lock->delay_timer)) {
e7a346
+                                /* It will be put in frozen list
e7a346
+                                 * in the code flow below*/
e7a346
+                        } else {
e7a346
+                                *timer_local = list_entry(lock->post_op.next,
e7a346
+                                                          afr_local_t,
e7a346
+                                                        transaction.owner_list);
e7a346
+                                lock->delay_timer = NULL;
e7a346
+                        }
e7a346
+                }
e7a346
+                if (!local->transaction.eager_lock_on)
e7a346
+                        goto out;
e7a346
+        }
e7a346
 
e7a346
-        fdctx = afr_fd_ctx_get (local->fd, this);
e7a346
-        if (!fdctx)
e7a346
-                return;
e7a346
+        if (lock->release) {
e7a346
+                list_add_tail (&local->transaction.wait_list,
e7a346
+                               &lock->frozen);
e7a346
+                *take_lock = _gf_false;
e7a346
+                goto out;
e7a346
+        }
e7a346
 
e7a346
-        if (afr_are_multiple_fds_opened (local->fd, this))
e7a346
-                return;
e7a346
-        /*
e7a346
-         * Once full file lock is acquired in eager-lock phase, overlapping
e7a346
-         * writes do not compete for inode-locks, instead are transferred to the
e7a346
-         * next writes. Because of this overlapping writes are not ordered.
e7a346
-         * This can cause inconsistencies in replication.
e7a346
-         * Example:
e7a346
-         * Two overlapping writes w1, w2 are sent in parallel on same fd
e7a346
-         * in two threads t1, t2.
e7a346
-         * Both threads can execute afr_writev_wind in the following manner.
e7a346
-         * t1 winds w1 on brick-0
e7a346
-         * t2 winds w2 on brick-0
e7a346
-         * t2 winds w2 on brick-1
e7a346
-         * t1 winds w1 on brick-1
e7a346
-         *
e7a346
-         * This check makes sure the locks are not transferred for
e7a346
-         * overlapping writes.
e7a346
-         */
e7a346
-        LOCK (&local->fd->lock);
e7a346
-        {
e7a346
-                list_for_each_entry (each, &fdctx->eager_locked,
e7a346
-                                     transaction.eager_locked) {
e7a346
-                        if (afr_locals_overlap (each, local)) {
e7a346
-                                local->transaction.eager_lock_on = _gf_false;
e7a346
-                                goto unlock;
e7a346
-                        }
e7a346
+        if (lock->delay_timer) {
e7a346
+                *take_lock = _gf_false;
e7a346
+                if (gf_timer_call_cancel (this->ctx,
e7a346
+                                          lock->delay_timer)) {
e7a346
+                        list_add_tail (&local->transaction.wait_list,
e7a346
+                                       &lock->frozen);
e7a346
+                } else {
e7a346
+                        *timer_local = list_entry(lock->post_op.next,
e7a346
+                                                  afr_local_t,
e7a346
+                                                  transaction.owner_list);
e7a346
+                        afr_copy_inodelk_vars (&local->internal_lock,
e7a346
+                                               &(*timer_local)->internal_lock,
e7a346
+                                               this);
e7a346
+                        lock->delay_timer = NULL;
e7a346
+                        *do_pre_op = _gf_true;
e7a346
+                        list_add_tail (&local->transaction.owner_list,
e7a346
+                                       &lock->owners);
e7a346
                 }
e7a346
+                goto out;
e7a346
+        }
e7a346
 
e7a346
-                local->transaction.eager_lock_on = _gf_true;
e7a346
-                list_add_tail (&local->transaction.eager_locked,
e7a346
-                               &fdctx->eager_locked);
e7a346
+        if (!list_empty (&lock->owners)) {
e7a346
+                if (!lock->acquired ||
e7a346
+                    afr_has_lock_conflict (local, _gf_true)) {
e7a346
+                        list_add_tail (&local->transaction.wait_list,
e7a346
+                                       &lock->waiting);
e7a346
+                        *take_lock = _gf_false;
e7a346
+                        goto out;
e7a346
+                }
e7a346
+                owner_local = list_entry (lock->owners.next,
e7a346
+                                          afr_local_t,
e7a346
+                                          transaction.owner_list);
e7a346
+                afr_copy_inodelk_vars (&local->internal_lock,
e7a346
+                                       &owner_local->internal_lock,
e7a346
+                                       this);
e7a346
+                *take_lock = _gf_false;
e7a346
+                *do_pre_op = _gf_true;
e7a346
         }
e7a346
-unlock:
e7a346
-        UNLOCK (&local->fd->lock);
e7a346
+
e7a346
+        if (lock->acquired)
e7a346
+                GF_ASSERT (!(*take_lock));
e7a346
+        list_add_tail (&local->transaction.owner_list, &lock->owners);
e7a346
+out:
e7a346
+        return;
e7a346
 }
e7a346
 
e7a346
 void
e7a346
-afr_transaction_start (call_frame_t *frame, xlator_t *this)
e7a346
+afr_transaction_start (afr_local_t *local, xlator_t *this)
e7a346
 {
e7a346
-        afr_local_t   *local = frame->local;
e7a346
-        fd_t          *fd    = NULL;
e7a346
+        afr_private_t   *priv = NULL;
e7a346
+        gf_boolean_t    take_lock  = _gf_true;
e7a346
+        gf_boolean_t    do_pre_op  = _gf_false;
e7a346
+        afr_local_t     *timer_local = NULL;
e7a346
 
e7a346
-        afr_transaction_eager_lock_init (local, this);
e7a346
+        priv = this->private;
e7a346
 
e7a346
-        if (local->fd && local->transaction.eager_lock_on)
e7a346
-                afr_set_lk_owner (frame, this, local->fd);
e7a346
-        else
e7a346
-                afr_set_lk_owner (frame, this, frame->root);
e7a346
+        if (local->transaction.type != AFR_DATA_TRANSACTION &&
e7a346
+            local->transaction.type != AFR_METADATA_TRANSACTION)
e7a346
+                goto lock_phase;
e7a346
 
e7a346
-        if (!local->transaction.eager_lock_on && local->loc.inode) {
e7a346
-                fd = fd_lookup (local->loc.inode, frame->root->pid);
e7a346
-                if (fd == NULL)
e7a346
-                        fd = fd_lookup_anonymous (local->loc.inode,
e7a346
-                                                  GF_ANON_FD_FLAGS);
e7a346
+        if (!priv->eager_lock)
e7a346
+                goto lock_phase;
e7a346
 
e7a346
-                if (fd) {
e7a346
-                        afr_delayed_changelog_wake_up (this, fd);
e7a346
-                        fd_unref (fd);
e7a346
-                }
e7a346
+        LOCK (&local->inode->lock);
e7a346
+        {
e7a346
+                __afr_eager_lock_handle (local, &take_lock, &do_pre_op,
e7a346
+                                         &timer_local);
e7a346
         }
e7a346
+        UNLOCK (&local->inode->lock);
e7a346
+lock_phase:
e7a346
+        if (!local->transaction.eager_lock_on) {
e7a346
+                afr_set_lk_owner (local->transaction.frame, this,
e7a346
+                                  local->transaction.frame->root);
e7a346
+        } else {
e7a346
+                afr_set_lk_owner (local->transaction.frame, this, local->inode);
e7a346
+        }
e7a346
+
e7a346
 
e7a346
-        afr_lock (frame, this);
e7a346
+        if (take_lock) {
e7a346
+                afr_lock (local->transaction.frame, this);
e7a346
+        } else if (do_pre_op) {
e7a346
+                afr_changelog_pre_op (local->transaction.frame, this);
e7a346
+        }
e7a346
+        /*Always call delayed_changelog_wake_up_cbk after calling pre-op above
e7a346
+         * so that any inheriting can happen*/
e7a346
+        if (timer_local)
e7a346
+                afr_delayed_changelog_wake_up_cbk (timer_local);
e7a346
 }
e7a346
 
e7a346
 int
e7a346
@@ -2196,7 +2313,7 @@ afr_write_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
e7a346
                 goto fail;
e7a346
         }
e7a346
 
e7a346
-        afr_transaction_start (frame, this);
e7a346
+        afr_transaction_start (local, this);
e7a346
         return 0;
e7a346
 fail:
e7a346
         local->transaction.unwind (frame, this);
e7a346
@@ -2214,6 +2331,7 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
e7a346
 
e7a346
         local = frame->local;
e7a346
         priv  = this->private;
e7a346
+        local->transaction.frame = frame;
e7a346
 
e7a346
         local->transaction.type   = type;
e7a346
 
e7a346
@@ -2226,11 +2344,9 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
e7a346
         if (ret < 0)
e7a346
                 goto out;
e7a346
 
e7a346
-        if (type == AFR_ENTRY_TRANSACTION ||
e7a346
-            type == AFR_ENTRY_RENAME_TRANSACTION) {
e7a346
-                afr_transaction_start (frame, this);
e7a346
-                ret = 0;
e7a346
-                goto out;
e7a346
+
e7a346
+        if (type != AFR_METADATA_TRANSACTION) {
e7a346
+                goto txn_start;
e7a346
         }
e7a346
 
e7a346
         ret = afr_inode_get_readable (frame, local->inode, this,
e7a346
@@ -2240,10 +2356,13 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
e7a346
                                                   event_generation)) {
e7a346
                 afr_inode_refresh (frame, this, local->inode, local->loc.gfid,
e7a346
                                    afr_write_txn_refresh_done);
e7a346
-        } else {
e7a346
-                afr_transaction_start (frame, this);
e7a346
+                ret = 0;
e7a346
+                goto out;
e7a346
         }
e7a346
+
e7a346
+txn_start:
e7a346
         ret = 0;
e7a346
+        afr_transaction_start (local, this);
e7a346
 out:
e7a346
         return ret;
e7a346
 }
e7a346
diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h
e7a346
index ddcb1eb..a27e9a3 100644
e7a346
--- a/xlators/cluster/afr/src/afr-transaction.h
e7a346
+++ b/xlators/cluster/afr/src/afr-transaction.h
e7a346
@@ -17,12 +17,6 @@ void
e7a346
 afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
e7a346
 			    int child_index);
e7a346
 
e7a346
-int
e7a346
-afr_lock_server_count (afr_private_t *priv, afr_transaction_type type);
e7a346
-
e7a346
-afr_inodelk_t*
e7a346
-afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom);
e7a346
-
e7a346
 int32_t
e7a346
 afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type);
e7a346
 
e7a346
@@ -30,9 +24,6 @@ int
e7a346
 afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending);
e7a346
 
e7a346
 void
e7a346
-afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this);
e7a346
-
e7a346
-void
e7a346
 afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd);
e7a346
 
e7a346
 void
e7a346
@@ -57,4 +48,8 @@ afr_pick_error_xdata (afr_local_t *local, afr_private_t *priv,
e7a346
                       inode_t *inode2, unsigned char *readable2);
e7a346
 int
e7a346
 afr_transaction_resume (call_frame_t *frame, xlator_t *this);
e7a346
+int
e7a346
+afr_lock (call_frame_t *frame, xlator_t *this);
e7a346
+void
e7a346
+afr_delayed_changelog_wake_up_cbk (void *data);
e7a346
 #endif /* __TRANSACTION_H__ */
e7a346
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
e7a346
index 5ff57c0..6be59dc 100644
e7a346
--- a/xlators/cluster/afr/src/afr.h
e7a346
+++ b/xlators/cluster/afr/src/afr.h
e7a346
@@ -230,19 +230,12 @@ int
e7a346
 afr_entry_lockee_cmp (const void *l1, const void *l2);
e7a346
 
e7a346
 typedef struct {
e7a346
-        char    *domain; /* Domain on which inodelk is taken */
e7a346
-        struct gf_flock flock;
e7a346
-        unsigned char *locked_nodes;
e7a346
-        int32_t lock_count;
e7a346
-} afr_inodelk_t;
e7a346
-
e7a346
-typedef struct {
e7a346
         loc_t *lk_loc;
e7a346
 
e7a346
         int                     lockee_count;
e7a346
         afr_entry_lockee_t      lockee[AFR_LOCKEE_COUNT_MAX];
e7a346
 
e7a346
-        afr_inodelk_t       inodelk[AFR_DOM_COUNT_MAX];
e7a346
+        struct gf_flock flock;
e7a346
         const char *lk_basename;
e7a346
         const char *lower_basename;
e7a346
         const char *higher_basename;
e7a346
@@ -255,7 +248,6 @@ typedef struct {
e7a346
         int32_t lock_count;
e7a346
         int32_t entrylk_lock_count;
e7a346
 
e7a346
-        uint64_t lock_number;
e7a346
         int32_t lk_call_count;
e7a346
         int32_t lk_expected_count;
e7a346
         int32_t lk_attempted_count;
e7a346
@@ -292,37 +284,9 @@ typedef enum {
e7a346
 } afr_fd_open_status_t;
e7a346
 
e7a346
 typedef struct {
e7a346
-        unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS];
e7a346
-	int inherited[AFR_NUM_CHANGE_LOGS];
e7a346
-	int on_disk[AFR_NUM_CHANGE_LOGS];
e7a346
         afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */
e7a346
-
e7a346
-        unsigned int *lock_piggyback;
e7a346
-        unsigned int *lock_acquired;
e7a346
-
e7a346
         int flags;
e7a346
 
e7a346
-	/* used for delayed-post-op optimization */
e7a346
-	pthread_mutex_t    delay_lock;
e7a346
-	gf_timer_t        *delay_timer;
e7a346
-	call_frame_t      *delay_frame;
e7a346
-
e7a346
-	/* set if any write on this fd was a non stable write
e7a346
-	   (i.e, without O_SYNC or O_DSYNC)
e7a346
-	*/
e7a346
-	gf_boolean_t      witnessed_unstable_write;
e7a346
-
e7a346
-	/* @open_fd_count:
e7a346
-	   Number of open FDs queried from the server, as queried through
e7a346
-	   xdata in FOPs. Currently, used to decide if eager-locking must be
e7a346
-	   temporarily disabled.
e7a346
-	*/
e7a346
-        uint32_t        open_fd_count;
e7a346
-
e7a346
-
e7a346
-	/* list of frames currently in progress */
e7a346
-	struct list_head  eager_locked;
e7a346
-
e7a346
 	/* the subvolume on which the latest sequence of readdirs (starting
e7a346
 	   at offset 0) has begun. Till the next readdir request with 0 offset
e7a346
 	   arrives, we continue to read off this subvol.
e7a346
@@ -336,6 +300,20 @@ typedef enum {
e7a346
         AFR_FOP_LOCK_QUORUM_FAILED,
e7a346
 } afr_fop_lock_state_t;
e7a346
 
e7a346
+typedef struct _afr_inode_lock_t {
e7a346
+        unsigned int event_generation;
e7a346
+        gf_boolean_t    release;
e7a346
+        gf_boolean_t    acquired;
e7a346
+        gf_timer_t        *delay_timer;
e7a346
+        struct list_head  owners; /*Transactions that are performing fop*/
e7a346
+        struct list_head  post_op;/*Transactions that are done with the fop
e7a346
+                                   *So can not conflict with the fops*/
e7a346
+        struct list_head waiting;/*Transaction that are waiting for
e7a346
+                                   *conflicting transactions to complete*/
e7a346
+        struct list_head frozen;/*Transactions that need to go as part of
e7a346
+                                 * next batch of eager-lock*/
e7a346
+} afr_lock_t;
e7a346
+
e7a346
 typedef struct _afr_inode_ctx {
e7a346
         uint64_t        read_subvol;
e7a346
         uint64_t        write_subvol;
e7a346
@@ -343,6 +321,23 @@ typedef struct _afr_inode_ctx {
e7a346
         int             spb_choice;
e7a346
         gf_timer_t      *timer;
e7a346
         gf_boolean_t    need_refresh;
e7a346
+        unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS];
e7a346
+        int inherited[AFR_NUM_CHANGE_LOGS];
e7a346
+        int on_disk[AFR_NUM_CHANGE_LOGS];
e7a346
+
e7a346
+        /* set if any write on this fd was a non stable write
e7a346
+           (i.e, without O_SYNC or O_DSYNC)
e7a346
+        */
e7a346
+        gf_boolean_t      witnessed_unstable_write;
e7a346
+
e7a346
+        /* @open_fd_count:
e7a346
+           Number of open FDs queried from the server, as queried through
e7a346
+           xdata in FOPs. Currently, used to decide if eager-locking must be
e7a346
+           temporarily disabled.
e7a346
+        */
e7a346
+        uint32_t        open_fd_count;
e7a346
+        /*Only 2 types of transactions support eager-locks now. DATA/METADATA*/
e7a346
+        afr_lock_t lock[2];
e7a346
 } afr_inode_ctx_t;
e7a346
 
e7a346
 
e7a346
@@ -457,7 +452,6 @@ typedef struct _afr_local {
e7a346
         dict_t  *dict;
e7a346
 
e7a346
         int      optimistic_change_log;
e7a346
-	gf_boolean_t      delayed_post_op;
e7a346
 
e7a346
 	/* Is the current writev() going to perform a stable write?
e7a346
 	   i.e, is fd->flags or @flags writev param have O_SYNC or
e7a346
@@ -693,7 +687,7 @@ typedef struct _afr_local {
e7a346
                 off_t start, len;
e7a346
 
e7a346
                 gf_boolean_t    eager_lock_on;
e7a346
-                int *eager_lock;
e7a346
+                gf_boolean_t    do_eager_unlock;
e7a346
 
e7a346
                 char *basename;
e7a346
                 char *new_basename;
e7a346
@@ -707,7 +701,8 @@ typedef struct _afr_local {
e7a346
 		   of the transaction frame */
e7a346
 		call_stub_t      *resume_stub;
e7a346
 
e7a346
-		struct list_head  eager_locked;
e7a346
+		struct list_head  owner_list;
e7a346
+                struct list_head  wait_list;
e7a346
 
e7a346
                 unsigned char   *pre_op;
e7a346
 
e7a346
@@ -768,7 +763,8 @@ typedef struct _afr_local {
e7a346
 		*/
e7a346
 		afr_changelog_resume_t changelog_resume;
e7a346
 
e7a346
-                call_frame_t *main_frame;
e7a346
+                call_frame_t *main_frame; /*Fop frame*/
e7a346
+                call_frame_t *frame; /*Transaction frame*/
e7a346
 
e7a346
                 int (*wind) (call_frame_t *frame, xlator_t *this, int subvol);
e7a346
 
e7a346
@@ -1009,7 +1005,7 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd);
e7a346
 		afr_local_cleanup (frame->local, THIS);		       \
e7a346
 		mem_put (frame->local);				       \
e7a346
 		frame->local = NULL; };				       \
e7a346
-	frame->local;})
e7a346
+	frame->local; })
e7a346
 
e7a346
 #define AFR_STACK_RESET(frame)                                         \
e7a346
         do {                                                           \
e7a346
@@ -1096,22 +1092,10 @@ afr_filter_xattrs (dict_t *xattr);
e7a346
 #define AFR_QUORUM_AUTO INT_MAX
e7a346
 
e7a346
 int
e7a346
-afr_fd_report_unstable_write (xlator_t *this, fd_t *fd);
e7a346
+afr_fd_report_unstable_write (xlator_t *this, afr_local_t *local);
e7a346
 
e7a346
 gf_boolean_t
e7a346
-afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd);
e7a346
-
e7a346
-void
e7a346
-afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub);
e7a346
-
e7a346
-int
e7a346
-afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count);
e7a346
-
e7a346
-void
e7a346
-afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this);
e7a346
-
e7a346
-void
e7a346
-afr_remove_eager_lock_stub (afr_local_t *local);
e7a346
+afr_fd_has_witnessed_unstable_write (xlator_t *this, inode_t *inode);
e7a346
 
e7a346
 void
e7a346
 afr_reply_wipe (struct afr_reply *reply);
e7a346
-- 
e7a346
1.8.3.1
e7a346