a3470f
From 30fb0e640ae94d9591e9bb64800b0971e52d5416 Mon Sep 17 00:00:00 2001
a3470f
From: Pranith Kumar K <pkarampu@redhat.com>
a3470f
Date: Wed, 31 Jan 2018 16:41:14 +0530
a3470f
Subject: [PATCH 194/201] cluster/afr: Make AFR eager-locking similar to EC
a3470f
a3470f
Problem:
a3470f
1) Afr's eager-lock only works for data transactions.
a3470f
2) When there are conflicting writes, write with conflicting region initiates
a3470f
unlock of eager-lock leading to extra pre-ops and post-ops on the file. When
a3470f
eager-lock goes off, it leads to extra fsyncs for random-write workload in afr.
a3470f
a3470f
Solution (that is modeled after EC):
a3470f
In EC, when there is a conflicting write, it waits for the current write to
a3470f
complete before it winds the conflicted write. This leads to better utilization
a3470f
of network and disk, because we will not be doing extra xattrops and FSYNCs and
a3470f
inodelk/unlock. Moved fd based counters to inode based counters.
a3470f
a3470f
I tried to model the solution based on EC's locking, but it is not similar to
a3470f
AFR because we had to keep backward compatibility.
a3470f
a3470f
Lifecycle of lock:
a3470f
==================
a3470f
First transaction is added to inode->owners list and an inodelk will be sent on
a3470f
the wire. All the next transactions will be put in inode->waiters list until
a3470f
the first transaction completes inodelk and [f]xattrop completely.  Once
a3470f
[f]xattrop also completes, all the requests in the inode->waiters list are
a3470f
checked if they conflict with any of the existing locks which are in
a3470f
inode->owners list and if not are added to inode->owners list and resumed with
a3470f
doing transaction. When these transactions complete fop phase they will be
a3470f
moved to inode->post_op list and resume the transactions that were paused
a3470f
because of conflicts. Post-op and unlock will not be issued on the wire until
a3470f
that is the last transaction on that inode. Last transaction when it has to
a3470f
perform post-op can choose to sleep for post-op-delay-secs value. During that
a3470f
time if any other transaction comes, it will wake up the sleeping transaction
a3470f
and takes over the ownership of the lock and the cycle continues. If the
a3470f
post-op-delay-secs expires, then the timer thread will wake up the sleeping
a3470f
transaction and it will set lock->release to true and start doing post-op and
a3470f
then unlock. During this time if any other transactions come, they will be put
a3470f
in inode->frozen list. Once the previous unlock comes it will move the frozen
a3470f
list to waiters list and moves the first element from this waiters-list to
a3470f
owners-list and attempts the lock and the cycle continues. This is the general
a3470f
idea.  There is logic at the time of delaying and at the time of new
a3470f
transaction or in flush fop to wake up existing sleeping transactions or
a3470f
choosing whether to delay a transaction etc, which is subject to change based
a3470f
on future enhancements etc.
a3470f
a3470f
 >Fixes: #418
a3470f
 >BUG: 1549606
a3470f
a3470f
Upstream-patch: https://review.gluster.org/19503
a3470f
BUG: 1491785
a3470f
Change-Id: I88b570bbcf332a27c82d2767dfa82472f60055dc
a3470f
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
a3470f
Reviewed-on: https://code.engineering.redhat.com/gerrit/131945
a3470f
Tested-by: RHGS Build Bot <nigelb@redhat.com>
a3470f
---
a3470f
 tests/bugs/replicate/bug-966018.t              |  36 -
a3470f
 xlators/cluster/afr/src/afr-common.c           | 315 ++++-----
a3470f
 xlators/cluster/afr/src/afr-inode-write.c      |   6 +-
a3470f
 xlators/cluster/afr/src/afr-lk-common.c        | 348 +++-------
a3470f
 xlators/cluster/afr/src/afr-self-heal-common.c |  13 +-
a3470f
 xlators/cluster/afr/src/afr-self-heal-data.c   |  14 +-
a3470f
 xlators/cluster/afr/src/afr-self-heal.h        |   2 +-
a3470f
 xlators/cluster/afr/src/afr-transaction.c      | 913 ++++++++++++++-----------
a3470f
 xlators/cluster/afr/src/afr-transaction.h      |  13 +-
a3470f
 xlators/cluster/afr/src/afr.h                  |  96 ++-
a3470f
 10 files changed, 813 insertions(+), 943 deletions(-)
a3470f
 delete mode 100644 tests/bugs/replicate/bug-966018.t
a3470f
a3470f
diff --git a/tests/bugs/replicate/bug-966018.t b/tests/bugs/replicate/bug-966018.t
a3470f
deleted file mode 100644
a3470f
index 1b5296b..0000000
a3470f
--- a/tests/bugs/replicate/bug-966018.t
a3470f
+++ /dev/null
a3470f
@@ -1,36 +0,0 @@
a3470f
-#!/bin/bash
a3470f
-
a3470f
-. $(dirname $0)/../../include.rc
a3470f
-. $(dirname $0)/../../volume.rc
a3470f
-. $(dirname $0)/../../nfs.rc
a3470f
-
a3470f
-#This tests if cluster.eager-lock blocks metadata operations on nfs/fuse mounts.
a3470f
-#If it is not woken up, INODELK from the next command waits
a3470f
-#for post-op-delay secs.
a3470f
-
a3470f
-cleanup;
a3470f
-TEST glusterd
a3470f
-TEST pidof glusterd
a3470f
-
a3470f
-TEST $CLI volume create $V0 replica 2 $H0:$B0/r2_0 $H0:$B0/r2_1
a3470f
-TEST $CLI volume set $V0 ensure-durability off
a3470f
-TEST $CLI volume set $V0 cluster.eager-lock on
a3470f
-TEST $CLI volume set $V0 cluster.post-op-delay-secs 3
a3470f
-TEST $CLI volume set $V0 nfs.disable false
a3470f
-
a3470f
-TEST $CLI volume start $V0
a3470f
-TEST $CLI volume profile $V0 start
a3470f
-EXPECT_WITHIN $NFS_EXPORT_TIMEOUT "1" is_nfs_export_available;
a3470f
-TEST mount_nfs $H0:/$V0 $N0 nolock;
a3470f
-TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id=$V0 $M0
a3470f
-echo 1 > $N0/1 && chmod +x $N0/1
a3470f
-echo 1 > $M0/1 && chmod +x $M0/1
a3470f
-
a3470f
-#Check that INODELK MAX latency is not in the order of seconds
a3470f
-#Test if the MAX INODELK fop latency is of the order of seconds.
a3470f
-inodelk_max_latency=$($CLI volume profile $V0 info | grep INODELK | awk 'BEGIN {max = 0} {if ($6 > max) max=$6;} END {print max}' | cut -d. -f 1 | egrep "[0-9]{7,}")
a3470f
-
a3470f
-TEST [ -z $inodelk_max_latency ]
a3470f
-EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $N0
a3470f
-
a3470f
-cleanup;
a3470f
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
a3470f
index 06863b6..6025a60 100644
a3470f
--- a/xlators/cluster/afr/src/afr-common.c
a3470f
+++ b/xlators/cluster/afr/src/afr-common.c
a3470f
@@ -126,37 +126,77 @@ afr_is_possibly_under_txn (afr_transaction_type type, afr_local_t *local,
a3470f
         return _gf_false;
a3470f
 }
a3470f
 
a3470f
+static void
a3470f
+afr_inode_ctx_destroy (afr_inode_ctx_t *ctx)
a3470f
+{
a3470f
+        int i = 0;
a3470f
+
a3470f
+        if (!ctx)
a3470f
+                return;
a3470f
+
a3470f
+        for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
a3470f
+                GF_FREE (ctx->pre_op_done[i]);
a3470f
+        }
a3470f
+
a3470f
+        GF_FREE (ctx);
a3470f
+}
a3470f
+
a3470f
 int
a3470f
 __afr_inode_ctx_get (xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx)
a3470f
 {
a3470f
-        uint64_t                ctx_int = 0;
a3470f
-        int                     ret     = -1;
a3470f
-        afr_inode_ctx_t        *tmp_ctx = NULL;
a3470f
+        uint64_t        ctx_int   = 0;
a3470f
+        int             ret       = -1;
a3470f
+        int             i         = -1;
a3470f
+        int             num_locks = -1;
a3470f
+        afr_inode_ctx_t *ictx     = NULL;
a3470f
+        afr_lock_t      *lock     = NULL;
a3470f
+        afr_private_t   *priv     = this->private;
a3470f
 
a3470f
         ret = __inode_ctx_get (inode, this, &ctx_int);
a3470f
-        if (ret) {
a3470f
-                tmp_ctx = GF_CALLOC (1, sizeof (afr_inode_ctx_t),
a3470f
-                                     gf_afr_mt_inode_ctx_t);
a3470f
-                if (!tmp_ctx)
a3470f
-                        goto out;
a3470f
+        if (ret == 0) {
a3470f
+                *ctx = (afr_inode_ctx_t *)ctx_int;
a3470f
+                return 0;
a3470f
+        }
a3470f
 
a3470f
-                ctx_int = (long) tmp_ctx;
a3470f
-                ret = __inode_ctx_set (inode, this, &ctx_int);
a3470f
-                if (ret) {
a3470f
-                        GF_FREE (tmp_ctx);
a3470f
+        ictx = GF_CALLOC (1, sizeof (afr_inode_ctx_t), gf_afr_mt_inode_ctx_t);
a3470f
+        if (!ictx)
a3470f
+                goto out;
a3470f
+
a3470f
+        for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
a3470f
+                ictx->pre_op_done[i] = GF_CALLOC (sizeof *ictx->pre_op_done[i],
a3470f
+                                                  priv->child_count,
a3470f
+                                                  gf_afr_mt_int32_t);
a3470f
+                if (!ictx->pre_op_done[i]) {
a3470f
+                        ret = -ENOMEM;
a3470f
                         goto out;
a3470f
                 }
a3470f
-                tmp_ctx->spb_choice = -1;
a3470f
-                tmp_ctx->read_subvol = 0;
a3470f
-                tmp_ctx->write_subvol = 0;
a3470f
-                tmp_ctx->lock_count = 0;
a3470f
-        } else {
a3470f
-                tmp_ctx = (afr_inode_ctx_t *) ctx_int;
a3470f
         }
a3470f
 
a3470f
-        *ctx = tmp_ctx;
a3470f
+        num_locks = sizeof(ictx->lock)/sizeof(afr_lock_t);
a3470f
+        for (i = 0; i < num_locks; i++) {
a3470f
+                lock = &ictx->lock[i];
a3470f
+                INIT_LIST_HEAD (&lock->post_op);
a3470f
+                INIT_LIST_HEAD (&lock->frozen);
a3470f
+                INIT_LIST_HEAD (&lock->waiting);
a3470f
+                INIT_LIST_HEAD (&lock->owners);
a3470f
+        }
a3470f
+
a3470f
+        ctx_int = (uint64_t)ictx;
a3470f
+        ret = __inode_ctx_set (inode, this, &ctx_int);
a3470f
+        if (ret) {
a3470f
+                goto out;
a3470f
+        }
a3470f
+
a3470f
+        ictx->spb_choice = -1;
a3470f
+        ictx->read_subvol = 0;
a3470f
+        ictx->write_subvol = 0;
a3470f
+        ictx->lock_count = 0;
a3470f
         ret = 0;
a3470f
+        *ctx = ictx;
a3470f
 out:
a3470f
+        if (ret) {
a3470f
+                afr_inode_ctx_destroy (ictx);
a3470f
+        }
a3470f
         return ret;
a3470f
 }
a3470f
 
a3470f
@@ -1745,10 +1785,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
a3470f
 
a3470f
         GF_FREE (local->internal_lock.locked_nodes);
a3470f
 
a3470f
-        for (i = 0; local->internal_lock.inodelk[i].domain; i++) {
a3470f
-                GF_FREE (local->internal_lock.inodelk[i].locked_nodes);
a3470f
-        }
a3470f
-
a3470f
         GF_FREE (local->internal_lock.lower_locked_nodes);
a3470f
 
a3470f
         afr_entry_lockee_cleanup (&local->internal_lock);
a3470f
@@ -1765,7 +1801,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
a3470f
                 GF_FREE (local->transaction.changelog_xdata);
a3470f
         }
a3470f
 
a3470f
-        GF_FREE (local->transaction.eager_lock);
a3470f
         GF_FREE (local->transaction.failed_subvols);
a3470f
 
a3470f
         GF_FREE (local->transaction.basename);
a3470f
@@ -1812,16 +1847,6 @@ afr_local_replies_wipe (afr_local_t *local, afr_private_t *priv)
a3470f
 	memset (local->replies, 0, sizeof(*local->replies) * priv->child_count);
a3470f
 }
a3470f
 
a3470f
-void
a3470f
-afr_remove_eager_lock_stub (afr_local_t *local)
a3470f
-{
a3470f
-        LOCK (&local->fd->lock);
a3470f
-        {
a3470f
-                list_del_init (&local->transaction.eager_locked);
a3470f
-        }
a3470f
-        UNLOCK (&local->fd->lock);
a3470f
-}
a3470f
-
a3470f
 static gf_boolean_t
a3470f
 afr_fop_lock_is_unlock (call_frame_t *frame)
a3470f
 {
a3470f
@@ -1926,10 +1951,6 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this)
a3470f
 
a3470f
 	syncbarrier_destroy (&local->barrier);
a3470f
 
a3470f
-        if (local->transaction.eager_lock_on &&
a3470f
-            !list_empty (&local->transaction.eager_locked))
a3470f
-                afr_remove_eager_lock_stub (local);
a3470f
-
a3470f
         afr_local_transaction_cleanup (local, this);
a3470f
 
a3470f
         priv = this->private;
a3470f
@@ -3160,22 +3181,8 @@ out:
a3470f
 void
a3470f
 _afr_cleanup_fd_ctx (afr_fd_ctx_t *fd_ctx)
a3470f
 {
a3470f
-        int i = 0;
a3470f
-
a3470f
-
a3470f
-	for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++)
a3470f
-		GF_FREE (fd_ctx->pre_op_done[i]);
a3470f
-
a3470f
         GF_FREE (fd_ctx->opened_on);
a3470f
-
a3470f
-        GF_FREE (fd_ctx->lock_piggyback);
a3470f
-
a3470f
-        GF_FREE (fd_ctx->lock_acquired);
a3470f
-
a3470f
-	pthread_mutex_destroy (&fd_ctx->delay_lock);
a3470f
-
a3470f
         GF_FREE (fd_ctx);
a3470f
-
a3470f
         return;
a3470f
 }
a3470f
 
a3470f
@@ -3193,15 +3200,7 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd)
a3470f
         fd_ctx = (afr_fd_ctx_t *)(long) ctx;
a3470f
 
a3470f
         if (fd_ctx) {
a3470f
-                /*no need to take any locks*/
a3470f
-                if (!list_empty (&fd_ctx->eager_locked))
a3470f
-                        gf_msg (this->name, GF_LOG_WARNING, 0,
a3470f
-                                AFR_MSG_INVALID_DATA, "%s: Stale "
a3470f
-                                "Eager-lock stubs found",
a3470f
-                                uuid_utoa (fd->inode->gfid));
a3470f
-
a3470f
                 _afr_cleanup_fd_ctx (fd_ctx);
a3470f
-
a3470f
         }
a3470f
 
a3470f
 out:
a3470f
@@ -3282,23 +3281,6 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd)
a3470f
                 goto out;
a3470f
         }
a3470f
 
a3470f
-        ret = pthread_mutex_init (&fd_ctx->delay_lock, NULL);
a3470f
-        if (ret) {
a3470f
-                GF_FREE (fd_ctx);
a3470f
-                fd_ctx = NULL;
a3470f
-                goto out;
a3470f
-        }
a3470f
-
a3470f
-	for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
a3470f
-		fd_ctx->pre_op_done[i] = GF_CALLOC (sizeof (*fd_ctx->pre_op_done[i]),
a3470f
-						    priv->child_count,
a3470f
-						    gf_afr_mt_int32_t);
a3470f
-		if (!fd_ctx->pre_op_done[i]) {
a3470f
-			ret = -ENOMEM;
a3470f
-			goto out;
a3470f
-		}
a3470f
-	}
a3470f
-
a3470f
         fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on),
a3470f
                                        priv->child_count,
a3470f
                                        gf_afr_mt_int32_t);
a3470f
@@ -3314,26 +3296,8 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd)
a3470f
 			fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED;
a3470f
 	}
a3470f
 
a3470f
-        fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback),
a3470f
-                                            priv->child_count,
a3470f
-                                            gf_afr_mt_char);
a3470f
-        if (!fd_ctx->lock_piggyback) {
a3470f
-                ret = -ENOMEM;
a3470f
-                goto out;
a3470f
-        }
a3470f
-
a3470f
-        fd_ctx->lock_acquired = GF_CALLOC (sizeof (*fd_ctx->lock_acquired),
a3470f
-                                           priv->child_count,
a3470f
-                                           gf_afr_mt_char);
a3470f
-        if (!fd_ctx->lock_acquired) {
a3470f
-                ret = -ENOMEM;
a3470f
-                goto out;
a3470f
-        }
a3470f
-
a3470f
 	fd_ctx->readdir_subvol = -1;
a3470f
 
a3470f
-        INIT_LIST_HEAD (&fd_ctx->eager_locked);
a3470f
-
a3470f
         ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx);
a3470f
         if (ret)
a3470f
                 gf_msg_debug (this->name, 0,
a3470f
@@ -3405,12 +3369,70 @@ afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
a3470f
         return 0;
a3470f
 }
a3470f
 
a3470f
+afr_local_t*
a3470f
+afr_wakeup_same_fd_delayed_op (xlator_t *this, afr_lock_t *lock, fd_t *fd)
a3470f
+{
a3470f
+        afr_local_t *local = NULL;
a3470f
+
a3470f
+        if (lock->delay_timer) {
a3470f
+                local = list_entry(lock->post_op.next, afr_local_t,
a3470f
+                                   transaction.owner_list);
a3470f
+                if (fd == local->fd) {
a3470f
+                        if (gf_timer_call_cancel (this->ctx,
a3470f
+                                                  lock->delay_timer)) {
a3470f
+                                local = NULL;
a3470f
+                        } else {
a3470f
+                                lock->delay_timer = NULL;
a3470f
+                        }
a3470f
+                } else {
a3470f
+                        local = NULL;
a3470f
+                }
a3470f
+        }
a3470f
+
a3470f
+        return local;
a3470f
+}
a3470f
+
a3470f
+void
a3470f
+afr_delayed_changelog_wake_resume (xlator_t *this, inode_t *inode,
a3470f
+                                   call_stub_t *stub)
a3470f
+{
a3470f
+        afr_inode_ctx_t *ctx = NULL;
a3470f
+        afr_lock_t      *lock = NULL;
a3470f
+        afr_local_t     *metadata_local = NULL;
a3470f
+        afr_local_t     *data_local = NULL;
a3470f
+        LOCK (&inode->lock);
a3470f
+        {
a3470f
+                (void)__afr_inode_ctx_get (this, inode, &ctx);
a3470f
+                lock = &ctx->lock[AFR_DATA_TRANSACTION];
a3470f
+                data_local = afr_wakeup_same_fd_delayed_op (this, lock,
a3470f
+                                                            stub->args.fd);
a3470f
+                lock = &ctx->lock[AFR_METADATA_TRANSACTION];
a3470f
+                metadata_local = afr_wakeup_same_fd_delayed_op (this, lock,
a3470f
+                                                                stub->args.fd);
a3470f
+        }
a3470f
+        UNLOCK (&inode->lock);
a3470f
+
a3470f
+        if (data_local) {
a3470f
+                data_local->transaction.resume_stub = stub;
a3470f
+        } else if (metadata_local) {
a3470f
+                metadata_local->transaction.resume_stub = stub;
a3470f
+        } else {
a3470f
+                call_resume (stub);
a3470f
+        }
a3470f
+        if (data_local) {
a3470f
+                afr_delayed_changelog_wake_up_cbk (data_local);
a3470f
+        }
a3470f
+        if (metadata_local) {
a3470f
+                afr_delayed_changelog_wake_up_cbk (metadata_local);
a3470f
+        }
a3470f
+}
a3470f
+
a3470f
 int
a3470f
 afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
a3470f
 {
a3470f
-        afr_local_t   *local = NULL;
a3470f
-        call_stub_t   *stub = NULL;
a3470f
-        int            op_errno   = ENOMEM;
a3470f
+        afr_local_t *local   = NULL;
a3470f
+        call_stub_t *stub    = NULL;
a3470f
+        int         op_errno = ENOMEM;
a3470f
 
a3470f
 	local = AFR_FRAME_INIT (frame, op_errno);
a3470f
 	if (!local)
a3470f
@@ -3426,7 +3448,7 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
a3470f
         if (!stub)
a3470f
                 goto out;
a3470f
 
a3470f
-        afr_delayed_changelog_wake_resume (this, fd, stub);
a3470f
+        afr_delayed_changelog_wake_resume (this, fd->inode, stub);
a3470f
 
a3470f
 	return 0;
a3470f
 out:
a3470f
@@ -3434,7 +3456,6 @@ out:
a3470f
         return 0;
a3470f
 }
a3470f
 
a3470f
-
a3470f
 int
a3470f
 afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
a3470f
 		  int32_t op_ret, int32_t op_errno, dict_t *xdata)
a3470f
@@ -4497,7 +4518,7 @@ afr_forget (xlator_t *this, inode_t *inode)
a3470f
                 return 0;
a3470f
 
a3470f
         ctx = (afr_inode_ctx_t *)ctx_int;
a3470f
-        GF_FREE (ctx);
a3470f
+        afr_inode_ctx_destroy (ctx);
a3470f
         return 0;
a3470f
 }
a3470f
 
a3470f
@@ -5310,21 +5331,6 @@ out:
a3470f
 }
a3470f
 
a3470f
 int
a3470f
-afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count)
a3470f
-{
a3470f
-        int             ret = -ENOMEM;
a3470f
-
a3470f
-        lk->domain = dom;
a3470f
-        lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes),
a3470f
-                                      child_count, gf_afr_mt_char);
a3470f
-        if (NULL == lk->locked_nodes)
a3470f
-                goto out;
a3470f
-        ret = 0;
a3470f
-out:
a3470f
-        return ret;
a3470f
-}
a3470f
-
a3470f
-int
a3470f
 afr_transaction_local_init (afr_local_t *local, xlator_t *this)
a3470f
 {
a3470f
         int            ret = -ENOMEM;
a3470f
@@ -5335,25 +5341,9 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)
a3470f
         if (ret < 0)
a3470f
                 goto out;
a3470f
 
a3470f
-        if ((local->transaction.type == AFR_DATA_TRANSACTION) ||
a3470f
-            (local->transaction.type == AFR_METADATA_TRANSACTION)) {
a3470f
-                ret = afr_inodelk_init (&local->internal_lock.inodelk[0],
a3470f
-                                        this->name, priv->child_count);
a3470f
-                if (ret < 0)
a3470f
-                        goto out;
a3470f
-        }
a3470f
-
a3470f
         ret = -ENOMEM;
a3470f
 	local->pre_op_compat = priv->pre_op_compat;
a3470f
 
a3470f
-        local->transaction.eager_lock =
a3470f
-                GF_CALLOC (sizeof (*local->transaction.eager_lock),
a3470f
-                           priv->child_count,
a3470f
-                           gf_afr_mt_int32_t);
a3470f
-
a3470f
-        if (!local->transaction.eager_lock)
a3470f
-                goto out;
a3470f
-
a3470f
         local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op),
a3470f
                                                priv->child_count,
a3470f
                                                gf_afr_mt_char);
a3470f
@@ -5385,9 +5375,9 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)
a3470f
         if (!local->pending)
a3470f
                 goto out;
a3470f
 
a3470f
-	INIT_LIST_HEAD (&local->transaction.eager_locked);
a3470f
-
a3470f
         ret = 0;
a3470f
+        INIT_LIST_HEAD (&local->transaction.wait_list);
a3470f
+        INIT_LIST_HEAD (&local->transaction.owner_list);
a3470f
 out:
a3470f
         return ret;
a3470f
 }
a3470f
@@ -5422,24 +5412,6 @@ out:
a3470f
         return;
a3470f
 }
a3470f
 
a3470f
-void
a3470f
-afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this)
a3470f
-{
a3470f
-        afr_local_t     *local = NULL;
a3470f
-        afr_fd_ctx_t    *fd_ctx   = NULL;
a3470f
-
a3470f
-        local = frame->local;
a3470f
-
a3470f
-        if (!local->fd)
a3470f
-		return;
a3470f
-
a3470f
-	fd_ctx = afr_fd_ctx_get (local->fd, this);
a3470f
-	if (!fd_ctx)
a3470f
-		return;
a3470f
-
a3470f
-	fd_ctx->open_fd_count = local->open_fd_count;
a3470f
-}
a3470f
-
a3470f
 int**
a3470f
 afr_mark_pending_changelog (afr_private_t *priv, unsigned char *pending,
a3470f
                             dict_t *xattr, ia_type_t iat)
a3470f
@@ -5548,7 +5520,7 @@ out:
a3470f
 
a3470f
 int
a3470f
 afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,
a3470f
-                                  inode_t *inode, gf_boolean_t *dsh,
a3470f
+                                  fd_t *fd, gf_boolean_t *dsh,
a3470f
                                   gf_boolean_t *pflag)
a3470f
 {
a3470f
         int ret = -1;
a3470f
@@ -5558,8 +5530,8 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,
a3470f
         unsigned char *healed_sinks = NULL;
a3470f
         unsigned char *undid_pending = NULL;
a3470f
         afr_private_t   *priv = NULL;
a3470f
-        fd_t          *fd = NULL;
a3470f
         struct afr_reply *locked_replies = NULL;
a3470f
+        inode_t *inode = fd->inode;
a3470f
 
a3470f
         priv = this->private;
a3470f
         data_lock = alloca0 (priv->child_count);
a3470f
@@ -5568,18 +5540,6 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,
a3470f
         healed_sinks = alloca0 (priv->child_count);
a3470f
         undid_pending = alloca0 (priv->child_count);
a3470f
 
a3470f
-        /* Heal-info does an open() on the file being examined so that the
a3470f
-         * current eager-lock holding client, if present, at some point sees
a3470f
-         * open-fd count being > 1 and releases the eager-lock so that heal-info
a3470f
-         * doesn't remain blocked forever until IO completes.
a3470f
-         */
a3470f
-        ret = afr_selfheal_data_open (this, inode, &fd);
a3470f
-        if (ret < 0) {
a3470f
-                gf_msg_debug (this->name, -ret, "%s: Failed to open",
a3470f
-                              uuid_utoa (inode->gfid));
a3470f
-                goto out;
a3470f
-        }
a3470f
-
a3470f
         locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
a3470f
 
a3470f
         ret = afr_selfheal_inodelk (frame, this, inode, this->name,
a3470f
@@ -5602,8 +5562,6 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,
a3470f
 out:
a3470f
         if (locked_replies)
a3470f
                 afr_replies_wipe (locked_replies, priv->child_count);
a3470f
-        if (fd)
a3470f
-                fd_unref (fd);
a3470f
         return ret;
a3470f
 }
a3470f
 
a3470f
@@ -5688,6 +5646,7 @@ afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid,
a3470f
 
a3470f
 {
a3470f
         int ret             = -1;
a3470f
+        fd_t *fd            = NULL;
a3470f
         gf_boolean_t    dsh = _gf_false;
a3470f
         gf_boolean_t    msh = _gf_false;
a3470f
         gf_boolean_t    esh = _gf_false;
a3470f
@@ -5699,6 +5658,21 @@ afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid,
a3470f
 
a3470f
         /* For every heal type hold locks and check if it indeed needs heal */
a3470f
 
a3470f
+
a3470f
+        /* Heal-info does an open() on the file being examined so that the
a3470f
+         * current eager-lock holding client, if present, at some point sees
a3470f
+         * open-fd count being > 1 and releases the eager-lock so that heal-info
a3470f
+         * doesn't remain blocked forever until IO completes.
a3470f
+         */
a3470f
+        if ((*inode)->ia_type == IA_IFREG) {
a3470f
+                ret = afr_selfheal_data_open (this, *inode, &fd);
a3470f
+                if (ret < 0) {
a3470f
+                        gf_msg_debug (this->name, -ret, "%s: Failed to open",
a3470f
+                                      uuid_utoa ((*inode)->gfid));
a3470f
+                        goto out;
a3470f
+                }
a3470f
+        }
a3470f
+
a3470f
         if (msh) {
a3470f
                 ret = afr_selfheal_locked_metadata_inspect (frame, this,
a3470f
                                                             *inode, &msh,
a3470f
@@ -5708,7 +5682,7 @@ afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid,
a3470f
         }
a3470f
 
a3470f
         if (dsh) {
a3470f
-                ret = afr_selfheal_locked_data_inspect (frame, this, *inode,
a3470f
+                ret = afr_selfheal_locked_data_inspect (frame, this, fd,
a3470f
                                                         &dsh, pending);
a3470f
                 if (ret == -EIO || (ret == -EAGAIN))
a3470f
                         goto out;
a3470f
@@ -5723,6 +5697,8 @@ out:
a3470f
         *data_selfheal = dsh;
a3470f
         *entry_selfheal = esh;
a3470f
         *metadata_selfheal = msh;
a3470f
+        if (fd)
a3470f
+                fd_unref (fd);
a3470f
         return ret;
a3470f
 }
a3470f
 
a3470f
@@ -6352,6 +6328,7 @@ afr_write_subvol_reset (call_frame_t *frame, xlator_t *this)
a3470f
         local = frame->local;
a3470f
         LOCK(&local->inode->lock);
a3470f
         {
a3470f
+                GF_ASSERT (local->inode_ctx->lock_count > 0);
a3470f
                 local->inode_ctx->lock_count--;
a3470f
 
a3470f
                 if (!local->inode_ctx->lock_count)
a3470f
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
a3470f
index 2402bb2..b52b6ca 100644
a3470f
--- a/xlators/cluster/afr/src/afr-inode-write.c
a3470f
+++ b/xlators/cluster/afr/src/afr-inode-write.c
a3470f
@@ -341,14 +341,14 @@ afr_process_post_writev (call_frame_t *frame, xlator_t *this)
a3470f
                    the xattrs are not reliably pointing at
a3470f
                    a stale file.
a3470f
                 */
a3470f
-                afr_fd_report_unstable_write (this, local->fd);
a3470f
+                afr_fd_report_unstable_write (this, local);
a3470f
 
a3470f
         __afr_inode_write_finalize (frame, this);
a3470f
 
a3470f
         afr_writev_handle_short_writes (frame, this);
a3470f
 
a3470f
         if (local->update_open_fd_count)
a3470f
-                afr_handle_open_fd_count (frame, this);
a3470f
+                local->inode_ctx->open_fd_count = local->open_fd_count;
a3470f
 
a3470f
 }
a3470f
 
a3470f
@@ -2590,7 +2590,7 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
a3470f
         local->op = GF_FOP_FSYNC;
a3470f
         local->cont.fsync.datasync = datasync;
a3470f
 
a3470f
-	if (afr_fd_has_witnessed_unstable_write (this, fd)) {
a3470f
+	if (afr_fd_has_witnessed_unstable_write (this, fd->inode)) {
a3470f
 		/* don't care. we only wanted to CLEAR the bit */
a3470f
 	}
a3470f
 
a3470f
diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c
a3470f
index 260815f..be3de01 100644
a3470f
--- a/xlators/cluster/afr/src/afr-lk-common.c
a3470f
+++ b/xlators/cluster/afr/src/afr-lk-common.c
a3470f
@@ -52,31 +52,6 @@ afr_entry_lockee_cmp (const void *l1, const void *l2)
a3470f
 
a3470f
 int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index);
a3470f
 
a3470f
-static int
a3470f
-afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this);
a3470f
-
a3470f
-static uint64_t afr_lock_number = 1;
a3470f
-
a3470f
-static uint64_t
a3470f
-get_afr_lock_number ()
a3470f
-{
a3470f
-        return (++afr_lock_number);
a3470f
-}
a3470f
-
a3470f
-int
a3470f
-afr_set_lock_number (call_frame_t *frame, xlator_t *this)
a3470f
-{
a3470f
-        afr_local_t         *local    = NULL;
a3470f
-        afr_internal_lock_t *int_lock = NULL;
a3470f
-
a3470f
-        local    = frame->local;
a3470f
-        int_lock = &local->internal_lock;
a3470f
-
a3470f
-        int_lock->lock_number = get_afr_lock_number ();
a3470f
-
a3470f
-        return 0;
a3470f
-}
a3470f
-
a3470f
 void
a3470f
 afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner)
a3470f
 {
a3470f
@@ -203,21 +178,16 @@ initialize_inodelk_variables (call_frame_t *frame, xlator_t *this)
a3470f
         afr_local_t         *local    = NULL;
a3470f
         afr_internal_lock_t *int_lock = NULL;
a3470f
         afr_private_t       *priv     = NULL;
a3470f
-        afr_inodelk_t       *inodelk  = NULL;
a3470f
 
a3470f
         priv     = this->private;
a3470f
         local    = frame->local;
a3470f
         int_lock = &local->internal_lock;
a3470f
 
a3470f
-        inodelk = afr_get_inodelk (int_lock, int_lock->domain);
a3470f
-
a3470f
-        inodelk->lock_count    = 0;
a3470f
+        int_lock->lock_count    = 0;
a3470f
         int_lock->lk_attempted_count = 0;
a3470f
         int_lock->lock_op_ret   = -1;
a3470f
         int_lock->lock_op_errno = 0;
a3470f
 
a3470f
-        memset (inodelk->locked_nodes, 0,
a3470f
-                sizeof (*inodelk->locked_nodes) * priv->child_count);
a3470f
         memset (int_lock->locked_nodes, 0,
a3470f
                 sizeof (*int_lock->locked_nodes) * priv->child_count);
a3470f
 
a3470f
@@ -286,12 +256,7 @@ void
a3470f
 afr_update_uninodelk (afr_local_t *local, afr_internal_lock_t *int_lock,
a3470f
                     int32_t child_index)
a3470f
 {
a3470f
-        afr_inodelk_t       *inodelk = NULL;
a3470f
-
a3470f
-        inodelk = afr_get_inodelk (int_lock, int_lock->domain);
a3470f
-        inodelk->locked_nodes[child_index] &= LOCKED_NO;
a3470f
-        if (local->transaction.eager_lock)
a3470f
-                local->transaction.eager_lock[child_index] = 0;
a3470f
+        int_lock->locked_nodes[child_index] &= LOCKED_NO;
a3470f
 
a3470f
 }
a3470f
 
a3470f
@@ -331,35 +296,27 @@ static int
a3470f
 afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
a3470f
 {
a3470f
         afr_internal_lock_t *int_lock = NULL;
a3470f
-        afr_inodelk_t       *inodelk  = NULL;
a3470f
         afr_local_t         *local    = NULL;
a3470f
         afr_private_t       *priv     = NULL;
a3470f
         struct gf_flock flock = {0,};
a3470f
-        struct gf_flock full_flock = {0,};
a3470f
-        struct gf_flock *flock_use = NULL;
a3470f
         int call_count = 0;
a3470f
         int i = 0;
a3470f
-        int piggyback = 0;
a3470f
-        afr_fd_ctx_t        *fd_ctx      = NULL;
a3470f
-
a3470f
 
a3470f
         local    = frame->local;
a3470f
         int_lock = &local->internal_lock;
a3470f
         priv     = this->private;
a3470f
 
a3470f
-        inodelk = afr_get_inodelk (int_lock, int_lock->domain);
a3470f
-
a3470f
-        flock.l_start = inodelk->flock.l_start;
a3470f
-        flock.l_len   = inodelk->flock.l_len;
a3470f
+        flock.l_start = int_lock->flock.l_start;
a3470f
+        flock.l_len   = int_lock->flock.l_len;
a3470f
         flock.l_type  = F_UNLCK;
a3470f
 
a3470f
-        full_flock.l_type = F_UNLCK;
a3470f
-        call_count = afr_locked_nodes_count (inodelk->locked_nodes,
a3470f
+        call_count = afr_locked_nodes_count (int_lock->locked_nodes,
a3470f
                                              priv->child_count);
a3470f
 
a3470f
         int_lock->lk_call_count = call_count;
a3470f
 
a3470f
         if (!call_count) {
a3470f
+                GF_ASSERT (!local->transaction.do_eager_unlock);
a3470f
                 gf_msg_trace (this->name, 0,
a3470f
                               "No internal locks unlocked");
a3470f
 
a3470f
@@ -367,64 +324,28 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
a3470f
                 goto out;
a3470f
         }
a3470f
 
a3470f
-        if (local->fd)
a3470f
-                fd_ctx = afr_fd_ctx_get (local->fd, this);
a3470f
-
a3470f
         for (i = 0; i < priv->child_count; i++) {
a3470f
-                if ((inodelk->locked_nodes[i] & LOCKED_YES) != LOCKED_YES)
a3470f
+                if ((int_lock->locked_nodes[i] & LOCKED_YES) != LOCKED_YES)
a3470f
                         continue;
a3470f
 
a3470f
                 if (local->fd) {
a3470f
-                        flock_use = &flock;
a3470f
-                        if (!local->transaction.eager_lock[i]) {
a3470f
-                                goto wind;
a3470f
-                        }
a3470f
-
a3470f
-                        piggyback = 0;
a3470f
-
a3470f
-                        LOCK (&local->fd->lock);
a3470f
-                        {
a3470f
-                                if (fd_ctx->lock_piggyback[i]) {
a3470f
-                                        fd_ctx->lock_piggyback[i]--;
a3470f
-                                        piggyback = 1;
a3470f
-                                } else {
a3470f
-                                        fd_ctx->lock_acquired[i]--;
a3470f
-                                }
a3470f
-                        }
a3470f
-                        UNLOCK (&local->fd->lock);
a3470f
-
a3470f
-                        if (piggyback) {
a3470f
-                                afr_unlock_inodelk_cbk (frame, (void *) (long) i,
a3470f
-                                                        this, 1, 0, NULL);
a3470f
-                                if (!--call_count)
a3470f
-                                        break;
a3470f
-                                continue;
a3470f
-                        }
a3470f
-
a3470f
-                        flock_use = &full_flock;
a3470f
-                wind:
a3470f
                         STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk,
a3470f
                                            (void *) (long)i,
a3470f
                                            priv->children[i],
a3470f
                                            priv->children[i]->fops->finodelk,
a3470f
                                            int_lock->domain, local->fd,
a3470f
-                                           F_SETLK, flock_use, NULL);
a3470f
-
a3470f
-                        if (!--call_count)
a3470f
-                                break;
a3470f
-
a3470f
+                                           F_SETLK, &flock, NULL);
a3470f
                 } else {
a3470f
-
a3470f
                         STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk,
a3470f
                                            (void *) (long)i,
a3470f
                                            priv->children[i],
a3470f
                                            priv->children[i]->fops->inodelk,
a3470f
                                            int_lock->domain, &local->loc,
a3470f
                                            F_SETLK, &flock, NULL);
a3470f
-
a3470f
-                        if (!--call_count)
a3470f
-                                break;
a3470f
                 }
a3470f
+
a3470f
+                if (!--call_count)
a3470f
+                        break;
a3470f
         }
a3470f
 out:
a3470f
         return 0;
a3470f
@@ -512,6 +433,18 @@ out:
a3470f
 
a3470f
 }
a3470f
 
a3470f
+int32_t
a3470f
+afr_unlock_now (call_frame_t *frame, xlator_t *this)
a3470f
+{
a3470f
+        afr_local_t *local = frame->local;
a3470f
+
a3470f
+        if (afr_is_inodelk_transaction(local->transaction.type))
a3470f
+                afr_unlock_inodelk (frame, this);
a3470f
+        else
a3470f
+                afr_unlock_entrylk (frame, this);
a3470f
+        return 0;
a3470f
+}
a3470f
+
a3470f
 static int32_t
a3470f
 afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
a3470f
               int32_t op_ret, int32_t op_errno, dict_t *xdata)
a3470f
@@ -553,7 +486,7 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
a3470f
 
a3470f
         if ((op_ret == -1) &&
a3470f
             (op_errno == ENOSYS)) {
a3470f
-                afr_unlock (frame, this);
a3470f
+                afr_unlock_now (frame, this);
a3470f
         } else {
a3470f
                 if (op_ret == 0) {
a3470f
                         if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
a3470f
@@ -598,38 +531,6 @@ afr_blocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
a3470f
         return 0;
a3470f
 }
a3470f
 
a3470f
-static int
a3470f
-afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this)
a3470f
-{
a3470f
-        afr_internal_lock_t *int_lock = NULL;
a3470f
-        afr_inodelk_t       *inodelk  = NULL;
a3470f
-        afr_local_t         *local    = NULL;
a3470f
-        afr_private_t       *priv     = NULL;
a3470f
-
a3470f
-        priv     = this->private;
a3470f
-        local    = frame->local;
a3470f
-        int_lock = &local->internal_lock;
a3470f
-
a3470f
-        switch (local->transaction.type) {
a3470f
-        case AFR_DATA_TRANSACTION:
a3470f
-        case AFR_METADATA_TRANSACTION:
a3470f
-                inodelk = afr_get_inodelk (int_lock, int_lock->domain);
a3470f
-                memcpy (inodelk->locked_nodes, int_lock->locked_nodes,
a3470f
-                        sizeof (*inodelk->locked_nodes) * priv->child_count);
a3470f
-                inodelk->lock_count = int_lock->lock_count;
a3470f
-                break;
a3470f
-
a3470f
-        case AFR_ENTRY_RENAME_TRANSACTION:
a3470f
-        case AFR_ENTRY_TRANSACTION:
a3470f
-                /*entrylk_count is being used in both non-blocking and blocking
a3470f
-                 * modes */
a3470f
-                break;
a3470f
-        }
a3470f
-
a3470f
-        return 0;
a3470f
-
a3470f
-}
a3470f
-
a3470f
 static gf_boolean_t
a3470f
 afr_is_entrylk (afr_transaction_type trans_type)
a3470f
 {
a3470f
@@ -733,7 +634,6 @@ int
a3470f
 afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
a3470f
 {
a3470f
         afr_internal_lock_t *int_lock    = NULL;
a3470f
-        afr_inodelk_t       *inodelk     = NULL;
a3470f
         afr_local_t         *local       = NULL;
a3470f
         afr_private_t       *priv        = NULL;
a3470f
         struct gf_flock flock = {0,};
a3470f
@@ -752,10 +652,9 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
a3470f
 
a3470f
 
a3470f
         if (!is_entrylk) {
a3470f
-                inodelk = afr_get_inodelk (int_lock, int_lock->domain);
a3470f
-                flock.l_start = inodelk->flock.l_start;
a3470f
-                flock.l_len   = inodelk->flock.l_len;
a3470f
-                flock.l_type  = inodelk->flock.l_type;
a3470f
+                flock.l_start = int_lock->flock.l_start;
a3470f
+                flock.l_len   = int_lock->flock.l_len;
a3470f
+                flock.l_type  = int_lock->flock.l_type;
a3470f
         }
a3470f
 
a3470f
         if (local->fd) {
a3470f
@@ -770,9 +669,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
a3470f
                         local->op_ret           = -1;
a3470f
                         int_lock->lock_op_ret   = -1;
a3470f
 
a3470f
-                        afr_copy_locked_nodes (frame, this);
a3470f
-
a3470f
-                        afr_unlock (frame, this);
a3470f
+                        afr_unlock_now (frame, this);
a3470f
 
a3470f
                         return 0;
a3470f
                 }
a3470f
@@ -784,9 +681,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
a3470f
                         local->op_ret           = -1;
a3470f
                         int_lock->lock_op_ret   = -1;
a3470f
 
a3470f
-                        afr_copy_locked_nodes (frame, this);
a3470f
-
a3470f
-                        afr_unlock(frame, this);
a3470f
+                        afr_unlock_now(frame, this);
a3470f
 
a3470f
                         return 0;
a3470f
                 }
a3470f
@@ -798,8 +693,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
a3470f
                 gf_msg_debug (this->name, 0,
a3470f
                               "we're done locking");
a3470f
 
a3470f
-                afr_copy_locked_nodes (frame, this);
a3470f
-
a3470f
                 int_lock->lock_op_ret = 0;
a3470f
                 int_lock->lock_cbk (frame, this);
a3470f
                 return 0;
a3470f
@@ -815,7 +708,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
a3470f
         case AFR_METADATA_TRANSACTION:
a3470f
 
a3470f
                 if (local->fd) {
a3470f
-
a3470f
                         STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk,
a3470f
                                            (void *) (long) child_index,
a3470f
                                            priv->children[child_index],
a3470f
@@ -824,7 +716,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
a3470f
                                            F_SETLKW, &flock, NULL);
a3470f
 
a3470f
                 } else {
a3470f
-
a3470f
                         STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk,
a3470f
                                            (void *) (long) child_index,
a3470f
                                            priv->children[child_index],
a3470f
@@ -841,7 +732,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
a3470f
                  *and 'fd-less' children */
a3470f
 
a3470f
                 if (local->fd) {
a3470f
-
a3470f
                         STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk,
a3470f
                                            (void *) (long) cookie,
a3470f
                                            priv->children[child_index],
a3470f
@@ -850,7 +740,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
a3470f
                                            int_lock->lockee[lockee_no].basename,
a3470f
                                            ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
a3470f
                 } else {
a3470f
-
a3470f
                         STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk,
a3470f
                                            (void *) (long) cookie,
a3470f
                                            priv->children[child_index],
a3470f
@@ -922,7 +811,6 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
a3470f
         local    = frame->local;
a3470f
         int_lock = &local->internal_lock;
a3470f
 
a3470f
-
a3470f
 	LOCK (&frame->lock);
a3470f
 	{
a3470f
 		if (op_ret < 0 ) {
a3470f
@@ -969,7 +857,7 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
a3470f
                                       "with blocking calls",
a3470f
                                       int_lock->lock_count);
a3470f
 
a3470f
-                        afr_unlock(frame, this);
a3470f
+                        afr_unlock_now(frame, this);
a3470f
                 }
a3470f
         }
a3470f
 
a3470f
@@ -1009,7 +897,7 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
a3470f
                         local->op_errno         = EINVAL;
a3470f
                         int_lock->lock_op_errno = EINVAL;
a3470f
 
a3470f
-			afr_unlock (frame, this);
a3470f
+			afr_unlock_now (frame, this);
a3470f
                         return -1;
a3470f
                 }
a3470f
 
a3470f
@@ -1021,7 +909,7 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
a3470f
                         gf_msg (this->name, GF_LOG_INFO, 0,
a3470f
                                 AFR_MSG_INFO_COMMON,
a3470f
                                 "fd not open on any subvolumes. aborting.");
a3470f
-                        afr_unlock (frame, this);
a3470f
+                        afr_unlock_now (frame, this);
a3470f
                         goto out;
a3470f
                 }
a3470f
 
a3470f
@@ -1031,7 +919,6 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
a3470f
                         index = i%copies;
a3470f
                         lockee_no = i/copies;
a3470f
                         if (local->child_up[index]) {
a3470f
-
a3470f
                                 STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk,
a3470f
                                                    (void *) (long) i,
a3470f
                                                    priv->children[index],
a3470f
@@ -1053,7 +940,6 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
a3470f
                         index = i%copies;
a3470f
                         lockee_no = i/copies;
a3470f
                         if (local->child_up[index]) {
a3470f
-
a3470f
                                 STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk,
a3470f
                                                    (void *) (long) i,
a3470f
                                                    priv->children[index],
a3470f
@@ -1077,18 +963,12 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
a3470f
                              int32_t op_ret, int32_t op_errno, dict_t *xdata)
a3470f
 {
a3470f
         afr_internal_lock_t *int_lock    = NULL;
a3470f
-        afr_inodelk_t       *inodelk     = NULL;
a3470f
         afr_local_t         *local       = NULL;
a3470f
-        afr_fd_ctx_t        *fd_ctx      = NULL;
a3470f
         int                  call_count  = 0;
a3470f
         int                  child_index = (long) cookie;
a3470f
 
a3470f
         local    = frame->local;
a3470f
         int_lock = &local->internal_lock;
a3470f
-        inodelk = afr_get_inodelk (int_lock, int_lock->domain);
a3470f
-
a3470f
-	if (local->fd)
a3470f
-		fd_ctx = afr_fd_ctx_get (local->fd, this);
a3470f
 
a3470f
         LOCK (&frame->lock);
a3470f
         {
a3470f
@@ -1105,43 +985,27 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
a3470f
 				int_lock->lock_op_errno      = op_errno;
a3470f
 				local->op_errno              = op_errno;
a3470f
 			}
a3470f
-			if (local->transaction.eager_lock)
a3470f
-				local->transaction.eager_lock[child_index] = 0;
a3470f
 		} else {
a3470f
-			inodelk->locked_nodes[child_index] |= LOCKED_YES;
a3470f
-			inodelk->lock_count++;
a3470f
-
a3470f
-			if (local->transaction.eager_lock &&
a3470f
-			    local->transaction.eager_lock[child_index] &&
a3470f
-			    local->fd) {
a3470f
-				/* piggybacked */
a3470f
-				if (op_ret == 1) {
a3470f
-					/* piggybacked */
a3470f
-				} else if (op_ret == 0) {
a3470f
-					/* lock acquired from server */
a3470f
-                                        fd_ctx->lock_acquired[child_index]++;
a3470f
-				}
a3470f
-			}
a3470f
-
a3470f
-                        if (local->transaction.type == AFR_DATA_TRANSACTION &&
a3470f
-                            op_ret == 0) {
a3470f
-                                LOCK(&local->inode->lock);
a3470f
-                                {
a3470f
-                                        local->inode_ctx->lock_count++;
a3470f
-                                }
a3470f
-                                UNLOCK (&local->inode->lock);
a3470f
-                        }
a3470f
+			int_lock->locked_nodes[child_index] |= LOCKED_YES;
a3470f
+			int_lock->lock_count++;
a3470f
 		}
a3470f
 
a3470f
                 call_count = --int_lock->lk_call_count;
a3470f
         }
a3470f
         UNLOCK (&frame->lock);
a3470f
 
a3470f
+        if (op_ret == 0 && local->transaction.type == AFR_DATA_TRANSACTION) {
a3470f
+                LOCK (&local->inode->lock);
a3470f
+                {
a3470f
+                        local->inode_ctx->lock_count++;
a3470f
+                }
a3470f
+                UNLOCK (&local->inode->lock);
a3470f
+        }
a3470f
         if (call_count == 0) {
a3470f
                 gf_msg_trace (this->name, 0,
a3470f
                               "Last inode locking reply received");
a3470f
                 /* all locks successful. Proceed to call FOP */
a3470f
-                if (inodelk->lock_count == int_lock->lk_expected_count) {
a3470f
+                if (int_lock->lock_count == int_lock->lk_expected_count) {
a3470f
                         gf_msg_trace (this->name, 0,
a3470f
                                       "All servers locked. Calling the cbk");
a3470f
                         int_lock->lock_op_ret = 0;
a3470f
@@ -1155,7 +1019,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
a3470f
                                       "Trying again with blocking calls",
a3470f
                                       int_lock->lock_count);
a3470f
 
a3470f
-                        afr_unlock(frame, this);
a3470f
+                        afr_unlock_now(frame, this);
a3470f
                 }
a3470f
         }
a3470f
 
a3470f
@@ -1166,30 +1030,17 @@ int
a3470f
 afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
a3470f
 {
a3470f
         afr_internal_lock_t *int_lock = NULL;
a3470f
-        afr_inodelk_t       *inodelk  = NULL;
a3470f
         afr_local_t         *local    = NULL;
a3470f
         afr_private_t       *priv     = NULL;
a3470f
         afr_fd_ctx_t        *fd_ctx   = NULL;
a3470f
         int32_t             call_count = 0;
a3470f
         int                 i          = 0;
a3470f
         int                 ret        = 0;
a3470f
-        struct              gf_flock flock = {0,};
a3470f
-        struct              gf_flock full_flock = {0,};
a3470f
-        struct              gf_flock *flock_use = NULL;
a3470f
-        int                 piggyback = 0;
a3470f
 
a3470f
         local    = frame->local;
a3470f
         int_lock = &local->internal_lock;
a3470f
         priv     = this->private;
a3470f
 
a3470f
-        inodelk = afr_get_inodelk (int_lock, int_lock->domain);
a3470f
-
a3470f
-        flock.l_start = inodelk->flock.l_start;
a3470f
-        flock.l_len   = inodelk->flock.l_len;
a3470f
-        flock.l_type  = inodelk->flock.l_type;
a3470f
-
a3470f
-        full_flock.l_type = inodelk->flock.l_type;
a3470f
-
a3470f
         initialize_inodelk_variables (frame, this);
a3470f
 
a3470f
         if (local->fd) {
a3470f
@@ -1205,88 +1056,48 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
a3470f
                         local->op_errno         = EINVAL;
a3470f
                         int_lock->lock_op_errno = EINVAL;
a3470f
 
a3470f
-			afr_unlock (frame, this);
a3470f
+			afr_unlock_now (frame, this);
a3470f
                         ret = -1;
a3470f
                         goto out;
a3470f
                 }
a3470f
+        }
a3470f
 
a3470f
-                call_count = internal_lock_count (frame, this);
a3470f
-                int_lock->lk_call_count = call_count;
a3470f
-                int_lock->lk_expected_count = call_count;
a3470f
-
a3470f
-                if (!call_count) {
a3470f
-                        gf_msg (this->name, GF_LOG_INFO, 0,
a3470f
-                                AFR_MSG_SUBVOLS_DOWN,
a3470f
-                                "All bricks are down, aborting.");
a3470f
-                        afr_unlock (frame, this);
a3470f
-                        goto out;
a3470f
-                }
a3470f
-
a3470f
-                /* Send non-blocking inodelk calls only on up children
a3470f
-                   and where the fd has been opened */
a3470f
-                for (i = 0; i < priv->child_count; i++) {
a3470f
-                        if (!local->child_up[i])
a3470f
-                                continue;
a3470f
-
a3470f
-                        flock_use = &flock;
a3470f
-                        if (!local->transaction.eager_lock_on) {
a3470f
-                                goto wind;
a3470f
-                        }
a3470f
-
a3470f
-                        piggyback = 0;
a3470f
-                        local->transaction.eager_lock[i] = 1;
a3470f
-
a3470f
-			afr_set_delayed_post_op (frame, this);
a3470f
+        call_count = internal_lock_count (frame, this);
a3470f
+        int_lock->lk_call_count = call_count;
a3470f
+        int_lock->lk_expected_count = call_count;
a3470f
 
a3470f
-                        LOCK (&local->fd->lock);
a3470f
-                        {
a3470f
-                                if (fd_ctx->lock_acquired[i]) {
a3470f
-                                        fd_ctx->lock_piggyback[i]++;
a3470f
-                                        piggyback = 1;
a3470f
-                                }
a3470f
-                        }
a3470f
-                        UNLOCK (&local->fd->lock);
a3470f
+        if (!call_count) {
a3470f
+                gf_msg (this->name, GF_LOG_INFO, 0,
a3470f
+                        AFR_MSG_SUBVOLS_DOWN,
a3470f
+                        "All bricks are down, aborting.");
a3470f
+                afr_unlock_now (frame, this);
a3470f
+                goto out;
a3470f
+        }
a3470f
 
a3470f
-                        if (piggyback) {
a3470f
-                                /* (op_ret == 1) => indicate piggybacked lock */
a3470f
-                                afr_nonblocking_inodelk_cbk (frame, (void *) (long) i,
a3470f
-                                                             this, 1, 0, NULL);
a3470f
-                                if (!--call_count)
a3470f
-                                        break;
a3470f
-                                continue;
a3470f
-                        }
a3470f
-                        flock_use = &full_flock;
a3470f
-                wind:
a3470f
+        /* Send non-blocking inodelk calls only on up children
a3470f
+           and where the fd has been opened */
a3470f
+        for (i = 0; i < priv->child_count; i++) {
a3470f
+                if (!local->child_up[i])
a3470f
+                        continue;
a3470f
 
a3470f
+                if (local->fd) {
a3470f
                         STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk,
a3470f
                                            (void *) (long) i,
a3470f
                                            priv->children[i],
a3470f
                                            priv->children[i]->fops->finodelk,
a3470f
                                            int_lock->domain, local->fd,
a3470f
-                                           F_SETLK, flock_use, NULL);
a3470f
-
a3470f
-                        if (!--call_count)
a3470f
-                                break;
a3470f
-                }
a3470f
-        } else {
a3470f
-                call_count = internal_lock_count (frame, this);
a3470f
-                int_lock->lk_call_count = call_count;
a3470f
-                int_lock->lk_expected_count = call_count;
a3470f
-
a3470f
-                for (i = 0; i < priv->child_count; i++) {
a3470f
-                        if (!local->child_up[i])
a3470f
-                                continue;
a3470f
+                                           F_SETLK, &int_lock->flock, NULL);
a3470f
+                } else {
a3470f
 
a3470f
                         STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk,
a3470f
                                            (void *) (long) i,
a3470f
                                            priv->children[i],
a3470f
                                            priv->children[i]->fops->inodelk,
a3470f
                                            int_lock->domain, &local->loc,
a3470f
-                                           F_SETLK, &flock, NULL);
a3470f
-
a3470f
-                        if (!--call_count)
a3470f
-                                break;
a3470f
+                                           F_SETLK, &int_lock->flock, NULL);
a3470f
                 }
a3470f
+                if (!--call_count)
a3470f
+                        break;
a3470f
         }
a3470f
 out:
a3470f
         return ret;
a3470f
@@ -1296,13 +1107,32 @@ int32_t
a3470f
 afr_unlock (call_frame_t *frame, xlator_t *this)
a3470f
 {
a3470f
         afr_local_t *local = NULL;
a3470f
+        afr_lock_t  *lock  = NULL;
a3470f
 
a3470f
         local = frame->local;
a3470f
 
a3470f
-        if (afr_is_inodelk_transaction(local->transaction.type))
a3470f
-                afr_unlock_inodelk (frame, this);
a3470f
-        else
a3470f
-                afr_unlock_entrylk (frame, this);
a3470f
+        if (!local->transaction.eager_lock_on)
a3470f
+                goto out;
a3470f
+        lock = &local->inode_ctx->lock[local->transaction.type];
a3470f
+        LOCK (&local->inode->lock);
a3470f
+        {
a3470f
+                list_del_init (&local->transaction.owner_list);
a3470f
+                if (list_empty (&lock->owners) && list_empty (&lock->post_op)) {
a3470f
+                        local->transaction.do_eager_unlock = _gf_true;
a3470f
+        /*TODO: Need to get metadata use on_disk and inherit/uninherit
a3470f
+         *GF_ASSERT (!local->inode_ctx->on_disk[local->transaction.type]);
a3470f
+         *GF_ASSERT (!local->inode_ctx->inherited[local->transaction.type]);
a3470f
+        */
a3470f
+                        GF_ASSERT (lock->release);
a3470f
+                }
a3470f
+        }
a3470f
+        UNLOCK (&local->inode->lock);
a3470f
+        if (!local->transaction.do_eager_unlock) {
a3470f
+                local->internal_lock.lock_cbk (frame, this);
a3470f
+                return 0;
a3470f
+        }
a3470f
 
a3470f
+out:
a3470f
+        afr_unlock_now (frame, this);
a3470f
         return 0;
a3470f
 }
a3470f
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
a3470f
index f61b237..32fd24a 100644
a3470f
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
a3470f
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
a3470f
@@ -2463,6 +2463,7 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid)
a3470f
         int           data_ret          = 1;
a3470f
         int           or_ret            = 0;
a3470f
         inode_t      *inode             = NULL;
a3470f
+        fd_t         *fd                = NULL;
a3470f
 	gf_boolean_t  data_selfheal     = _gf_false;
a3470f
 	gf_boolean_t  metadata_selfheal = _gf_false;
a3470f
 	gf_boolean_t  entry_selfheal    = _gf_false;
a3470f
@@ -2487,8 +2488,16 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid)
a3470f
                 goto out;
a3470f
         }
a3470f
 
a3470f
+        if (inode->ia_type == IA_IFREG) {
a3470f
+                ret = afr_selfheal_data_open (this, inode, &fd);
a3470f
+                if (!fd) {
a3470f
+                        ret = -EIO;
a3470f
+                        goto out;
a3470f
+                }
a3470f
+        }
a3470f
+
a3470f
 	if (data_selfheal && dataheal_enabled)
a3470f
-                data_ret = afr_selfheal_data (frame, this, inode);
a3470f
+                data_ret = afr_selfheal_data (frame, this, fd);
a3470f
 
a3470f
 	if (metadata_selfheal && priv->metadata_self_heal)
a3470f
                 metadata_ret = afr_selfheal_metadata (frame, this, inode);
a3470f
@@ -2510,6 +2519,8 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid)
a3470f
 out:
a3470f
         if (inode)
a3470f
                 inode_unref (inode);
a3470f
+        if (fd)
a3470f
+                fd_unref (fd);
a3470f
         return ret;
a3470f
 }
a3470f
 /*
a3470f
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
a3470f
index bcd0dec..f872a98 100644
a3470f
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
a3470f
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
a3470f
@@ -856,22 +856,15 @@ out:
a3470f
 }
a3470f
 
a3470f
 int
a3470f
-afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode)
a3470f
+afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd)
a3470f
 {
a3470f
 	afr_private_t *priv = NULL;
a3470f
 	unsigned char *locked_on = NULL;
a3470f
 	int ret = 0;
a3470f
-	fd_t *fd = NULL;
a3470f
+        inode_t *inode = fd->inode;
a3470f
 
a3470f
 	priv = this->private;
a3470f
 
a3470f
-	ret = afr_selfheal_data_open (this, inode, &fd);
a3470f
-	if (!fd) {
a3470f
-                gf_msg_debug (this->name, -ret, "%s: Failed to open",
a3470f
-                              uuid_utoa (inode->gfid));
a3470f
-                return -EIO;
a3470f
-        }
a3470f
-
a3470f
 	locked_on = alloca0 (priv->child_count);
a3470f
 
a3470f
 	ret = afr_selfheal_tie_breaker_inodelk (frame, this, inode,
a3470f
@@ -898,8 +891,5 @@ unlock:
a3470f
 	afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0,
a3470f
 	                        locked_on);
a3470f
 
a3470f
-	if (fd)
a3470f
-		fd_unref (fd);
a3470f
-
a3470f
 	return ret;
a3470f
 }
a3470f
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
a3470f
index 188a334..b015976 100644
a3470f
--- a/xlators/cluster/afr/src/afr-self-heal.h
a3470f
+++ b/xlators/cluster/afr/src/afr-self-heal.h
a3470f
@@ -102,7 +102,7 @@ afr_selfheal_name (xlator_t *this, uuid_t gfid, const char *name,
a3470f
                    void *gfid_req, dict_t *xdata);
a3470f
 
a3470f
 int
a3470f
-afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode);
a3470f
+afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd);
a3470f
 
a3470f
 int
a3470f
 afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode);
a3470f
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
a3470f
index acbfe1a..993029d 100644
a3470f
--- a/xlators/cluster/afr/src/afr-transaction.c
a3470f
+++ b/xlators/cluster/afr/src/afr-transaction.c
a3470f
@@ -25,6 +25,18 @@ typedef enum {
a3470f
         AFR_TRANSACTION_POST_OP,
a3470f
 } afr_xattrop_type_t;
a3470f
 
a3470f
+static void
a3470f
+afr_lock_resume_shared (struct list_head *list);
a3470f
+
a3470f
+void
a3470f
+__afr_transaction_wake_shared (afr_local_t *local, struct list_head *shared);
a3470f
+
a3470f
+void
a3470f
+afr_changelog_post_op (call_frame_t *frame, xlator_t *this);
a3470f
+
a3470f
+int
a3470f
+afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this);
a3470f
+
a3470f
 gf_boolean_t
a3470f
 afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this);
a3470f
 
a3470f
@@ -168,13 +180,14 @@ afr_transaction_fop (call_frame_t *frame, xlator_t *this)
a3470f
         return 0;
a3470f
 }
a3470f
 
a3470f
-
a3470f
 int
a3470f
 afr_transaction_done (call_frame_t *frame, xlator_t *this)
a3470f
 {
a3470f
-        afr_local_t *local = NULL;
a3470f
-        afr_private_t *priv = NULL;
a3470f
-        gf_boolean_t unwind = _gf_false;
a3470f
+        afr_local_t   *local      = NULL;
a3470f
+        afr_private_t *priv       = NULL;
a3470f
+        gf_boolean_t  unwind      = _gf_false;
a3470f
+        afr_lock_t    *lock       = NULL;
a3470f
+        afr_local_t   *lock_local = NULL;
a3470f
 
a3470f
         priv  = this->private;
a3470f
         local = frame->local;
a3470f
@@ -188,6 +201,31 @@ afr_transaction_done (call_frame_t *frame, xlator_t *this)
a3470f
                 if (unwind)/*It definitely did post-op*/
a3470f
                         afr_zero_fill_stat (local);
a3470f
         }
a3470f
+
a3470f
+        if (local->transaction.do_eager_unlock) {
a3470f
+                lock = &local->inode_ctx->lock[local->transaction.type];
a3470f
+                LOCK (&local->inode->lock);
a3470f
+                {
a3470f
+                        lock->acquired = _gf_false;
a3470f
+                        lock->release = _gf_false;
a3470f
+                        list_splice_init (&lock->frozen,
a3470f
+                                          &lock->waiting);
a3470f
+                        if (list_empty (&lock->waiting))
a3470f
+                                goto unlock;
a3470f
+                        lock_local = list_entry (lock->waiting.next,
a3470f
+                                                 afr_local_t,
a3470f
+                                                transaction.wait_list);
a3470f
+                        list_del_init (&lock_local->transaction.wait_list);
a3470f
+                        list_add (&lock_local->transaction.owner_list,
a3470f
+                                  &lock->owners);
a3470f
+                }
a3470f
+unlock:
a3470f
+                UNLOCK (&local->inode->lock);
a3470f
+        }
a3470f
+        if (lock_local) {
a3470f
+                afr_lock (lock_local->transaction.frame,
a3470f
+                          lock_local->transaction.frame->this);
a3470f
+        }
a3470f
         local->transaction.unwind (frame, this);
a3470f
 
a3470f
         AFR_STACK_DESTROY (frame);
a3470f
@@ -195,6 +233,52 @@ afr_transaction_done (call_frame_t *frame, xlator_t *this)
a3470f
         return 0;
a3470f
 }
a3470f
 
a3470f
+static void
a3470f
+afr_lock_fail_shared (afr_local_t *local, struct list_head *list)
a3470f
+{
a3470f
+        afr_local_t *each = NULL;
a3470f
+
a3470f
+        while (!list_empty(list)) {
a3470f
+                each = list_entry (list->next, afr_local_t,
a3470f
+                                   transaction.wait_list);
a3470f
+                list_del_init(&each->transaction.wait_list);
a3470f
+                each->op_ret = -1;
a3470f
+                each->op_errno = local->op_errno;
a3470f
+                afr_transaction_done (each->transaction.frame,
a3470f
+                                      each->transaction.frame->this);
a3470f
+        }
a3470f
+}
a3470f
+
a3470f
+static void
a3470f
+afr_handle_lock_acquire_failure (afr_local_t *local, gf_boolean_t locked)
a3470f
+{
a3470f
+        struct list_head shared;
a3470f
+        afr_lock_t *lock = NULL;
a3470f
+
a3470f
+        if (!local->transaction.eager_lock_on)
a3470f
+                goto out;
a3470f
+
a3470f
+        lock = &local->inode_ctx->lock[local->transaction.type];
a3470f
+
a3470f
+        INIT_LIST_HEAD (&shared);
a3470f
+        LOCK (&local->inode->lock);
a3470f
+        {
a3470f
+                list_splice_init (&lock->waiting, &shared);
a3470f
+        }
a3470f
+        UNLOCK (&local->inode->lock);
a3470f
+
a3470f
+        afr_lock_fail_shared (local, &shared);
a3470f
+        local->transaction.do_eager_unlock = _gf_true;
a3470f
+out:
a3470f
+        if (locked) {
a3470f
+                local->internal_lock.lock_cbk = afr_transaction_done;
a3470f
+                afr_unlock (local->transaction.frame,
a3470f
+                            local->transaction.frame->this);
a3470f
+        } else {
a3470f
+                afr_transaction_done (local->transaction.frame,
a3470f
+                                      local->transaction.frame->this);
a3470f
+        }
a3470f
+}
a3470f
 
a3470f
 call_frame_t*
a3470f
 afr_transaction_detach_fop_frame (call_frame_t *frame)
a3470f
@@ -334,6 +418,7 @@ afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this)
a3470f
         afr_local_t *local = NULL;
a3470f
         afr_private_t *priv = NULL;
a3470f
         int pre_op_sources_count = 0;
a3470f
+        int i = 0;
a3470f
 
a3470f
         priv = this->private;
a3470f
         local = frame->local;
a3470f
@@ -345,11 +430,11 @@ afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this)
a3470f
         /* If arbiter is the only source, do not proceed. */
a3470f
         if (pre_op_sources_count < 2 &&
a3470f
             local->transaction.pre_op_sources[ARBITER_BRICK_INDEX]) {
a3470f
-                local->internal_lock.lock_cbk = afr_transaction_done;
a3470f
                 local->op_ret = -1;
a3470f
                 local->op_errno =  ENOTCONN;
a3470f
-                afr_restore_lk_owner (frame);
a3470f
-                afr_unlock (frame, this);
a3470f
+                for (i = 0; i < priv->child_count; i++)
a3470f
+                        local->transaction.failed_subvols[i] = 1;
a3470f
+                afr_changelog_post_op (frame, this);/*uninherit should happen*/
a3470f
         } else {
a3470f
                 afr_transaction_fop (frame, this);
a3470f
         }
a3470f
@@ -362,14 +447,16 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
a3470f
 {
a3470f
         afr_local_t   *local = NULL;
a3470f
         afr_private_t *priv  = NULL;
a3470f
-        fd_t          *fd    = NULL;
a3470f
         int           i      = 0;
a3470f
         int           ret    = 0;
a3470f
+        int     failure_count = 0;
a3470f
+        struct list_head shared;
a3470f
+        afr_lock_t *lock = NULL;
a3470f
 
a3470f
         local = frame->local;
a3470f
         priv = this->private;
a3470f
-        fd    = local->fd;
a3470f
 
a3470f
+        INIT_LIST_HEAD (&shared);
a3470f
         if (local->transaction.type == AFR_DATA_TRANSACTION &&
a3470f
             !local->transaction.inherited) {
a3470f
                 ret = afr_write_subvol_set (frame, this);
a3470f
@@ -394,22 +481,31 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
a3470f
 		   just now, before OP */
a3470f
 		afr_changelog_pre_op_update (frame, this);
a3470f
 
a3470f
-        /* The wake up needs to happen independent of
a3470f
-           what type of fop arrives here. If it was
a3470f
-           a write, then it has already inherited the
a3470f
-           lock and changelog. If it was not a write,
a3470f
-           then the presumption of the optimization (of
a3470f
-           optimizing for successive write operations)
a3470f
-           fails.
a3470f
-        */
a3470f
-        if (fd)
a3470f
-                afr_delayed_changelog_wake_up (this, fd);
a3470f
+        if (!local->transaction.eager_lock_on ||
a3470f
+            local->transaction.inherited)
a3470f
+                goto fop;
a3470f
+        failure_count = AFR_COUNT (local->transaction.failed_subvols,
a3470f
+                                   priv->child_count);
a3470f
+        if (failure_count == priv->child_count) {
a3470f
+                afr_handle_lock_acquire_failure (local, _gf_true);
a3470f
+        } else {
a3470f
+                lock = &local->inode_ctx->lock[local->transaction.type];
a3470f
+                LOCK (&local->inode->lock);
a3470f
+                {
a3470f
+                        lock->acquired = _gf_true;
a3470f
+                        __afr_transaction_wake_shared (local, &shared);
a3470f
+                }
a3470f
+                UNLOCK (&local->inode->lock);
a3470f
+        }
a3470f
+
a3470f
+fop:
a3470f
         if (priv->arbiter_count == 1) {
a3470f
                 afr_txn_arbitrate_fop (frame, this);
a3470f
         } else {
a3470f
                 afr_transaction_fop (frame, this);
a3470f
         }
a3470f
 
a3470f
+        afr_lock_resume_shared (&shared);
a3470f
 	return 0;
a3470f
 }
a3470f
 
a3470f
@@ -486,30 +582,14 @@ afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this)
a3470f
 }
a3470f
 
a3470f
 
a3470f
-afr_inodelk_t*
a3470f
-afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom)
a3470f
-{
a3470f
-        afr_inodelk_t *inodelk = NULL;
a3470f
-        int           i = 0;
a3470f
-
a3470f
-        for (i = 0; int_lock->inodelk[i].domain; i++) {
a3470f
-                inodelk = &int_lock->inodelk[i];
a3470f
-                if (strcmp (dom, inodelk->domain) == 0)
a3470f
-                        return inodelk;
a3470f
-        }
a3470f
-        return NULL;
a3470f
-}
a3470f
-
a3470f
 unsigned char*
a3470f
 afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock)
a3470f
 {
a3470f
         unsigned char *locked_nodes = NULL;
a3470f
-        afr_inodelk_t *inodelk = NULL;
a3470f
         switch (type) {
a3470f
         case AFR_DATA_TRANSACTION:
a3470f
         case AFR_METADATA_TRANSACTION:
a3470f
-                inodelk = afr_get_inodelk (int_lock, int_lock->domain);
a3470f
-                locked_nodes = inodelk->locked_nodes;
a3470f
+                locked_nodes = int_lock->locked_nodes;
a3470f
         break;
a3470f
 
a3470f
         case AFR_ENTRY_TRANSACTION:
a3470f
@@ -834,27 +914,19 @@ afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this)
a3470f
 {
a3470f
 	afr_local_t *local = NULL;
a3470f
 	afr_private_t *priv = NULL;
a3470f
-	fd_t *fd = NULL;
a3470f
+        afr_inode_ctx_t *ctx = NULL;
a3470f
 	int i = 0;
a3470f
 	gf_boolean_t ret = _gf_false;
a3470f
-	afr_fd_ctx_t *fd_ctx = NULL;
a3470f
 	int type = 0;
a3470f
 
a3470f
 	local = frame->local;
a3470f
 	priv = this->private;
a3470f
-	fd = local->fd;
a3470f
+        ctx = local->inode_ctx;
a3470f
 
a3470f
 	type = afr_index_for_transaction_type (local->transaction.type);
a3470f
 	if (type != AFR_DATA_TRANSACTION)
a3470f
 		return !local->transaction.dirtied;
a3470f
 
a3470f
-	if (!fd)
a3470f
-		return !local->transaction.dirtied;
a3470f
-
a3470f
-	fd_ctx = afr_fd_ctx_get (fd, this);
a3470f
-	if (!fd_ctx)
a3470f
-		return _gf_false;
a3470f
-
a3470f
 	if (local->transaction.no_uninherit)
a3470f
 		return _gf_false;
a3470f
 
a3470f
@@ -868,34 +940,34 @@ afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this)
a3470f
 	if (local->transaction.uninherit_done)
a3470f
 		return local->transaction.uninherit_value;
a3470f
 
a3470f
-	LOCK(&fd->lock);
a3470f
+	LOCK(&local->inode->lock);
a3470f
 	{
a3470f
 		for (i = 0; i < priv->child_count; i++) {
a3470f
 			if (local->transaction.pre_op[i] !=
a3470f
-			    fd_ctx->pre_op_done[type][i]) {
a3470f
+			    ctx->pre_op_done[type][i]) {
a3470f
 				ret = !local->transaction.dirtied;
a3470f
 				goto unlock;
a3470f
 			}
a3470f
 		}
a3470f
 
a3470f
-		if (fd_ctx->inherited[type]) {
a3470f
+		if (ctx->inherited[type]) {
a3470f
 			ret = _gf_true;
a3470f
-			fd_ctx->inherited[type]--;
a3470f
-		} else if (fd_ctx->on_disk[type]) {
a3470f
+			ctx->inherited[type]--;
a3470f
+		} else if (ctx->on_disk[type]) {
a3470f
 			ret = _gf_false;
a3470f
-			fd_ctx->on_disk[type]--;
a3470f
+			ctx->on_disk[type]--;
a3470f
 		} else {
a3470f
 			/* ASSERT */
a3470f
 			ret = _gf_false;
a3470f
 		}
a3470f
 
a3470f
-		if (!fd_ctx->inherited[type] && !fd_ctx->on_disk[type]) {
a3470f
+		if (!ctx->inherited[type] && !ctx->on_disk[type]) {
a3470f
 			for (i = 0; i < priv->child_count; i++)
a3470f
-				fd_ctx->pre_op_done[type][i] = 0;
a3470f
+				ctx->pre_op_done[type][i] = 0;
a3470f
 		}
a3470f
 	}
a3470f
 unlock:
a3470f
-	UNLOCK(&fd->lock);
a3470f
+	UNLOCK(&local->inode->lock);
a3470f
 
a3470f
 	local->transaction.uninherit_done = _gf_true;
a3470f
 	local->transaction.uninherit_value = ret;
a3470f
@@ -909,31 +981,21 @@ afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this)
a3470f
 {
a3470f
 	afr_local_t *local = NULL;
a3470f
 	afr_private_t *priv = NULL;
a3470f
-	fd_t *fd = NULL;
a3470f
 	int i = 0;
a3470f
 	gf_boolean_t ret = _gf_false;
a3470f
-	afr_fd_ctx_t *fd_ctx = NULL;
a3470f
 	int type = 0;
a3470f
 
a3470f
 	local = frame->local;
a3470f
 	priv = this->private;
a3470f
-	fd = local->fd;
a3470f
 
a3470f
 	if (local->transaction.type != AFR_DATA_TRANSACTION)
a3470f
 		return _gf_false;
a3470f
 
a3470f
 	type = afr_index_for_transaction_type (local->transaction.type);
a3470f
 
a3470f
-	if (!fd)
a3470f
-		return _gf_false;
a3470f
-
a3470f
-	fd_ctx = afr_fd_ctx_get (fd, this);
a3470f
-	if (!fd_ctx)
a3470f
-		return _gf_false;
a3470f
-
a3470f
-	LOCK(&fd->lock);
a3470f
+	LOCK(&local->inode->lock);
a3470f
 	{
a3470f
-		if (!fd_ctx->on_disk[type]) {
a3470f
+		if (!local->inode_ctx->on_disk[type]) {
a3470f
 			/* nothing to inherit yet */
a3470f
 			ret = _gf_false;
a3470f
 			goto unlock;
a3470f
@@ -941,21 +1003,21 @@ afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this)
a3470f
 
a3470f
 		for (i = 0; i < priv->child_count; i++) {
a3470f
 			if (local->transaction.pre_op[i] !=
a3470f
-			    fd_ctx->pre_op_done[type][i]) {
a3470f
+			    local->inode_ctx->pre_op_done[type][i]) {
a3470f
 				/* either inherit exactly, or don't */
a3470f
 				ret = _gf_false;
a3470f
 				goto unlock;
a3470f
 			}
a3470f
 		}
a3470f
 
a3470f
-		fd_ctx->inherited[type]++;
a3470f
+		local->inode_ctx->inherited[type]++;
a3470f
 
a3470f
 		ret = _gf_true;
a3470f
 
a3470f
 		local->transaction.inherited = _gf_true;
a3470f
 	}
a3470f
 unlock:
a3470f
-	UNLOCK(&fd->lock);
a3470f
+	UNLOCK(&local->inode->lock);
a3470f
 
a3470f
 	return ret;
a3470f
 }
a3470f
@@ -966,22 +1028,16 @@ afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this)
a3470f
 {
a3470f
 	afr_local_t *local = NULL;
a3470f
 	afr_private_t *priv = NULL;
a3470f
-	fd_t *fd = NULL;
a3470f
-	afr_fd_ctx_t *fd_ctx = NULL;
a3470f
 	int i = 0;
a3470f
 	gf_boolean_t ret = _gf_false;
a3470f
 	int type = 0;
a3470f
 
a3470f
 	local = frame->local;
a3470f
 	priv = this->private;
a3470f
-	fd = local->fd;
a3470f
 
a3470f
-	if (!fd)
a3470f
-		return _gf_false;
a3470f
-
a3470f
-	fd_ctx = afr_fd_ctx_get (fd, this);
a3470f
-	if (!fd_ctx)
a3470f
-		return _gf_false;
a3470f
+        if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
a3470f
+            local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION)
a3470f
+                return _gf_false;
a3470f
 
a3470f
 	if (local->transaction.inherited)
a3470f
 		/* was already inherited in afr_changelog_pre_op */
a3470f
@@ -997,26 +1053,26 @@ afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this)
a3470f
 
a3470f
 	ret = _gf_false;
a3470f
 
a3470f
-	LOCK(&fd->lock);
a3470f
+	LOCK(&local->inode->lock);
a3470f
 	{
a3470f
-		if (!fd_ctx->on_disk[type]) {
a3470f
+		if (!local->inode_ctx->on_disk[type]) {
a3470f
 			for (i = 0; i < priv->child_count; i++)
a3470f
-				fd_ctx->pre_op_done[type][i] =
a3470f
+				local->inode_ctx->pre_op_done[type][i] =
a3470f
                                         (!local->transaction.failed_subvols[i]);
a3470f
 		} else {
a3470f
 			for (i = 0; i < priv->child_count; i++)
a3470f
-				if (fd_ctx->pre_op_done[type][i] !=
a3470f
+				if (local->inode_ctx->pre_op_done[type][i] !=
a3470f
 				    (!local->transaction.failed_subvols[i])) {
a3470f
 					local->transaction.no_uninherit = 1;
a3470f
 					goto unlock;
a3470f
 				}
a3470f
 		}
a3470f
-		fd_ctx->on_disk[type]++;
a3470f
+		local->inode_ctx->on_disk[type]++;
a3470f
 
a3470f
 		ret = _gf_true;
a3470f
 	}
a3470f
 unlock:
a3470f
-	UNLOCK(&fd->lock);
a3470f
+	UNLOCK(&local->inode->lock);
a3470f
 
a3470f
 	return ret;
a3470f
 }
a3470f
@@ -1324,6 +1380,9 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
a3470f
 
a3470f
         afr_init_optimistic_changelog_for_txn (this, local);
a3470f
 
a3470f
+        if (afr_changelog_pre_op_inherit (frame, this))
a3470f
+                goto next;
a3470f
+
a3470f
         /* This condition should not be met with present code, as
a3470f
          * transaction.done will be called if locks are not acquired on even a
a3470f
          * single node.
a3470f
@@ -1349,9 +1408,6 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
a3470f
 		goto err;
a3470f
 	}
a3470f
 
a3470f
-	if (afr_changelog_pre_op_inherit (frame, this))
a3470f
-		goto next;
a3470f
-
a3470f
         if (call_count < priv->child_count)
a3470f
                 pre_nop = _gf_false;
a3470f
 
a3470f
@@ -1408,7 +1464,7 @@ err:
a3470f
 	local->op_ret = -1;
a3470f
 	local->op_errno = op_errno;
a3470f
 
a3470f
-	afr_unlock (frame, this);
a3470f
+        afr_handle_lock_acquire_failure (local, _gf_true);
a3470f
 
a3470f
 	if (xdata_req)
a3470f
 		dict_unref (xdata_req);
a3470f
@@ -1418,31 +1474,6 @@ err:
a3470f
 
a3470f
 
a3470f
 int
a3470f
-afr_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
a3470f
-{
a3470f
-        afr_internal_lock_t *int_lock = NULL;
a3470f
-        afr_local_t         *local    = NULL;
a3470f
-
a3470f
-        local    = frame->local;
a3470f
-        int_lock = &local->internal_lock;
a3470f
-
a3470f
-        if (int_lock->lock_op_ret < 0) {
a3470f
-                gf_msg (this->name, GF_LOG_INFO,
a3470f
-                        0, AFR_MSG_BLOCKING_LKS_FAILED,
a3470f
-                        "Blocking inodelks failed.");
a3470f
-                afr_transaction_done (frame, this);
a3470f
-        } else {
a3470f
-
a3470f
-                gf_msg_debug (this->name, 0,
a3470f
-                              "Blocking inodelks done. Proceeding to FOP");
a3470f
-                afr_internal_lock_finish (frame, this);
a3470f
-        }
a3470f
-
a3470f
-        return 0;
a3470f
-}
a3470f
-
a3470f
-
a3470f
-int
a3470f
 afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
a3470f
 {
a3470f
         afr_internal_lock_t *int_lock = NULL;
a3470f
@@ -1455,7 +1486,7 @@ afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
a3470f
         if (int_lock->lock_op_ret < 0) {
a3470f
                 gf_msg_debug (this->name, 0,
a3470f
                               "Non blocking inodelks failed. Proceeding to blocking");
a3470f
-                int_lock->lock_cbk = afr_post_blocking_inodelk_cbk;
a3470f
+                int_lock->lock_cbk = afr_internal_lock_finish;
a3470f
                 afr_blocking_lock (frame, this);
a3470f
         } else {
a3470f
 
a3470f
@@ -1469,31 +1500,6 @@ afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
a3470f
 
a3470f
 
a3470f
 int
a3470f
-afr_post_blocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
a3470f
-{
a3470f
-        afr_internal_lock_t *int_lock = NULL;
a3470f
-        afr_local_t         *local    = NULL;
a3470f
-
a3470f
-        local    = frame->local;
a3470f
-        int_lock = &local->internal_lock;
a3470f
-
a3470f
-        if (int_lock->lock_op_ret < 0) {
a3470f
-                gf_msg (this->name, GF_LOG_INFO, 0,
a3470f
-                        AFR_MSG_BLOCKING_LKS_FAILED,
a3470f
-                        "Blocking entrylks failed.");
a3470f
-                afr_transaction_done (frame, this);
a3470f
-        } else {
a3470f
-
a3470f
-                gf_msg_debug (this->name, 0,
a3470f
-                             "Blocking entrylks done. Proceeding to FOP");
a3470f
-                afr_internal_lock_finish (frame, this);
a3470f
-        }
a3470f
-
a3470f
-        return 0;
a3470f
-}
a3470f
-
a3470f
-
a3470f
-int
a3470f
 afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
a3470f
 {
a3470f
         afr_internal_lock_t *int_lock = NULL;
a3470f
@@ -1506,7 +1512,7 @@ afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
a3470f
         if (int_lock->lock_op_ret < 0) {
a3470f
                 gf_msg_debug (this->name, 0,
a3470f
                               "Non blocking entrylks failed. Proceeding to blocking");
a3470f
-                int_lock->lock_cbk = afr_post_blocking_entrylk_cbk;
a3470f
+                int_lock->lock_cbk = afr_internal_lock_finish;
a3470f
                 afr_blocking_lock (frame, this);
a3470f
         } else {
a3470f
 
a3470f
@@ -1567,29 +1573,28 @@ int
a3470f
 afr_set_transaction_flock (xlator_t *this, afr_local_t *local)
a3470f
 {
a3470f
         afr_internal_lock_t *int_lock = NULL;
a3470f
-        afr_inodelk_t       *inodelk  = NULL;
a3470f
         afr_private_t       *priv     = NULL;
a3470f
 
a3470f
         int_lock = &local->internal_lock;
a3470f
-        inodelk = afr_get_inodelk (int_lock, int_lock->domain);
a3470f
         priv = this->private;
a3470f
 
a3470f
-        if ((priv->arbiter_count || priv->full_lock) &&
a3470f
+        if ((priv->arbiter_count || local->transaction.eager_lock_on ||
a3470f
+             priv->full_lock) &&
a3470f
             local->transaction.type == AFR_DATA_TRANSACTION) {
a3470f
                 /*Lock entire file to avoid network split brains.*/
a3470f
-                inodelk->flock.l_len   = 0;
a3470f
-                inodelk->flock.l_start = 0;
a3470f
+                int_lock->flock.l_len   = 0;
a3470f
+                int_lock->flock.l_start = 0;
a3470f
         } else {
a3470f
-                inodelk->flock.l_len   = local->transaction.len;
a3470f
-                inodelk->flock.l_start = local->transaction.start;
a3470f
+                int_lock->flock.l_len   = local->transaction.len;
a3470f
+                int_lock->flock.l_start = local->transaction.start;
a3470f
         }
a3470f
-        inodelk->flock.l_type  = F_WRLCK;
a3470f
+        int_lock->flock.l_type  = F_WRLCK;
a3470f
 
a3470f
         return 0;
a3470f
 }
a3470f
 
a3470f
 int
a3470f
-afr_lock_rec (call_frame_t *frame, xlator_t *this)
a3470f
+afr_lock (call_frame_t *frame, xlator_t *this)
a3470f
 {
a3470f
         afr_internal_lock_t *int_lock = NULL;
a3470f
         afr_local_t         *local    = NULL;
a3470f
@@ -1630,74 +1635,153 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this)
a3470f
         return 0;
a3470f
 }
a3470f
 
a3470f
+static gf_boolean_t
a3470f
+afr_locals_overlap (afr_local_t *local1, afr_local_t *local2)
a3470f
+{
a3470f
+        uint64_t start1 = local1->transaction.start;
a3470f
+        uint64_t start2 = local2->transaction.start;
a3470f
+        uint64_t end1 = 0;
a3470f
+        uint64_t end2 = 0;
a3470f
+
a3470f
+        if (local1->transaction.len)
a3470f
+                end1 = start1 + local1->transaction.len - 1;
a3470f
+        else
a3470f
+                end1 = ULLONG_MAX;
a3470f
+
a3470f
+        if (local2->transaction.len)
a3470f
+                end2 = start2 + local2->transaction.len - 1;
a3470f
+        else
a3470f
+                end2 = ULLONG_MAX;
a3470f
 
a3470f
-int
a3470f
-afr_lock (call_frame_t *frame, xlator_t *this)
a3470f
+        return ((end1 >= start2) && (end2 >= start1));
a3470f
+}
a3470f
+
a3470f
+gf_boolean_t
a3470f
+afr_has_lock_conflict (afr_local_t *local, gf_boolean_t waitlist_check)
a3470f
 {
a3470f
-        afr_set_lock_number (frame, this);
a3470f
+        afr_local_t     *each = NULL;
a3470f
+        afr_lock_t      *lock = NULL;
a3470f
 
a3470f
-        return afr_lock_rec (frame, this);
a3470f
+        lock = &local->inode_ctx->lock[local->transaction.type];
a3470f
+        /*
a3470f
+         * Once full file lock is acquired in eager-lock phase, overlapping
a3470f
+         * writes do not compete for inode-locks, instead are transferred to the
a3470f
+         * next writes. Because of this overlapping writes are not ordered.
a3470f
+         * This can cause inconsistencies in replication.
a3470f
+         * Example:
a3470f
+         * Two overlapping writes w1, w2 are sent in parallel on same fd
a3470f
+         * in two threads t1, t2.
a3470f
+         * Both threads can execute afr_writev_wind in the following manner.
a3470f
+         * t1 winds w1 on brick-0
a3470f
+         * t2 winds w2 on brick-0
a3470f
+         * t2 winds w2 on brick-1
a3470f
+         * t1 winds w1 on brick-1
a3470f
+         *
a3470f
+         * This check makes sure the locks are not transferred for
a3470f
+         * overlapping writes.
a3470f
+         */
a3470f
+        list_for_each_entry (each, &lock->owners, transaction.owner_list) {
a3470f
+                if (afr_locals_overlap (each, local)) {
a3470f
+                        return _gf_true;
a3470f
+                }
a3470f
+        }
a3470f
+
a3470f
+        if (!waitlist_check)
a3470f
+                return _gf_false;
a3470f
+        list_for_each_entry (each, &lock->waiting, transaction.wait_list) {
a3470f
+                if (afr_locals_overlap (each, local)) {
a3470f
+                        return _gf_true;
a3470f
+                }
a3470f
+        }
a3470f
+        return _gf_false;
a3470f
 }
a3470f
 
a3470f
 
a3470f
 /* }}} */
a3470f
-
a3470f
-int
a3470f
-afr_internal_lock_finish (call_frame_t *frame, xlator_t *this)
a3470f
+static void
a3470f
+afr_copy_inodelk_vars (afr_internal_lock_t *dst, afr_internal_lock_t *src,
a3470f
+                       xlator_t *this)
a3470f
 {
a3470f
-        afr_changelog_pre_op (frame, this);
a3470f
+        afr_private_t *priv = this->private;
a3470f
 
a3470f
-        return 0;
a3470f
+        dst->domain = src->domain;
a3470f
+        dst->flock.l_len  = src->flock.l_len;
a3470f
+        dst->flock.l_start  = src->flock.l_start;
a3470f
+        dst->flock.l_type  = src->flock.l_type;
a3470f
+        dst->lock_count = src->lock_count;
a3470f
+        memcpy (dst->locked_nodes, src->locked_nodes,
a3470f
+                priv->child_count * sizeof (*dst->locked_nodes));
a3470f
 }
a3470f
 
a3470f
-
a3470f
 void
a3470f
-afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this)
a3470f
+__afr_transaction_wake_shared (afr_local_t *local, struct list_head *shared)
a3470f
 {
a3470f
-        afr_local_t    *local = NULL;
a3470f
-        afr_private_t  *priv = NULL;
a3470f
+        gf_boolean_t conflict = _gf_false;
a3470f
+        afr_local_t *each = NULL;
a3470f
+        afr_lock_t *lock = &local->inode_ctx->lock[local->transaction.type];
a3470f
 
a3470f
-        /* call this function from any of the related optimizations
a3470f
-           which benefit from delaying post op are enabled, namely:
a3470f
-
a3470f
-           - changelog piggybacking
a3470f
-           - eager locking
a3470f
-        */
a3470f
+        while (!conflict) {
a3470f
+                if (list_empty (&lock->waiting))
a3470f
+                        return;
a3470f
+                each = list_entry(lock->waiting.next, afr_local_t,
a3470f
+                                  transaction.wait_list);
a3470f
+                if (afr_has_lock_conflict (each, _gf_false)) {
a3470f
+                        conflict = _gf_true;
a3470f
+                }
a3470f
+                if (conflict && !list_empty (&lock->owners))
a3470f
+                        return;
a3470f
+                afr_copy_inodelk_vars (&each->internal_lock,
a3470f
+                                       &local->internal_lock,
a3470f
+                                       each->transaction.frame->this);
a3470f
+                list_move_tail (&each->transaction.wait_list, shared);
a3470f
+                list_add_tail(&each->transaction.owner_list, &lock->owners);
a3470f
+        }
a3470f
+}
a3470f
 
a3470f
-        priv = this->private;
a3470f
-        if (!priv)
a3470f
-                return;
a3470f
+static void
a3470f
+afr_lock_resume_shared (struct list_head *list)
a3470f
+{
a3470f
+        afr_local_t *each = NULL;
a3470f
 
a3470f
-        if (!priv->post_op_delay_secs)
a3470f
-                return;
a3470f
+        while (!list_empty(list)) {
a3470f
+                each = list_entry(list->next, afr_local_t,
a3470f
+                                  transaction.wait_list);
a3470f
+                list_del_init(&each->transaction.wait_list);
a3470f
+                afr_changelog_pre_op (each->transaction.frame,
a3470f
+                                      each->transaction.frame->this);
a3470f
+        }
a3470f
+}
a3470f
 
a3470f
-        local = frame->local;
a3470f
-        if (!local)
a3470f
-                return;
a3470f
+int
a3470f
+afr_internal_lock_finish (call_frame_t *frame, xlator_t *this)
a3470f
+{
a3470f
+        afr_local_t *local = frame->local;
a3470f
+        afr_lock_t   *lock  = NULL;
a3470f
 
a3470f
-        if (!local->transaction.eager_lock_on)
a3470f
-                return;
a3470f
 
a3470f
-        if (!local->fd)
a3470f
-                return;
a3470f
+        local->internal_lock.lock_cbk = NULL;
a3470f
+        if (!local->transaction.eager_lock_on) {
a3470f
+                if (local->internal_lock.lock_op_ret < 0) {
a3470f
+                        afr_transaction_done (frame, this);
a3470f
+                        return 0;
a3470f
+                }
a3470f
+                afr_changelog_pre_op (frame, this);
a3470f
+        } else {
a3470f
+                lock = &local->inode_ctx->lock[local->transaction.type];
a3470f
+                if (local->internal_lock.lock_op_ret < 0) {
a3470f
+                        afr_handle_lock_acquire_failure (local, _gf_false);
a3470f
+                } else {
a3470f
+                        lock->event_generation = local->event_generation;
a3470f
+                        afr_changelog_pre_op (frame, this);
a3470f
+                }
a3470f
+        }
a3470f
 
a3470f
-        if (local->op == GF_FOP_WRITE)
a3470f
-                local->delayed_post_op = _gf_true;
a3470f
+        return 0;
a3470f
 }
a3470f
 
a3470f
 gf_boolean_t
a3470f
-afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this)
a3470f
+afr_are_multiple_fds_opened (afr_local_t *local, xlator_t *this)
a3470f
 {
a3470f
-        afr_fd_ctx_t *fd_ctx = NULL;
a3470f
-
a3470f
-        if (!fd) {
a3470f
-                /* If false is returned, it may keep on taking eager-lock
a3470f
-                 * which may lead to starvation, so return true to avoid that.
a3470f
-                 */
a3470f
-                gf_msg_callingfn (this->name, GF_LOG_ERROR, EBADF,
a3470f
-                                  AFR_MSG_INVALID_ARG, "Invalid fd");
a3470f
-                return _gf_true;
a3470f
-        }
a3470f
         /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock
a3470f
          * is taken mount2 opened the same file, it won't be able to
a3470f
          * perform any data operations until mount1 releases eager-lock.
a3470f
@@ -1705,11 +1789,7 @@ afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this)
a3470f
          * if open-fd-count is > 1
a3470f
          */
a3470f
 
a3470f
-        fd_ctx = afr_fd_ctx_get (fd, this);
a3470f
-        if (!fd_ctx)
a3470f
-                return _gf_true;
a3470f
-
a3470f
-        if (fd_ctx->open_fd_count > 1)
a3470f
+        if (local->inode_ctx->open_fd_count > 1)
a3470f
                 return _gf_true;
a3470f
 
a3470f
         return _gf_false;
a3470f
@@ -1717,24 +1797,45 @@ afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this)
a3470f
 
a3470f
 
a3470f
 gf_boolean_t
a3470f
-is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this)
a3470f
+afr_is_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this,
a3470f
+                                         int delay)
a3470f
 {
a3470f
-        afr_local_t      *local = NULL;
a3470f
-        gf_boolean_t      res = _gf_false;
a3470f
+        afr_local_t  *local = NULL;
a3470f
+        afr_lock_t   *lock  = NULL;
a3470f
+        gf_boolean_t res    = _gf_false;
a3470f
 
a3470f
         local = frame->local;
a3470f
-        if (!local)
a3470f
+        lock = &local->inode_ctx->lock[local->transaction.type];
a3470f
+
a3470f
+        if (!afr_txn_nothing_failed (frame, this)) {
a3470f
+                lock->release = _gf_true;
a3470f
                 goto out;
a3470f
+        }
a3470f
 
a3470f
-        if (!local->delayed_post_op)
a3470f
+        if (afr_are_multiple_fds_opened (local, this)) {
a3470f
+                lock->release = _gf_true;
a3470f
                 goto out;
a3470f
+        }
a3470f
 
a3470f
-        //Mark pending changelog ASAP
a3470f
-        if (!afr_txn_nothing_failed (frame, this))
a3470f
+        if (!list_empty (&lock->owners))
a3470f
+                goto out;
a3470f
+        else
a3470f
+                GF_ASSERT (list_empty (&lock->waiting));
a3470f
+
a3470f
+        if (lock->release) {
a3470f
+                goto out;
a3470f
+        }
a3470f
+
a3470f
+        if (!delay) {
a3470f
                 goto out;
a3470f
+        }
a3470f
 
a3470f
-        if (local->fd && afr_are_multiple_fds_opened (local->fd, this))
a3470f
+        if ((local->op != GF_FOP_WRITE) &&
a3470f
+            (local->op != GF_FOP_FXATTROP)) {
a3470f
+                /*Only allow writes but shard does [f]xattrops on writes, so
a3470f
+                 * they are fine too*/
a3470f
                 goto out;
a3470f
+        }
a3470f
 
a3470f
         res = _gf_true;
a3470f
 out:
a3470f
@@ -1745,50 +1846,61 @@ out:
a3470f
 void
a3470f
 afr_delayed_changelog_wake_up_cbk (void *data)
a3470f
 {
a3470f
-        fd_t           *fd = NULL;
a3470f
+        afr_lock_t  *lock  = NULL;
a3470f
+        afr_local_t *local = data;
a3470f
+        afr_local_t *timer_local = NULL;
a3470f
+        struct list_head shared;
a3470f
 
a3470f
-        fd = data;
a3470f
-
a3470f
-        afr_delayed_changelog_wake_up (THIS, fd);
a3470f
+        INIT_LIST_HEAD (&shared);
a3470f
+        lock = &local->inode_ctx->lock[local->transaction.type];
a3470f
+        LOCK (&local->inode->lock);
a3470f
+        {
a3470f
+                timer_local = list_entry(lock->post_op.next,
a3470f
+                                         afr_local_t,
a3470f
+                                        transaction.owner_list);
a3470f
+                if (list_empty (&lock->owners) && (local == timer_local)) {
a3470f
+                        GF_ASSERT (list_empty (&lock->waiting));
a3470f
+                        /*Last owner*/
a3470f
+                        lock->release = _gf_true;
a3470f
+                        lock->delay_timer = NULL;
a3470f
+                }
a3470f
+        }
a3470f
+        UNLOCK (&local->inode->lock);
a3470f
+        afr_changelog_post_op_now (local->transaction.frame,
a3470f
+                                   local->transaction.frame->this);
a3470f
 }
a3470f
 
a3470f
 
a3470f
 /* SET operation */
a3470f
 int
a3470f
-afr_fd_report_unstable_write (xlator_t *this, fd_t *fd)
a3470f
+afr_fd_report_unstable_write (xlator_t *this, afr_local_t *local)
a3470f
 {
a3470f
-        afr_fd_ctx_t *fdctx = NULL;
a3470f
-
a3470f
-        fdctx = afr_fd_ctx_get (fd, this);
a3470f
-
a3470f
-        LOCK(&fd->lock);
a3470f
+        LOCK(&local->inode->lock);
a3470f
         {
a3470f
-                fdctx->witnessed_unstable_write = _gf_true;
a3470f
+                local->inode_ctx->witnessed_unstable_write = _gf_true;
a3470f
         }
a3470f
-        UNLOCK(&fd->lock);
a3470f
+        UNLOCK(&local->inode->lock);
a3470f
 
a3470f
         return 0;
a3470f
 }
a3470f
 
a3470f
 /* TEST and CLEAR operation */
a3470f
 gf_boolean_t
a3470f
-afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd)
a3470f
+afr_fd_has_witnessed_unstable_write (xlator_t *this, inode_t *inode)
a3470f
 {
a3470f
-        afr_fd_ctx_t *fdctx = NULL;
a3470f
+        afr_inode_ctx_t *ctx = NULL;
a3470f
         gf_boolean_t witness = _gf_false;
a3470f
 
a3470f
-        fdctx = afr_fd_ctx_get (fd, this);
a3470f
-        if (!fdctx)
a3470f
-                return _gf_true;
a3470f
-
a3470f
-        LOCK(&fd->lock);
a3470f
+        LOCK(&inode->lock);
a3470f
         {
a3470f
-                if (fdctx->witnessed_unstable_write) {
a3470f
+                (void)__afr_inode_ctx_get (this, inode, &ctx);
a3470f
+
a3470f
+                if (ctx->witnessed_unstable_write) {
a3470f
                         witness = _gf_true;
a3470f
-                        fdctx->witnessed_unstable_write = _gf_false;
a3470f
+                        ctx->witnessed_unstable_write = _gf_false;
a3470f
                 }
a3470f
         }
a3470f
-        UNLOCK (&fd->lock);
a3470f
+        UNLOCK (&inode->lock);
a3470f
 
a3470f
         return witness;
a3470f
 }
a3470f
@@ -1931,7 +2043,7 @@ afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this)
a3470f
            mark a flag in the fdctx whenever an unstable write is witnessed.
a3470f
            */
a3470f
 
a3470f
-        if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) {
a3470f
+        if (!afr_fd_has_witnessed_unstable_write (this, local->inode)) {
a3470f
                 afr_changelog_post_op_now (frame, this);
a3470f
                 return 0;
a3470f
         }
a3470f
@@ -1949,87 +2061,64 @@ afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this)
a3470f
         return 0;
a3470f
 }
a3470f
 
a3470f
-
a3470f
 void
a3470f
-afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd,
a3470f
-                               call_stub_t *stub)
a3470f
+afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
a3470f
 {
a3470f
-	afr_fd_ctx_t      *fd_ctx = NULL;
a3470f
-	call_frame_t      *prev_frame = NULL;
a3470f
-	struct timespec    delta = {0, };
a3470f
-	afr_private_t     *priv = NULL;
a3470f
-	afr_local_t       *local = NULL;
a3470f
+	struct timespec delta   = {0, };
a3470f
+	afr_private_t   *priv   = NULL;
a3470f
+	afr_local_t     *local  = frame->local;
a3470f
+        afr_lock_t      *lock   = NULL;
a3470f
+        gf_boolean_t    post_op = _gf_true;
a3470f
+        struct list_head  shared;
a3470f
 
a3470f
 	priv = this->private;
a3470f
-
a3470f
-	fd_ctx = afr_fd_ctx_get (fd, this);
a3470f
-	if (!fd_ctx)
a3470f
-                goto out;
a3470f
-
a3470f
 	delta.tv_sec = priv->post_op_delay_secs;
a3470f
 	delta.tv_nsec = 0;
a3470f
 
a3470f
-	pthread_mutex_lock (&fd_ctx->delay_lock);
a3470f
-	{
a3470f
-		prev_frame = fd_ctx->delay_frame;
a3470f
-		fd_ctx->delay_frame = NULL;
a3470f
-		if (fd_ctx->delay_timer)
a3470f
-			gf_timer_call_cancel (this->ctx, fd_ctx->delay_timer);
a3470f
-		fd_ctx->delay_timer = NULL;
a3470f
-		if (!frame)
a3470f
-			goto unlock;
a3470f
-		fd_ctx->delay_timer = gf_timer_call_after (this->ctx, delta,
a3470f
-							   afr_delayed_changelog_wake_up_cbk,
a3470f
-							   fd);
a3470f
-		fd_ctx->delay_frame = frame;
a3470f
-	}
a3470f
-unlock:
a3470f
-	pthread_mutex_unlock (&fd_ctx->delay_lock);
a3470f
-
a3470f
-out:
a3470f
-	if (prev_frame) {
a3470f
-		local = prev_frame->local;
a3470f
-		local->transaction.resume_stub = stub;
a3470f
-		afr_changelog_post_op_now (prev_frame, this);
a3470f
-	} else if (stub) {
a3470f
-		call_resume (stub);
a3470f
-	}
a3470f
-}
a3470f
-
a3470f
-
a3470f
-void
a3470f
-afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
a3470f
-{
a3470f
-        afr_local_t  *local = NULL;
a3470f
-
a3470f
-        local = frame->local;
a3470f
-
a3470f
-        if (is_afr_delayed_changelog_post_op_needed (frame, this))
a3470f
-                afr_delayed_changelog_post_op (this, frame, local->fd, NULL);
a3470f
-        else
a3470f
-                afr_changelog_post_op_safe (frame, this);
a3470f
-}
a3470f
-
a3470f
+        INIT_LIST_HEAD (&shared);
a3470f
+        if (!local->transaction.eager_lock_on)
a3470f
+                goto out;
a3470f
 
a3470f
+        lock = &local->inode_ctx->lock[local->transaction.type];
a3470f
+        LOCK (&local->inode->lock);
a3470f
+	{
a3470f
+                list_del_init (&local->transaction.owner_list);
a3470f
+                list_add (&local->transaction.owner_list, &lock->post_op);
a3470f
+                __afr_transaction_wake_shared (local, &shared);
a3470f
+
a3470f
+                if (!afr_is_delayed_changelog_post_op_needed (frame, this,
a3470f
+                                                              delta.tv_sec)) {
a3470f
+                        if (list_empty (&lock->owners))
a3470f
+                                lock->release = _gf_true;
a3470f
+                        goto unlock;
a3470f
+                }
a3470f
 
a3470f
-/* Wake up the sleeping/delayed post-op, and also register
a3470f
-   a stub to have it resumed after this transaction
a3470f
-   completely finishes.
a3470f
+                GF_ASSERT (lock->delay_timer == NULL);
a3470f
+		lock->delay_timer = gf_timer_call_after (this->ctx, delta,
a3470f
+                                              afr_delayed_changelog_wake_up_cbk,
a3470f
+                                              local);
a3470f
+                if (!lock->delay_timer) {
a3470f
+                        lock->release = _gf_true;
a3470f
+                } else {
a3470f
+                        post_op = _gf_false;
a3470f
+                }
a3470f
 
a3470f
-   The @stub gets saved in @local and gets resumed in
a3470f
-   afr_local_cleanup()
a3470f
-   */
a3470f
-void
a3470f
-afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub)
a3470f
-{
a3470f
-        afr_delayed_changelog_post_op (this, NULL, fd, stub);
a3470f
-}
a3470f
+	}
a3470f
+unlock:
a3470f
+        UNLOCK (&local->inode->lock);
a3470f
 
a3470f
+        if (!list_empty (&shared)) {
a3470f
+                afr_lock_resume_shared (&shared);
a3470f
+        }
a3470f
 
a3470f
-void
a3470f
-afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd)
a3470f
-{
a3470f
-        afr_delayed_changelog_post_op (this, NULL, fd, NULL);
a3470f
+out:
a3470f
+        if (post_op) {
a3470f
+                if (!local->transaction.eager_lock_on || lock->release) {
a3470f
+                        afr_changelog_post_op_safe (frame, this);
a3470f
+                } else {
a3470f
+                        afr_changelog_post_op_now (frame, this);
a3470f
+                }
a3470f
+        }
a3470f
 }
a3470f
 
a3470f
 int
a3470f
@@ -2039,13 +2128,6 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this)
a3470f
 
a3470f
         local    = frame->local;
a3470f
 
a3470f
-        if (local->transaction.eager_lock_on) {
a3470f
-                /* We don't need to retain "local" in the
a3470f
-                   fd list anymore, writes to all subvols
a3470f
-                   are finished by now */
a3470f
-                afr_remove_eager_lock_stub (local);
a3470f
-        }
a3470f
-
a3470f
         afr_restore_lk_owner (frame);
a3470f
 
a3470f
         afr_handle_symmetric_errors (frame, this);
a3470f
@@ -2076,114 +2158,149 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
a3470f
 	local->transaction.failed_subvols[child_index] = 1;
a3470f
 }
a3470f
 
a3470f
-
a3470f
-
a3470f
 static gf_boolean_t
a3470f
-afr_locals_overlap (afr_local_t *local1, afr_local_t *local2)
a3470f
+__need_previous_lock_unlocked (afr_local_t *local)
a3470f
 {
a3470f
-        uint64_t start1 = local1->transaction.start;
a3470f
-        uint64_t start2 = local2->transaction.start;
a3470f
-        uint64_t end1 = 0;
a3470f
-        uint64_t end2 = 0;
a3470f
-
a3470f
-        if (local1->transaction.len)
a3470f
-                end1 = start1 + local1->transaction.len - 1;
a3470f
-        else
a3470f
-                end1 = ULLONG_MAX;
a3470f
+        afr_lock_t      *lock = NULL;
a3470f
 
a3470f
-        if (local2->transaction.len)
a3470f
-                end2 = start2 + local2->transaction.len - 1;
a3470f
-        else
a3470f
-                end2 = ULLONG_MAX;
a3470f
+        if (!local->transaction.eager_lock_on)
a3470f
+                return _gf_true;
a3470f
 
a3470f
-        return ((end1 >= start2) && (end2 >= start1));
a3470f
+        lock = &local->inode_ctx->lock[local->transaction.type];
a3470f
+        if (!lock->acquired)
a3470f
+                return _gf_false;
a3470f
+        if (lock->acquired && lock->event_generation != local->event_generation)
a3470f
+                return _gf_true;
a3470f
+        return _gf_false;
a3470f
 }
a3470f
 
a3470f
 void
a3470f
-afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this)
a3470f
+__afr_eager_lock_handle (afr_local_t *local, gf_boolean_t *take_lock,
a3470f
+                         gf_boolean_t *do_pre_op, afr_local_t **timer_local)
a3470f
 {
a3470f
-        afr_private_t *priv = NULL;
a3470f
-        afr_fd_ctx_t  *fdctx = NULL;
a3470f
-        afr_local_t   *each = NULL;
a3470f
+        afr_lock_t      *lock = NULL;
a3470f
+        afr_local_t     *owner_local = NULL;
a3470f
+        xlator_t        *this = local->transaction.frame->this;
a3470f
 
a3470f
-        priv = this->private;
a3470f
-
a3470f
-        if (!local->fd)
a3470f
-                return;
a3470f
-
a3470f
-        if (local->transaction.type != AFR_DATA_TRANSACTION)
a3470f
-                return;
a3470f
+        if (local->fd && !afr_are_multiple_fds_opened (local, this)) {
a3470f
+                local->transaction.eager_lock_on = _gf_true;
a3470f
+        }
a3470f
 
a3470f
-        if (!priv->eager_lock)
a3470f
-                return;
a3470f
+        lock = &local->inode_ctx->lock[local->transaction.type];
a3470f
+        if (__need_previous_lock_unlocked (local)) {
a3470f
+                if (!list_empty (&lock->owners)) {
a3470f
+                        lock->release = _gf_true;
a3470f
+                } else if (lock->delay_timer) {
a3470f
+                        lock->release = _gf_true;
a3470f
+                        if (gf_timer_call_cancel (this->ctx,
a3470f
+                                                  lock->delay_timer)) {
a3470f
+                                /* It will be put in frozen list
a3470f
+                                 * in the code flow below*/
a3470f
+                        } else {
a3470f
+                                *timer_local = list_entry(lock->post_op.next,
a3470f
+                                                          afr_local_t,
a3470f
+                                                        transaction.owner_list);
a3470f
+                                lock->delay_timer = NULL;
a3470f
+                        }
a3470f
+                }
a3470f
+                if (!local->transaction.eager_lock_on)
a3470f
+                        goto out;
a3470f
+        }
a3470f
 
a3470f
-        fdctx = afr_fd_ctx_get (local->fd, this);
a3470f
-        if (!fdctx)
a3470f
-                return;
a3470f
+        if (lock->release) {
a3470f
+                list_add_tail (&local->transaction.wait_list,
a3470f
+                               &lock->frozen);
a3470f
+                *take_lock = _gf_false;
a3470f
+                goto out;
a3470f
+        }
a3470f
 
a3470f
-        if (afr_are_multiple_fds_opened (local->fd, this))
a3470f
-                return;
a3470f
-        /*
a3470f
-         * Once full file lock is acquired in eager-lock phase, overlapping
a3470f
-         * writes do not compete for inode-locks, instead are transferred to the
a3470f
-         * next writes. Because of this overlapping writes are not ordered.
a3470f
-         * This can cause inconsistencies in replication.
a3470f
-         * Example:
a3470f
-         * Two overlapping writes w1, w2 are sent in parallel on same fd
a3470f
-         * in two threads t1, t2.
a3470f
-         * Both threads can execute afr_writev_wind in the following manner.
a3470f
-         * t1 winds w1 on brick-0
a3470f
-         * t2 winds w2 on brick-0
a3470f
-         * t2 winds w2 on brick-1
a3470f
-         * t1 winds w1 on brick-1
a3470f
-         *
a3470f
-         * This check makes sure the locks are not transferred for
a3470f
-         * overlapping writes.
a3470f
-         */
a3470f
-        LOCK (&local->fd->lock);
a3470f
-        {
a3470f
-                list_for_each_entry (each, &fdctx->eager_locked,
a3470f
-                                     transaction.eager_locked) {
a3470f
-                        if (afr_locals_overlap (each, local)) {
a3470f
-                                local->transaction.eager_lock_on = _gf_false;
a3470f
-                                goto unlock;
a3470f
-                        }
a3470f
+        if (lock->delay_timer) {
a3470f
+                *take_lock = _gf_false;
a3470f
+                if (gf_timer_call_cancel (this->ctx,
a3470f
+                                          lock->delay_timer)) {
a3470f
+                        list_add_tail (&local->transaction.wait_list,
a3470f
+                                       &lock->frozen);
a3470f
+                } else {
a3470f
+                        *timer_local = list_entry(lock->post_op.next,
a3470f
+                                                  afr_local_t,
a3470f
+                                                  transaction.owner_list);
a3470f
+                        afr_copy_inodelk_vars (&local->internal_lock,
a3470f
+                                               &(*timer_local)->internal_lock,
a3470f
+                                               this);
a3470f
+                        lock->delay_timer = NULL;
a3470f
+                        *do_pre_op = _gf_true;
a3470f
+                        list_add_tail (&local->transaction.owner_list,
a3470f
+                                       &lock->owners);
a3470f
                 }
a3470f
+                goto out;
a3470f
+        }
a3470f
 
a3470f
-                local->transaction.eager_lock_on = _gf_true;
a3470f
-                list_add_tail (&local->transaction.eager_locked,
a3470f
-                               &fdctx->eager_locked);
a3470f
+        if (!list_empty (&lock->owners)) {
a3470f
+                if (!lock->acquired ||
a3470f
+                    afr_has_lock_conflict (local, _gf_true)) {
a3470f
+                        list_add_tail (&local->transaction.wait_list,
a3470f
+                                       &lock->waiting);
a3470f
+                        *take_lock = _gf_false;
a3470f
+                        goto out;
a3470f
+                }
a3470f
+                owner_local = list_entry (lock->owners.next,
a3470f
+                                          afr_local_t,
a3470f
+                                          transaction.owner_list);
a3470f
+                afr_copy_inodelk_vars (&local->internal_lock,
a3470f
+                                       &owner_local->internal_lock,
a3470f
+                                       this);
a3470f
+                *take_lock = _gf_false;
a3470f
+                *do_pre_op = _gf_true;
a3470f
         }
a3470f
-unlock:
a3470f
-        UNLOCK (&local->fd->lock);
a3470f
+
a3470f
+        if (lock->acquired)
a3470f
+                GF_ASSERT (!(*take_lock));
a3470f
+        list_add_tail (&local->transaction.owner_list, &lock->owners);
a3470f
+out:
a3470f
+        return;
a3470f
 }
a3470f
 
a3470f
 void
a3470f
-afr_transaction_start (call_frame_t *frame, xlator_t *this)
a3470f
+afr_transaction_start (afr_local_t *local, xlator_t *this)
a3470f
 {
a3470f
-        afr_local_t   *local = frame->local;
a3470f
-        fd_t          *fd    = NULL;
a3470f
+        afr_private_t   *priv = NULL;
a3470f
+        gf_boolean_t    take_lock  = _gf_true;
a3470f
+        gf_boolean_t    do_pre_op  = _gf_false;
a3470f
+        afr_local_t     *timer_local = NULL;
a3470f
 
a3470f
-        afr_transaction_eager_lock_init (local, this);
a3470f
+        priv = this->private;
a3470f
 
a3470f
-        if (local->fd && local->transaction.eager_lock_on)
a3470f
-                afr_set_lk_owner (frame, this, local->fd);
a3470f
-        else
a3470f
-                afr_set_lk_owner (frame, this, frame->root);
a3470f
+        if (local->transaction.type != AFR_DATA_TRANSACTION &&
a3470f
+            local->transaction.type != AFR_METADATA_TRANSACTION)
a3470f
+                goto lock_phase;
a3470f
 
a3470f
-        if (!local->transaction.eager_lock_on && local->loc.inode) {
a3470f
-                fd = fd_lookup (local->loc.inode, frame->root->pid);
a3470f
-                if (fd == NULL)
a3470f
-                        fd = fd_lookup_anonymous (local->loc.inode,
a3470f
-                                                  GF_ANON_FD_FLAGS);
a3470f
+        if (!priv->eager_lock)
a3470f
+                goto lock_phase;
a3470f
 
a3470f
-                if (fd) {
a3470f
-                        afr_delayed_changelog_wake_up (this, fd);
a3470f
-                        fd_unref (fd);
a3470f
-                }
a3470f
+        LOCK (&local->inode->lock);
a3470f
+        {
a3470f
+                __afr_eager_lock_handle (local, &take_lock, &do_pre_op,
a3470f
+                                         &timer_local);
a3470f
         }
a3470f
+        UNLOCK (&local->inode->lock);
a3470f
+lock_phase:
a3470f
+        if (!local->transaction.eager_lock_on) {
a3470f
+                afr_set_lk_owner (local->transaction.frame, this,
a3470f
+                                  local->transaction.frame->root);
a3470f
+        } else {
a3470f
+                afr_set_lk_owner (local->transaction.frame, this, local->inode);
a3470f
+        }
a3470f
+
a3470f
 
a3470f
-        afr_lock (frame, this);
a3470f
+        if (take_lock) {
a3470f
+                afr_lock (local->transaction.frame, this);
a3470f
+        } else if (do_pre_op) {
a3470f
+                afr_changelog_pre_op (local->transaction.frame, this);
a3470f
+        }
a3470f
+        /*Always call delayed_changelog_wake_up_cbk after calling pre-op above
a3470f
+         * so that any inheriting can happen*/
a3470f
+        if (timer_local)
a3470f
+                afr_delayed_changelog_wake_up_cbk (timer_local);
a3470f
 }
a3470f
 
a3470f
 int
a3470f
@@ -2196,7 +2313,7 @@ afr_write_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
a3470f
                 goto fail;
a3470f
         }
a3470f
 
a3470f
-        afr_transaction_start (frame, this);
a3470f
+        afr_transaction_start (local, this);
a3470f
         return 0;
a3470f
 fail:
a3470f
         local->transaction.unwind (frame, this);
a3470f
@@ -2214,6 +2331,7 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
a3470f
 
a3470f
         local = frame->local;
a3470f
         priv  = this->private;
a3470f
+        local->transaction.frame = frame;
a3470f
 
a3470f
         local->transaction.type   = type;
a3470f
 
a3470f
@@ -2226,11 +2344,9 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
a3470f
         if (ret < 0)
a3470f
                 goto out;
a3470f
 
a3470f
-        if (type == AFR_ENTRY_TRANSACTION ||
a3470f
-            type == AFR_ENTRY_RENAME_TRANSACTION) {
a3470f
-                afr_transaction_start (frame, this);
a3470f
-                ret = 0;
a3470f
-                goto out;
a3470f
+
a3470f
+        if (type != AFR_METADATA_TRANSACTION) {
a3470f
+                goto txn_start;
a3470f
         }
a3470f
 
a3470f
         ret = afr_inode_get_readable (frame, local->inode, this,
a3470f
@@ -2240,10 +2356,13 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
a3470f
                                                   event_generation)) {
a3470f
                 afr_inode_refresh (frame, this, local->inode, local->loc.gfid,
a3470f
                                    afr_write_txn_refresh_done);
a3470f
-        } else {
a3470f
-                afr_transaction_start (frame, this);
a3470f
+                ret = 0;
a3470f
+                goto out;
a3470f
         }
a3470f
+
a3470f
+txn_start:
a3470f
         ret = 0;
a3470f
+        afr_transaction_start (local, this);
a3470f
 out:
a3470f
         return ret;
a3470f
 }
a3470f
diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h
a3470f
index ddcb1eb..a27e9a3 100644
a3470f
--- a/xlators/cluster/afr/src/afr-transaction.h
a3470f
+++ b/xlators/cluster/afr/src/afr-transaction.h
a3470f
@@ -17,12 +17,6 @@ void
a3470f
 afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
a3470f
 			    int child_index);
a3470f
 
a3470f
-int
a3470f
-afr_lock_server_count (afr_private_t *priv, afr_transaction_type type);
a3470f
-
a3470f
-afr_inodelk_t*
a3470f
-afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom);
a3470f
-
a3470f
 int32_t
a3470f
 afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type);
a3470f
 
a3470f
@@ -30,9 +24,6 @@ int
a3470f
 afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending);
a3470f
 
a3470f
 void
a3470f
-afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this);
a3470f
-
a3470f
-void
a3470f
 afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd);
a3470f
 
a3470f
 void
a3470f
@@ -57,4 +48,8 @@ afr_pick_error_xdata (afr_local_t *local, afr_private_t *priv,
a3470f
                       inode_t *inode2, unsigned char *readable2);
a3470f
 int
a3470f
 afr_transaction_resume (call_frame_t *frame, xlator_t *this);
a3470f
+int
a3470f
+afr_lock (call_frame_t *frame, xlator_t *this);
a3470f
+void
a3470f
+afr_delayed_changelog_wake_up_cbk (void *data);
a3470f
 #endif /* __TRANSACTION_H__ */
a3470f
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
a3470f
index 5ff57c0..6be59dc 100644
a3470f
--- a/xlators/cluster/afr/src/afr.h
a3470f
+++ b/xlators/cluster/afr/src/afr.h
a3470f
@@ -230,19 +230,12 @@ int
a3470f
 afr_entry_lockee_cmp (const void *l1, const void *l2);
a3470f
 
a3470f
 typedef struct {
a3470f
-        char    *domain; /* Domain on which inodelk is taken */
a3470f
-        struct gf_flock flock;
a3470f
-        unsigned char *locked_nodes;
a3470f
-        int32_t lock_count;
a3470f
-} afr_inodelk_t;
a3470f
-
a3470f
-typedef struct {
a3470f
         loc_t *lk_loc;
a3470f
 
a3470f
         int                     lockee_count;
a3470f
         afr_entry_lockee_t      lockee[AFR_LOCKEE_COUNT_MAX];
a3470f
 
a3470f
-        afr_inodelk_t       inodelk[AFR_DOM_COUNT_MAX];
a3470f
+        struct gf_flock flock;
a3470f
         const char *lk_basename;
a3470f
         const char *lower_basename;
a3470f
         const char *higher_basename;
a3470f
@@ -255,7 +248,6 @@ typedef struct {
a3470f
         int32_t lock_count;
a3470f
         int32_t entrylk_lock_count;
a3470f
 
a3470f
-        uint64_t lock_number;
a3470f
         int32_t lk_call_count;
a3470f
         int32_t lk_expected_count;
a3470f
         int32_t lk_attempted_count;
a3470f
@@ -292,37 +284,9 @@ typedef enum {
a3470f
 } afr_fd_open_status_t;
a3470f
 
a3470f
 typedef struct {
a3470f
-        unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS];
a3470f
-	int inherited[AFR_NUM_CHANGE_LOGS];
a3470f
-	int on_disk[AFR_NUM_CHANGE_LOGS];
a3470f
         afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */
a3470f
-
a3470f
-        unsigned int *lock_piggyback;
a3470f
-        unsigned int *lock_acquired;
a3470f
-
a3470f
         int flags;
a3470f
 
a3470f
-	/* used for delayed-post-op optimization */
a3470f
-	pthread_mutex_t    delay_lock;
a3470f
-	gf_timer_t        *delay_timer;
a3470f
-	call_frame_t      *delay_frame;
a3470f
-
a3470f
-	/* set if any write on this fd was a non stable write
a3470f
-	   (i.e, without O_SYNC or O_DSYNC)
a3470f
-	*/
a3470f
-	gf_boolean_t      witnessed_unstable_write;
a3470f
-
a3470f
-	/* @open_fd_count:
a3470f
-	   Number of open FDs queried from the server, as queried through
a3470f
-	   xdata in FOPs. Currently, used to decide if eager-locking must be
a3470f
-	   temporarily disabled.
a3470f
-	*/
a3470f
-        uint32_t        open_fd_count;
a3470f
-
a3470f
-
a3470f
-	/* list of frames currently in progress */
a3470f
-	struct list_head  eager_locked;
a3470f
-
a3470f
 	/* the subvolume on which the latest sequence of readdirs (starting
a3470f
 	   at offset 0) has begun. Till the next readdir request with 0 offset
a3470f
 	   arrives, we continue to read off this subvol.
a3470f
@@ -336,6 +300,20 @@ typedef enum {
a3470f
         AFR_FOP_LOCK_QUORUM_FAILED,
a3470f
 } afr_fop_lock_state_t;
a3470f
 
a3470f
+typedef struct _afr_inode_lock_t {
a3470f
+        unsigned int event_generation;
a3470f
+        gf_boolean_t    release;
a3470f
+        gf_boolean_t    acquired;
a3470f
+        gf_timer_t        *delay_timer;
a3470f
+        struct list_head  owners; /*Transactions that are performing fop*/
a3470f
+        struct list_head  post_op;/*Transactions that are done with the fop
a3470f
+                                   *So can not conflict with the fops*/
a3470f
+        struct list_head waiting;/*Transaction that are waiting for
a3470f
+                                   *conflicting transactions to complete*/
a3470f
+        struct list_head frozen;/*Transactions that need to go as part of
a3470f
+                                 * next batch of eager-lock*/
a3470f
+} afr_lock_t;
a3470f
+
a3470f
 typedef struct _afr_inode_ctx {
a3470f
         uint64_t        read_subvol;
a3470f
         uint64_t        write_subvol;
a3470f
@@ -343,6 +321,23 @@ typedef struct _afr_inode_ctx {
a3470f
         int             spb_choice;
a3470f
         gf_timer_t      *timer;
a3470f
         gf_boolean_t    need_refresh;
a3470f
+        unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS];
a3470f
+        int inherited[AFR_NUM_CHANGE_LOGS];
a3470f
+        int on_disk[AFR_NUM_CHANGE_LOGS];
a3470f
+
a3470f
+        /* set if any write on this fd was a non stable write
a3470f
+           (i.e, without O_SYNC or O_DSYNC)
a3470f
+        */
a3470f
+        gf_boolean_t      witnessed_unstable_write;
a3470f
+
a3470f
+        /* @open_fd_count:
a3470f
+           Number of open FDs queried from the server, as queried through
a3470f
+           xdata in FOPs. Currently, used to decide if eager-locking must be
a3470f
+           temporarily disabled.
a3470f
+        */
a3470f
+        uint32_t        open_fd_count;
a3470f
+        /*Only 2 types of transactions support eager-locks now. DATA/METADATA*/
a3470f
+        afr_lock_t lock[2];
a3470f
 } afr_inode_ctx_t;
a3470f
 
a3470f
 
a3470f
@@ -457,7 +452,6 @@ typedef struct _afr_local {
a3470f
         dict_t  *dict;
a3470f
 
a3470f
         int      optimistic_change_log;
a3470f
-	gf_boolean_t      delayed_post_op;
a3470f
 
a3470f
 	/* Is the current writev() going to perform a stable write?
a3470f
 	   i.e, is fd->flags or @flags writev param have O_SYNC or
a3470f
@@ -693,7 +687,7 @@ typedef struct _afr_local {
a3470f
                 off_t start, len;
a3470f
 
a3470f
                 gf_boolean_t    eager_lock_on;
a3470f
-                int *eager_lock;
a3470f
+                gf_boolean_t    do_eager_unlock;
a3470f
 
a3470f
                 char *basename;
a3470f
                 char *new_basename;
a3470f
@@ -707,7 +701,8 @@ typedef struct _afr_local {
a3470f
 		   of the transaction frame */
a3470f
 		call_stub_t      *resume_stub;
a3470f
 
a3470f
-		struct list_head  eager_locked;
a3470f
+		struct list_head  owner_list;
a3470f
+                struct list_head  wait_list;
a3470f
 
a3470f
                 unsigned char   *pre_op;
a3470f
 
a3470f
@@ -768,7 +763,8 @@ typedef struct _afr_local {
a3470f
 		*/
a3470f
 		afr_changelog_resume_t changelog_resume;
a3470f
 
a3470f
-                call_frame_t *main_frame;
a3470f
+                call_frame_t *main_frame; /*Fop frame*/
a3470f
+                call_frame_t *frame; /*Transaction frame*/
a3470f
 
a3470f
                 int (*wind) (call_frame_t *frame, xlator_t *this, int subvol);
a3470f
 
a3470f
@@ -1009,7 +1005,7 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd);
a3470f
 		afr_local_cleanup (frame->local, THIS);		       \
a3470f
 		mem_put (frame->local);				       \
a3470f
 		frame->local = NULL; };				       \
a3470f
-	frame->local;})
a3470f
+	frame->local; })
a3470f
 
a3470f
 #define AFR_STACK_RESET(frame)                                         \
a3470f
         do {                                                           \
a3470f
@@ -1096,22 +1092,10 @@ afr_filter_xattrs (dict_t *xattr);
a3470f
 #define AFR_QUORUM_AUTO INT_MAX
a3470f
 
a3470f
 int
a3470f
-afr_fd_report_unstable_write (xlator_t *this, fd_t *fd);
a3470f
+afr_fd_report_unstable_write (xlator_t *this, afr_local_t *local);
a3470f
 
a3470f
 gf_boolean_t
a3470f
-afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd);
a3470f
-
a3470f
-void
a3470f
-afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub);
a3470f
-
a3470f
-int
a3470f
-afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count);
a3470f
-
a3470f
-void
a3470f
-afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this);
a3470f
-
a3470f
-void
a3470f
-afr_remove_eager_lock_stub (afr_local_t *local);
a3470f
+afr_fd_has_witnessed_unstable_write (xlator_t *this, inode_t *inode);
a3470f
 
a3470f
 void
a3470f
 afr_reply_wipe (struct afr_reply *reply);
a3470f
-- 
a3470f
1.8.3.1
a3470f