d1681e
From 30fb0e640ae94d9591e9bb64800b0971e52d5416 Mon Sep 17 00:00:00 2001
7c2869
From: Pranith Kumar K <pkarampu@redhat.com>
7c2869
Date: Wed, 31 Jan 2018 16:41:14 +0530
d1681e
Subject: [PATCH 194/201] cluster/afr: Make AFR eager-locking similar to EC
7c2869
7c2869
Problem:
7c2869
1) Afr's eager-lock only works for data transactions.
7c2869
2) When there are conflicting writes, write with conflicting region initiates
7c2869
unlock of eager-lock leading to extra pre-ops and post-ops on the file. When
7c2869
eager-lock goes off, it leads to extra fsyncs for random-write workload in afr.
7c2869
7c2869
Solution (that is modeled after EC):
7c2869
In EC, when there is a conflicting write, it waits for the current write to
7c2869
complete before it winds the conflicted write. This leads to better utilization
7c2869
of network and disk, because we will not be doing extra xattrops and FSYNCs and
7c2869
inodelk/unlock. Moved fd based counters to inode based counters.
7c2869
7c2869
I tried to model the solution based on EC's locking, but it is not similar to
7c2869
AFR because we had to keep backward compatibility.
7c2869
7c2869
Lifecycle of lock:
7c2869
==================
7c2869
First transaction is added to inode->owners list and an inodelk will be sent on
7c2869
the wire. All the next transactions will be put in inode->waiters list until
7c2869
the first transaction completes inodelk and [f]xattrop completely.  Once
7c2869
[f]xattrop also completes, all the requests in the inode->waiters list are
7c2869
checked if they conflict with any of the existing locks which are in
7c2869
inode->owners list and if not are added to inode->owners list and resumed with
7c2869
doing transaction. When these transactions complete fop phase they will be
7c2869
moved to inode->post_op list and resume the transactions that were paused
7c2869
because of conflicts. Post-op and unlock will not be issued on the wire until
7c2869
that is the last transaction on that inode. Last transaction when it has to
7c2869
perform post-op can choose to sleep for post-op-delay-secs value. During that
7c2869
time if any other transaction comes, it will wake up the sleeping transaction
7c2869
and takes over the ownership of the lock and the cycle continues. If the
7c2869
post-op-delay-secs expires, then the timer thread will wake up the sleeping
7c2869
transaction and it will set lock->release to true and starts doing post-op and
7c2869
then unlock. During this time if any other transactions come, they will be put
7c2869
in inode->frozen list. Once the previous unlock comes it will move the frozen
7c2869
list to waiters list and moves the first element from this waiters-list to
7c2869
owners-list and attempts the lock and the cycle continues. This is the general
7c2869
idea.  There is logic at the time of delaying and at the time of new
7c2869
transaction or in flush fop to wakeup existing sleeping transactions or
7c2869
choosing whether to delay a transaction etc, which is subject to change based
7c2869
on future enhancements etc.
7c2869
7c2869
 >Fixes: #418
7c2869
 >BUG: 1549606
7c2869
7c2869
Upstream-patch: https://review.gluster.org/19503
d1681e
BUG: 1491785
d1681e
Change-Id: I88b570bbcf332a27c82d2767dfa82472f60055dc
7c2869
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
d1681e
Reviewed-on: https://code.engineering.redhat.com/gerrit/131945
7c2869
Tested-by: RHGS Build Bot <nigelb@redhat.com>
7c2869
---
7c2869
 tests/bugs/replicate/bug-966018.t              |  36 -
d1681e
 xlators/cluster/afr/src/afr-common.c           | 315 ++++-----
d1681e
 xlators/cluster/afr/src/afr-inode-write.c      |   6 +-
d1681e
 xlators/cluster/afr/src/afr-lk-common.c        | 348 +++-------
7c2869
 xlators/cluster/afr/src/afr-self-heal-common.c |  13 +-
7c2869
 xlators/cluster/afr/src/afr-self-heal-data.c   |  14 +-
7c2869
 xlators/cluster/afr/src/afr-self-heal.h        |   2 +-
d1681e
 xlators/cluster/afr/src/afr-transaction.c      | 913 ++++++++++++++-----------
7c2869
 xlators/cluster/afr/src/afr-transaction.h      |  13 +-
d1681e
 xlators/cluster/afr/src/afr.h                  |  96 ++-
d1681e
 10 files changed, 813 insertions(+), 943 deletions(-)
7c2869
 delete mode 100644 tests/bugs/replicate/bug-966018.t
7c2869
7c2869
diff --git a/tests/bugs/replicate/bug-966018.t b/tests/bugs/replicate/bug-966018.t
7c2869
deleted file mode 100644
7c2869
index 1b5296b..0000000
7c2869
--- a/tests/bugs/replicate/bug-966018.t
7c2869
+++ /dev/null
7c2869
@@ -1,36 +0,0 @@
7c2869
-#!/bin/bash
7c2869
-
7c2869
-. $(dirname $0)/../../include.rc
7c2869
-. $(dirname $0)/../../volume.rc
7c2869
-. $(dirname $0)/../../nfs.rc
7c2869
-
7c2869
-#This tests if cluster.eager-lock blocks metadata operations on nfs/fuse mounts.
7c2869
-#If it is not woken up, INODELK from the next command waits
7c2869
-#for post-op-delay secs.
7c2869
-
7c2869
-cleanup;
7c2869
-TEST glusterd
7c2869
-TEST pidof glusterd
7c2869
-
7c2869
-TEST $CLI volume create $V0 replica 2 $H0:$B0/r2_0 $H0:$B0/r2_1
7c2869
-TEST $CLI volume set $V0 ensure-durability off
7c2869
-TEST $CLI volume set $V0 cluster.eager-lock on
7c2869
-TEST $CLI volume set $V0 cluster.post-op-delay-secs 3
7c2869
-TEST $CLI volume set $V0 nfs.disable false
7c2869
-
7c2869
-TEST $CLI volume start $V0
7c2869
-TEST $CLI volume profile $V0 start
7c2869
-EXPECT_WITHIN $NFS_EXPORT_TIMEOUT "1" is_nfs_export_available;
7c2869
-TEST mount_nfs $H0:/$V0 $N0 nolock;
7c2869
-TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id=$V0 $M0
7c2869
-echo 1 > $N0/1 && chmod +x $N0/1
7c2869
-echo 1 > $M0/1 && chmod +x $M0/1
7c2869
-
7c2869
-#Check that INODELK MAX latency is not in the order of seconds
7c2869
-#Test if the MAX INODELK fop latency is of the order of seconds.
7c2869
-inodelk_max_latency=$($CLI volume profile $V0 info | grep INODELK | awk 'BEGIN {max = 0} {if ($6 > max) max=$6;} END {print max}' | cut -d. -f 1 | egrep "[0-9]{7,}")
7c2869
-
7c2869
-TEST [ -z $inodelk_max_latency ]
7c2869
-EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $N0
7c2869
-
7c2869
-cleanup;
7c2869
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
d1681e
index 06863b6..6025a60 100644
7c2869
--- a/xlators/cluster/afr/src/afr-common.c
7c2869
+++ b/xlators/cluster/afr/src/afr-common.c
d1681e
@@ -126,37 +126,77 @@ afr_is_possibly_under_txn (afr_transaction_type type, afr_local_t *local,
7c2869
         return _gf_false;
7c2869
 }
7c2869
 
7c2869
+static void
7c2869
+afr_inode_ctx_destroy (afr_inode_ctx_t *ctx)
7c2869
+{
7c2869
+        int i = 0;
7c2869
+
7c2869
+        if (!ctx)
7c2869
+                return;
7c2869
+
7c2869
+        for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
7c2869
+                GF_FREE (ctx->pre_op_done[i]);
7c2869
+        }
7c2869
+
7c2869
+        GF_FREE (ctx);
7c2869
+}
7c2869
+
7c2869
 int
7c2869
 __afr_inode_ctx_get (xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx)
7c2869
 {
7c2869
-        uint64_t                ctx_int = 0;
7c2869
-        int                     ret     = -1;
7c2869
-        afr_inode_ctx_t        *tmp_ctx = NULL;
7c2869
+        uint64_t        ctx_int   = 0;
7c2869
+        int             ret       = -1;
7c2869
+        int             i         = -1;
7c2869
+        int             num_locks = -1;
7c2869
+        afr_inode_ctx_t *ictx     = NULL;
7c2869
+        afr_lock_t      *lock     = NULL;
7c2869
+        afr_private_t   *priv     = this->private;
7c2869
 
7c2869
         ret = __inode_ctx_get (inode, this, &ctx_int);
7c2869
-        if (ret) {
7c2869
-                tmp_ctx = GF_CALLOC (1, sizeof (afr_inode_ctx_t),
7c2869
-                                     gf_afr_mt_inode_ctx_t);
7c2869
-                if (!tmp_ctx)
7c2869
-                        goto out;
7c2869
+        if (ret == 0) {
7c2869
+                *ctx = (afr_inode_ctx_t *)ctx_int;
7c2869
+                return 0;
7c2869
+        }
7c2869
 
7c2869
-                ctx_int = (long) tmp_ctx;
7c2869
-                ret = __inode_ctx_set (inode, this, &ctx_int);
7c2869
-                if (ret) {
7c2869
-                        GF_FREE (tmp_ctx);
7c2869
+        ictx = GF_CALLOC (1, sizeof (afr_inode_ctx_t), gf_afr_mt_inode_ctx_t);
7c2869
+        if (!ictx)
7c2869
+                goto out;
7c2869
+
7c2869
+        for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
7c2869
+                ictx->pre_op_done[i] = GF_CALLOC (sizeof *ictx->pre_op_done[i],
7c2869
+                                                  priv->child_count,
7c2869
+                                                  gf_afr_mt_int32_t);
7c2869
+                if (!ictx->pre_op_done[i]) {
7c2869
+                        ret = -ENOMEM;
7c2869
                         goto out;
7c2869
                 }
7c2869
-                tmp_ctx->spb_choice = -1;
7c2869
-                tmp_ctx->read_subvol = 0;
d1681e
-                tmp_ctx->write_subvol = 0;
d1681e
-                tmp_ctx->lock_count = 0;
7c2869
-        } else {
7c2869
-                tmp_ctx = (afr_inode_ctx_t *) ctx_int;
7c2869
         }
7c2869
 
7c2869
-        *ctx = tmp_ctx;
7c2869
+        num_locks = sizeof(ictx->lock)/sizeof(afr_lock_t);
7c2869
+        for (i = 0; i < num_locks; i++) {
7c2869
+                lock = &ictx->lock[i];
7c2869
+                INIT_LIST_HEAD (&lock->post_op);
7c2869
+                INIT_LIST_HEAD (&lock->frozen);
7c2869
+                INIT_LIST_HEAD (&lock->waiting);
7c2869
+                INIT_LIST_HEAD (&lock->owners);
7c2869
+        }
7c2869
+
7c2869
+        ctx_int = (uint64_t)ictx;
7c2869
+        ret = __inode_ctx_set (inode, this, &ctx_int);
7c2869
+        if (ret) {
7c2869
+                goto out;
7c2869
+        }
7c2869
+
7c2869
+        ictx->spb_choice = -1;
7c2869
+        ictx->read_subvol = 0;
d1681e
+        ictx->write_subvol = 0;
d1681e
+        ictx->lock_count = 0;
7c2869
         ret = 0;
7c2869
+        *ctx = ictx;
7c2869
 out:
7c2869
+        if (ret) {
7c2869
+                afr_inode_ctx_destroy (ictx);
7c2869
+        }
7c2869
         return ret;
7c2869
 }
7c2869
 
d1681e
@@ -1745,10 +1785,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
7c2869
 
7c2869
         GF_FREE (local->internal_lock.locked_nodes);
7c2869
 
7c2869
-        for (i = 0; local->internal_lock.inodelk[i].domain; i++) {
7c2869
-                GF_FREE (local->internal_lock.inodelk[i].locked_nodes);
7c2869
-        }
7c2869
-
7c2869
         GF_FREE (local->internal_lock.lower_locked_nodes);
7c2869
 
7c2869
         afr_entry_lockee_cleanup (&local->internal_lock);
d1681e
@@ -1765,7 +1801,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
d1681e
                 GF_FREE (local->transaction.changelog_xdata);
7c2869
         }
7c2869
 
7c2869
-        GF_FREE (local->transaction.eager_lock);
7c2869
         GF_FREE (local->transaction.failed_subvols);
7c2869
 
7c2869
         GF_FREE (local->transaction.basename);
d1681e
@@ -1812,16 +1847,6 @@ afr_local_replies_wipe (afr_local_t *local, afr_private_t *priv)
7c2869
 	memset (local->replies, 0, sizeof(*local->replies) * priv->child_count);
7c2869
 }
7c2869
 
7c2869
-void
7c2869
-afr_remove_eager_lock_stub (afr_local_t *local)
7c2869
-{
7c2869
-        LOCK (&local->fd->lock);
7c2869
-        {
7c2869
-                list_del_init (&local->transaction.eager_locked);
7c2869
-        }
7c2869
-        UNLOCK (&local->fd->lock);
7c2869
-}
7c2869
-
7c2869
 static gf_boolean_t
7c2869
 afr_fop_lock_is_unlock (call_frame_t *frame)
7c2869
 {
d1681e
@@ -1926,10 +1951,6 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this)
7c2869
 
7c2869
 	syncbarrier_destroy (&local->barrier);
7c2869
 
7c2869
-        if (local->transaction.eager_lock_on &&
7c2869
-            !list_empty (&local->transaction.eager_locked))
7c2869
-                afr_remove_eager_lock_stub (local);
7c2869
-
7c2869
         afr_local_transaction_cleanup (local, this);
7c2869
 
7c2869
         priv = this->private;
d1681e
@@ -3160,22 +3181,8 @@ out:
7c2869
 void
7c2869
 _afr_cleanup_fd_ctx (afr_fd_ctx_t *fd_ctx)
7c2869
 {
7c2869
-        int i = 0;
7c2869
-
7c2869
-
7c2869
-	for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++)
7c2869
-		GF_FREE (fd_ctx->pre_op_done[i]);
7c2869
-
7c2869
         GF_FREE (fd_ctx->opened_on);
7c2869
-
7c2869
-        GF_FREE (fd_ctx->lock_piggyback);
7c2869
-
7c2869
-        GF_FREE (fd_ctx->lock_acquired);
7c2869
-
7c2869
-	pthread_mutex_destroy (&fd_ctx->delay_lock);
7c2869
-
7c2869
         GF_FREE (fd_ctx);
7c2869
-
7c2869
         return;
7c2869
 }
7c2869
 
d1681e
@@ -3193,15 +3200,7 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd)
7c2869
         fd_ctx = (afr_fd_ctx_t *)(long) ctx;
7c2869
 
7c2869
         if (fd_ctx) {
7c2869
-                /*no need to take any locks*/
7c2869
-                if (!list_empty (&fd_ctx->eager_locked))
7c2869
-                        gf_msg (this->name, GF_LOG_WARNING, 0,
7c2869
-                                AFR_MSG_INVALID_DATA, "%s: Stale "
7c2869
-                                "Eager-lock stubs found",
7c2869
-                                uuid_utoa (fd->inode->gfid));
7c2869
-
7c2869
                 _afr_cleanup_fd_ctx (fd_ctx);
7c2869
-
7c2869
         }
7c2869
 
7c2869
 out:
d1681e
@@ -3282,23 +3281,6 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd)
7c2869
                 goto out;
7c2869
         }
7c2869
 
7c2869
-        ret = pthread_mutex_init (&fd_ctx->delay_lock, NULL);
7c2869
-        if (ret) {
7c2869
-                GF_FREE (fd_ctx);
7c2869
-                fd_ctx = NULL;
7c2869
-                goto out;
7c2869
-        }
7c2869
-
7c2869
-	for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
7c2869
-		fd_ctx->pre_op_done[i] = GF_CALLOC (sizeof (*fd_ctx->pre_op_done[i]),
7c2869
-						    priv->child_count,
7c2869
-						    gf_afr_mt_int32_t);
7c2869
-		if (!fd_ctx->pre_op_done[i]) {
7c2869
-			ret = -ENOMEM;
7c2869
-			goto out;
7c2869
-		}
7c2869
-	}
7c2869
-
7c2869
         fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on),
7c2869
                                        priv->child_count,
7c2869
                                        gf_afr_mt_int32_t);
d1681e
@@ -3314,26 +3296,8 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd)
7c2869
 			fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED;
7c2869
 	}
7c2869
 
7c2869
-        fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback),
7c2869
-                                            priv->child_count,
7c2869
-                                            gf_afr_mt_char);
7c2869
-        if (!fd_ctx->lock_piggyback) {
7c2869
-                ret = -ENOMEM;
7c2869
-                goto out;
7c2869
-        }
7c2869
-
7c2869
-        fd_ctx->lock_acquired = GF_CALLOC (sizeof (*fd_ctx->lock_acquired),
7c2869
-                                           priv->child_count,
7c2869
-                                           gf_afr_mt_char);
7c2869
-        if (!fd_ctx->lock_acquired) {
7c2869
-                ret = -ENOMEM;
7c2869
-                goto out;
7c2869
-        }
7c2869
-
7c2869
 	fd_ctx->readdir_subvol = -1;
7c2869
 
7c2869
-        INIT_LIST_HEAD (&fd_ctx->eager_locked);
7c2869
-
7c2869
         ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx);
7c2869
         if (ret)
7c2869
                 gf_msg_debug (this->name, 0,
d1681e
@@ -3405,12 +3369,70 @@ afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
7c2869
         return 0;
7c2869
 }
7c2869
 
7c2869
+afr_local_t*
7c2869
+afr_wakeup_same_fd_delayed_op (xlator_t *this, afr_lock_t *lock, fd_t *fd)
7c2869
+{
7c2869
+        afr_local_t *local = NULL;
7c2869
+
7c2869
+        if (lock->delay_timer) {
7c2869
+                local = list_entry(lock->post_op.next, afr_local_t,
7c2869
+                                   transaction.owner_list);
7c2869
+                if (fd == local->fd) {
7c2869
+                        if (gf_timer_call_cancel (this->ctx,
7c2869
+                                                  lock->delay_timer)) {
7c2869
+                                local = NULL;
7c2869
+                        } else {
7c2869
+                                lock->delay_timer = NULL;
7c2869
+                        }
7c2869
+                } else {
7c2869
+                        local = NULL;
7c2869
+                }
7c2869
+        }
7c2869
+
7c2869
+        return local;
7c2869
+}
7c2869
+
7c2869
+void
7c2869
+afr_delayed_changelog_wake_resume (xlator_t *this, inode_t *inode,
7c2869
+                                   call_stub_t *stub)
7c2869
+{
7c2869
+        afr_inode_ctx_t *ctx = NULL;
7c2869
+        afr_lock_t      *lock = NULL;
7c2869
+        afr_local_t     *metadata_local = NULL;
7c2869
+        afr_local_t     *data_local = NULL;
7c2869
+        LOCK (&inode->lock);
7c2869
+        {
7c2869
+                (void)__afr_inode_ctx_get (this, inode, &ctx);
7c2869
+                lock = &ctx->lock[AFR_DATA_TRANSACTION];
7c2869
+                data_local = afr_wakeup_same_fd_delayed_op (this, lock,
7c2869
+                                                            stub->args.fd);
7c2869
+                lock = &ctx->lock[AFR_METADATA_TRANSACTION];
7c2869
+                metadata_local = afr_wakeup_same_fd_delayed_op (this, lock,
7c2869
+                                                                stub->args.fd);
7c2869
+        }
7c2869
+        UNLOCK (&inode->lock);
7c2869
+
7c2869
+        if (data_local) {
7c2869
+                data_local->transaction.resume_stub = stub;
7c2869
+        } else if (metadata_local) {
7c2869
+                metadata_local->transaction.resume_stub = stub;
7c2869
+        } else {
7c2869
+                call_resume (stub);
7c2869
+        }
7c2869
+        if (data_local) {
7c2869
+                afr_delayed_changelog_wake_up_cbk (data_local);
7c2869
+        }
7c2869
+        if (metadata_local) {
7c2869
+                afr_delayed_changelog_wake_up_cbk (metadata_local);
7c2869
+        }
7c2869
+}
7c2869
+
7c2869
 int
7c2869
 afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
7c2869
 {
7c2869
-        afr_local_t   *local = NULL;
7c2869
-        call_stub_t   *stub = NULL;
7c2869
-        int            op_errno   = ENOMEM;
7c2869
+        afr_local_t *local   = NULL;
7c2869
+        call_stub_t *stub    = NULL;
7c2869
+        int         op_errno = ENOMEM;
7c2869
 
7c2869
 	local = AFR_FRAME_INIT (frame, op_errno);
7c2869
 	if (!local)
d1681e
@@ -3426,7 +3448,7 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
7c2869
         if (!stub)
7c2869
                 goto out;
7c2869
 
7c2869
-        afr_delayed_changelog_wake_resume (this, fd, stub);
7c2869
+        afr_delayed_changelog_wake_resume (this, fd->inode, stub);
7c2869
 
7c2869
 	return 0;
7c2869
 out:
d1681e
@@ -3434,7 +3456,6 @@ out:
d1681e
         return 0;
d1681e
 }
d1681e
 
d1681e
-
d1681e
 int
d1681e
 afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
d1681e
 		  int32_t op_ret, int32_t op_errno, dict_t *xdata)
d1681e
@@ -4497,7 +4518,7 @@ afr_forget (xlator_t *this, inode_t *inode)
7c2869
                 return 0;
7c2869
 
7c2869
         ctx = (afr_inode_ctx_t *)ctx_int;
7c2869
-        GF_FREE (ctx);
7c2869
+        afr_inode_ctx_destroy (ctx);
7c2869
         return 0;
7c2869
 }
7c2869
 
d1681e
@@ -5310,21 +5331,6 @@ out:
7c2869
 }
7c2869
 
7c2869
 int
7c2869
-afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count)
7c2869
-{
7c2869
-        int             ret = -ENOMEM;
7c2869
-
7c2869
-        lk->domain = dom;
7c2869
-        lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes),
7c2869
-                                      child_count, gf_afr_mt_char);
7c2869
-        if (NULL == lk->locked_nodes)
7c2869
-                goto out;
7c2869
-        ret = 0;
7c2869
-out:
7c2869
-        return ret;
7c2869
-}
7c2869
-
7c2869
-int
7c2869
 afr_transaction_local_init (afr_local_t *local, xlator_t *this)
7c2869
 {
7c2869
         int            ret = -ENOMEM;
d1681e
@@ -5335,25 +5341,9 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)
7c2869
         if (ret < 0)
7c2869
                 goto out;
7c2869
 
7c2869
-        if ((local->transaction.type == AFR_DATA_TRANSACTION) ||
7c2869
-            (local->transaction.type == AFR_METADATA_TRANSACTION)) {
7c2869
-                ret = afr_inodelk_init (&local->internal_lock.inodelk[0],
7c2869
-                                        this->name, priv->child_count);
7c2869
-                if (ret < 0)
7c2869
-                        goto out;
7c2869
-        }
7c2869
-
7c2869
         ret = -ENOMEM;
7c2869
 	local->pre_op_compat = priv->pre_op_compat;
7c2869
 
7c2869
-        local->transaction.eager_lock =
7c2869
-                GF_CALLOC (sizeof (*local->transaction.eager_lock),
7c2869
-                           priv->child_count,
7c2869
-                           gf_afr_mt_int32_t);
7c2869
-
7c2869
-        if (!local->transaction.eager_lock)
7c2869
-                goto out;
7c2869
-
7c2869
         local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op),
7c2869
                                                priv->child_count,
7c2869
                                                gf_afr_mt_char);
d1681e
@@ -5385,9 +5375,9 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)
7c2869
         if (!local->pending)
7c2869
                 goto out;
7c2869
 
7c2869
-	INIT_LIST_HEAD (&local->transaction.eager_locked);
7c2869
-
7c2869
         ret = 0;
7c2869
+        INIT_LIST_HEAD (&local->transaction.wait_list);
7c2869
+        INIT_LIST_HEAD (&local->transaction.owner_list);
7c2869
 out:
7c2869
         return ret;
7c2869
 }
d1681e
@@ -5422,24 +5412,6 @@ out:
7c2869
         return;
7c2869
 }
7c2869
 
7c2869
-void
7c2869
-afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this)
7c2869
-{
7c2869
-        afr_local_t     *local = NULL;
7c2869
-        afr_fd_ctx_t    *fd_ctx   = NULL;
7c2869
-
7c2869
-        local = frame->local;
7c2869
-
7c2869
-        if (!local->fd)
7c2869
-		return;
7c2869
-
7c2869
-	fd_ctx = afr_fd_ctx_get (local->fd, this);
7c2869
-	if (!fd_ctx)
7c2869
-		return;
7c2869
-
7c2869
-	fd_ctx->open_fd_count = local->open_fd_count;
7c2869
-}
7c2869
-
7c2869
 int**
7c2869
 afr_mark_pending_changelog (afr_private_t *priv, unsigned char *pending,
7c2869
                             dict_t *xattr, ia_type_t iat)
d1681e
@@ -5548,7 +5520,7 @@ out:
7c2869
 
7c2869
 int
7c2869
 afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,
7c2869
-                                  inode_t *inode, gf_boolean_t *dsh,
7c2869
+                                  fd_t *fd, gf_boolean_t *dsh,
7c2869
                                   gf_boolean_t *pflag)
7c2869
 {
7c2869
         int ret = -1;
d1681e
@@ -5558,8 +5530,8 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,
7c2869
         unsigned char *healed_sinks = NULL;
7c2869
         unsigned char *undid_pending = NULL;
7c2869
         afr_private_t   *priv = NULL;
7c2869
-        fd_t          *fd = NULL;
7c2869
         struct afr_reply *locked_replies = NULL;
7c2869
+        inode_t *inode = fd->inode;
7c2869
 
7c2869
         priv = this->private;
7c2869
         data_lock = alloca0 (priv->child_count);
d1681e
@@ -5568,18 +5540,6 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,
7c2869
         healed_sinks = alloca0 (priv->child_count);
7c2869
         undid_pending = alloca0 (priv->child_count);
7c2869
 
7c2869
-        /* Heal-info does an open() on the file being examined so that the
7c2869
-         * current eager-lock holding client, if present, at some point sees
7c2869
-         * open-fd count being > 1 and releases the eager-lock so that heal-info
7c2869
-         * doesn't remain blocked forever until IO completes.
7c2869
-         */
7c2869
-        ret = afr_selfheal_data_open (this, inode, &fd);
7c2869
-        if (ret < 0) {
7c2869
-                gf_msg_debug (this->name, -ret, "%s: Failed to open",
7c2869
-                              uuid_utoa (inode->gfid));
7c2869
-                goto out;
7c2869
-        }
7c2869
-
7c2869
         locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
7c2869
 
7c2869
         ret = afr_selfheal_inodelk (frame, this, inode, this->name,
d1681e
@@ -5602,8 +5562,6 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,
7c2869
 out:
7c2869
         if (locked_replies)
7c2869
                 afr_replies_wipe (locked_replies, priv->child_count);
7c2869
-        if (fd)
7c2869
-                fd_unref (fd);
7c2869
         return ret;
7c2869
 }
7c2869
 
d1681e
@@ -5688,6 +5646,7 @@ afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid,
7c2869
 
7c2869
 {
7c2869
         int ret             = -1;
7c2869
+        fd_t *fd            = NULL;
7c2869
         gf_boolean_t    dsh = _gf_false;
7c2869
         gf_boolean_t    msh = _gf_false;
7c2869
         gf_boolean_t    esh = _gf_false;
d1681e
@@ -5699,6 +5658,21 @@ afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid,
7c2869
 
7c2869
         /* For every heal type hold locks and check if it indeed needs heal */
7c2869
 
7c2869
+
7c2869
+        /* Heal-info does an open() on the file being examined so that the
7c2869
+         * current eager-lock holding client, if present, at some point sees
7c2869
+         * open-fd count being > 1 and releases the eager-lock so that heal-info
7c2869
+         * doesn't remain blocked forever until IO completes.
7c2869
+         */
7c2869
+        if ((*inode)->ia_type == IA_IFREG) {
7c2869
+                ret = afr_selfheal_data_open (this, *inode, &fd);
7c2869
+                if (ret < 0) {
7c2869
+                        gf_msg_debug (this->name, -ret, "%s: Failed to open",
7c2869
+                                      uuid_utoa ((*inode)->gfid));
7c2869
+                        goto out;
7c2869
+                }
7c2869
+        }
7c2869
+
7c2869
         if (msh) {
7c2869
                 ret = afr_selfheal_locked_metadata_inspect (frame, this,
7c2869
                                                             *inode, &msh,
d1681e
@@ -5708,7 +5682,7 @@ afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid,
7c2869
         }
7c2869
 
7c2869
         if (dsh) {
7c2869
-                ret = afr_selfheal_locked_data_inspect (frame, this, *inode,
7c2869
+                ret = afr_selfheal_locked_data_inspect (frame, this, fd,
7c2869
                                                         &dsh, pending);
7c2869
                 if (ret == -EIO || (ret == -EAGAIN))
7c2869
                         goto out;
d1681e
@@ -5723,6 +5697,8 @@ out:
7c2869
         *data_selfheal = dsh;
7c2869
         *entry_selfheal = esh;
7c2869
         *metadata_selfheal = msh;
7c2869
+        if (fd)
7c2869
+                fd_unref (fd);
7c2869
         return ret;
7c2869
 }
7c2869
 
d1681e
@@ -6352,6 +6328,7 @@ afr_write_subvol_reset (call_frame_t *frame, xlator_t *this)
d1681e
         local = frame->local;
d1681e
         LOCK(&local->inode->lock);
d1681e
         {
d1681e
+                GF_ASSERT (local->inode_ctx->lock_count > 0);
d1681e
                 local->inode_ctx->lock_count--;
d1681e
 
d1681e
                 if (!local->inode_ctx->lock_count)
7c2869
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
d1681e
index 2402bb2..b52b6ca 100644
7c2869
--- a/xlators/cluster/afr/src/afr-inode-write.c
7c2869
+++ b/xlators/cluster/afr/src/afr-inode-write.c
7c2869
@@ -341,14 +341,14 @@ afr_process_post_writev (call_frame_t *frame, xlator_t *this)
7c2869
                    the xattrs are not reliably pointing at
7c2869
                    a stale file.
7c2869
                 */
7c2869
-                afr_fd_report_unstable_write (this, local->fd);
7c2869
+                afr_fd_report_unstable_write (this, local);
7c2869
 
7c2869
         __afr_inode_write_finalize (frame, this);
7c2869
 
7c2869
         afr_writev_handle_short_writes (frame, this);
7c2869
 
7c2869
         if (local->update_open_fd_count)
7c2869
-                afr_handle_open_fd_count (frame, this);
7c2869
+                local->inode_ctx->open_fd_count = local->open_fd_count;
7c2869
 
7c2869
 }
7c2869
 
d1681e
@@ -2590,7 +2590,7 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
7c2869
         local->op = GF_FOP_FSYNC;
7c2869
         local->cont.fsync.datasync = datasync;
7c2869
 
7c2869
-	if (afr_fd_has_witnessed_unstable_write (this, fd)) {
7c2869
+	if (afr_fd_has_witnessed_unstable_write (this, fd->inode)) {
7c2869
 		/* don't care. we only wanted to CLEAR the bit */
7c2869
 	}
7c2869
 
7c2869
diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c
d1681e
index 260815f..be3de01 100644
7c2869
--- a/xlators/cluster/afr/src/afr-lk-common.c
7c2869
+++ b/xlators/cluster/afr/src/afr-lk-common.c
7c2869
@@ -52,31 +52,6 @@ afr_entry_lockee_cmp (const void *l1, const void *l2)
7c2869
 
7c2869
 int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index);
7c2869
 
7c2869
-static int
7c2869
-afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this);
7c2869
-
7c2869
-static uint64_t afr_lock_number = 1;
7c2869
-
7c2869
-static uint64_t
7c2869
-get_afr_lock_number ()
7c2869
-{
7c2869
-        return (++afr_lock_number);
7c2869
-}
7c2869
-
7c2869
-int
7c2869
-afr_set_lock_number (call_frame_t *frame, xlator_t *this)
7c2869
-{
7c2869
-        afr_local_t         *local    = NULL;
7c2869
-        afr_internal_lock_t *int_lock = NULL;
7c2869
-
7c2869
-        local    = frame->local;
7c2869
-        int_lock = &local->internal_lock;
7c2869
-
7c2869
-        int_lock->lock_number = get_afr_lock_number ();
7c2869
-
7c2869
-        return 0;
7c2869
-}
7c2869
-
7c2869
 void
7c2869
 afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner)
7c2869
 {
7c2869
@@ -203,21 +178,16 @@ initialize_inodelk_variables (call_frame_t *frame, xlator_t *this)
7c2869
         afr_local_t         *local    = NULL;
7c2869
         afr_internal_lock_t *int_lock = NULL;
7c2869
         afr_private_t       *priv     = NULL;
7c2869
-        afr_inodelk_t       *inodelk  = NULL;
7c2869
 
7c2869
         priv     = this->private;
7c2869
         local    = frame->local;
7c2869
         int_lock = &local->internal_lock;
7c2869
 
7c2869
-        inodelk = afr_get_inodelk (int_lock, int_lock->domain);
7c2869
-
7c2869
-        inodelk->lock_count    = 0;
7c2869
+        int_lock->lock_count    = 0;
7c2869
         int_lock->lk_attempted_count = 0;
7c2869
         int_lock->lock_op_ret   = -1;
7c2869
         int_lock->lock_op_errno = 0;
7c2869
 
7c2869
-        memset (inodelk->locked_nodes, 0,
7c2869
-                sizeof (*inodelk->locked_nodes) * priv->child_count);
7c2869
         memset (int_lock->locked_nodes, 0,
7c2869
                 sizeof (*int_lock->locked_nodes) * priv->child_count);
7c2869
 
d1681e
@@ -286,12 +256,7 @@ void
7c2869
 afr_update_uninodelk (afr_local_t *local, afr_internal_lock_t *int_lock,
7c2869
                     int32_t child_index)
7c2869
 {
7c2869
-        afr_inodelk_t       *inodelk = NULL;
7c2869
-
7c2869
-        inodelk = afr_get_inodelk (int_lock, int_lock->domain);
7c2869
-        inodelk->locked_nodes[child_index] &= LOCKED_NO;
7c2869
-        if (local->transaction.eager_lock)
7c2869
-                local->transaction.eager_lock[child_index] = 0;
7c2869
+        int_lock->locked_nodes[child_index] &= LOCKED_NO;
7c2869
 
7c2869
 }
7c2869
 
d1681e
@@ -331,35 +296,27 @@ static int
7c2869
 afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
7c2869
 {
7c2869
         afr_internal_lock_t *int_lock = NULL;
7c2869
-        afr_inodelk_t       *inodelk  = NULL;
7c2869
         afr_local_t         *local    = NULL;
7c2869
         afr_private_t       *priv     = NULL;
7c2869
         struct gf_flock flock = {0,};
7c2869
-        struct gf_flock full_flock = {0,};
7c2869
-        struct gf_flock *flock_use = NULL;
7c2869
         int call_count = 0;
7c2869
         int i = 0;
7c2869
-        int piggyback = 0;
7c2869
-        afr_fd_ctx_t        *fd_ctx      = NULL;
7c2869
-
7c2869
 
7c2869
         local    = frame->local;
7c2869
         int_lock = &local->internal_lock;
7c2869
         priv     = this->private;
7c2869
 
7c2869
-        inodelk = afr_get_inodelk (int_lock, int_lock->domain);
7c2869
-
7c2869
-        flock.l_start = inodelk->flock.l_start;
7c2869
-        flock.l_len   = inodelk->flock.l_len;
7c2869
+        flock.l_start = int_lock->flock.l_start;
7c2869
+        flock.l_len   = int_lock->flock.l_len;
7c2869
         flock.l_type  = F_UNLCK;
7c2869
 
7c2869
-        full_flock.l_type = F_UNLCK;
7c2869
-        call_count = afr_locked_nodes_count (inodelk->locked_nodes,
7c2869
+        call_count = afr_locked_nodes_count (int_lock->locked_nodes,
7c2869
                                              priv->child_count);
7c2869
 
7c2869
         int_lock->lk_call_count = call_count;
7c2869
 
7c2869
         if (!call_count) {
7c2869
+                GF_ASSERT (!local->transaction.do_eager_unlock);
7c2869
                 gf_msg_trace (this->name, 0,
7c2869
                               "No internal locks unlocked");
7c2869
 
d1681e
@@ -367,64 +324,28 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
7c2869
                 goto out;
7c2869
         }
7c2869
 
7c2869
-        if (local->fd)
7c2869
-                fd_ctx = afr_fd_ctx_get (local->fd, this);
7c2869
-
7c2869
         for (i = 0; i < priv->child_count; i++) {
7c2869
-                if ((inodelk->locked_nodes[i] & LOCKED_YES) != LOCKED_YES)
7c2869
+                if ((int_lock->locked_nodes[i] & LOCKED_YES) != LOCKED_YES)
7c2869
                         continue;
7c2869
 
7c2869
                 if (local->fd) {
7c2869
-                        flock_use = &flock;
7c2869
-                        if (!local->transaction.eager_lock[i]) {
7c2869
-                                goto wind;
7c2869
-                        }
7c2869
-
7c2869
-                        piggyback = 0;
7c2869
-
7c2869
-                        LOCK (&local->fd->lock);
7c2869
-                        {
7c2869
-                                if (fd_ctx->lock_piggyback[i]) {
7c2869
-                                        fd_ctx->lock_piggyback[i]--;
7c2869
-                                        piggyback = 1;
7c2869
-                                } else {
7c2869
-                                        fd_ctx->lock_acquired[i]--;
7c2869
-                                }
7c2869
-                        }
7c2869
-                        UNLOCK (&local->fd->lock);
7c2869
-
7c2869
-                        if (piggyback) {
7c2869
-                                afr_unlock_inodelk_cbk (frame, (void *) (long) i,
7c2869
-                                                        this, 1, 0, NULL);
7c2869
-                                if (!--call_count)
7c2869
-                                        break;
7c2869
-                                continue;
7c2869
-                        }
7c2869
-
7c2869
-                        flock_use = &full_flock;
7c2869
-                wind:
7c2869
                         STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk,
7c2869
                                            (void *) (long)i,
7c2869
                                            priv->children[i],
7c2869
                                            priv->children[i]->fops->finodelk,
7c2869
                                            int_lock->domain, local->fd,
7c2869
-                                           F_SETLK, flock_use, NULL);
7c2869
-
7c2869
-                        if (!--call_count)
7c2869
-                                break;
7c2869
-
7c2869
+                                           F_SETLK, &flock, NULL);
7c2869
                 } else {
7c2869
-
7c2869
                         STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk,
7c2869
                                            (void *) (long)i,
7c2869
                                            priv->children[i],
7c2869
                                            priv->children[i]->fops->inodelk,
7c2869
                                            int_lock->domain, &local->loc,
7c2869
                                            F_SETLK, &flock, NULL);
7c2869
-
7c2869
-                        if (!--call_count)
7c2869
-                                break;
7c2869
                 }
7c2869
+
7c2869
+                if (!--call_count)
7c2869
+                        break;
7c2869
         }
7c2869
 out:
7c2869
         return 0;
d1681e
@@ -512,6 +433,18 @@ out:
7c2869
 
7c2869
 }
7c2869
 
7c2869
+int32_t
7c2869
+afr_unlock_now (call_frame_t *frame, xlator_t *this)
7c2869
+{
7c2869
+        afr_local_t *local = frame->local;
7c2869
+
7c2869
+        if (afr_is_inodelk_transaction(local->transaction.type))
7c2869
+                afr_unlock_inodelk (frame, this);
7c2869
+        else
7c2869
+                afr_unlock_entrylk (frame, this);
7c2869
+        return 0;
7c2869
+}
7c2869
+
7c2869
 static int32_t
7c2869
 afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
7c2869
               int32_t op_ret, int32_t op_errno, dict_t *xdata)
d1681e
@@ -553,7 +486,7 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
7c2869
 
7c2869
         if ((op_ret == -1) &&
7c2869
             (op_errno == ENOSYS)) {
7c2869
-                afr_unlock (frame, this);
7c2869
+                afr_unlock_now (frame, this);
7c2869
         } else {
7c2869
                 if (op_ret == 0) {
7c2869
                         if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
d1681e
@@ -598,38 +531,6 @@ afr_blocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
7c2869
         return 0;
7c2869
 }
7c2869
 
7c2869
-static int
7c2869
-afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this)
7c2869
-{
7c2869
-        afr_internal_lock_t *int_lock = NULL;
7c2869
-        afr_inodelk_t       *inodelk  = NULL;
7c2869
-        afr_local_t         *local    = NULL;
7c2869
-        afr_private_t       *priv     = NULL;
7c2869
-
7c2869
-        priv     = this->private;
7c2869
-        local    = frame->local;
7c2869
-        int_lock = &local->internal_lock;
7c2869
-
7c2869
-        switch (local->transaction.type) {
7c2869
-        case AFR_DATA_TRANSACTION:
7c2869
-        case AFR_METADATA_TRANSACTION:
7c2869
-                inodelk = afr_get_inodelk (int_lock, int_lock->domain);
7c2869
-                memcpy (inodelk->locked_nodes, int_lock->locked_nodes,
7c2869
-                        sizeof (*inodelk->locked_nodes) * priv->child_count);
7c2869
-                inodelk->lock_count = int_lock->lock_count;
7c2869
-                break;
7c2869
-
7c2869
-        case AFR_ENTRY_RENAME_TRANSACTION:
7c2869
-        case AFR_ENTRY_TRANSACTION:
7c2869
-                /*entrylk_count is being used in both non-blocking and blocking
7c2869
-                 * modes */
7c2869
-                break;
7c2869
-        }
7c2869
-
7c2869
-        return 0;
7c2869
-
7c2869
-}
7c2869
-
7c2869
 static gf_boolean_t
7c2869
 afr_is_entrylk (afr_transaction_type trans_type)
7c2869
 {
d1681e
@@ -733,7 +634,6 @@ int
7c2869
 afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
7c2869
 {
7c2869
         afr_internal_lock_t *int_lock    = NULL;
7c2869
-        afr_inodelk_t       *inodelk     = NULL;
7c2869
         afr_local_t         *local       = NULL;
7c2869
         afr_private_t       *priv        = NULL;
7c2869
         struct gf_flock flock = {0,};
d1681e
@@ -752,10 +652,9 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
7c2869
 
7c2869
 
7c2869
         if (!is_entrylk) {
7c2869
-                inodelk = afr_get_inodelk (int_lock, int_lock->domain);
7c2869
-                flock.l_start = inodelk->flock.l_start;
7c2869
-                flock.l_len   = inodelk->flock.l_len;
7c2869
-                flock.l_type  = inodelk->flock.l_type;
7c2869
+                flock.l_start = int_lock->flock.l_start;
7c2869
+                flock.l_len   = int_lock->flock.l_len;
7c2869
+                flock.l_type  = int_lock->flock.l_type;
7c2869
         }
7c2869
 
7c2869
         if (local->fd) {
d1681e
@@ -770,9 +669,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
7c2869
                         local->op_ret           = -1;
7c2869
                         int_lock->lock_op_ret   = -1;
7c2869
 
7c2869
-                        afr_copy_locked_nodes (frame, this);
7c2869
-
7c2869
-                        afr_unlock (frame, this);
7c2869
+                        afr_unlock_now (frame, this);
7c2869
 
7c2869
                         return 0;
7c2869
                 }
d1681e
@@ -784,9 +681,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
7c2869
                         local->op_ret           = -1;
7c2869
                         int_lock->lock_op_ret   = -1;
7c2869
 
7c2869
-                        afr_copy_locked_nodes (frame, this);
7c2869
-
7c2869
-                        afr_unlock(frame, this);
7c2869
+                        afr_unlock_now(frame, this);
7c2869
 
7c2869
                         return 0;
7c2869
                 }
d1681e
@@ -798,8 +693,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
7c2869
                 gf_msg_debug (this->name, 0,
7c2869
                               "we're done locking");
7c2869
 
7c2869
-                afr_copy_locked_nodes (frame, this);
7c2869
-
7c2869
                 int_lock->lock_op_ret = 0;
7c2869
                 int_lock->lock_cbk (frame, this);
7c2869
                 return 0;
d1681e
@@ -815,7 +708,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
7c2869
         case AFR_METADATA_TRANSACTION:
7c2869
 
7c2869
                 if (local->fd) {
7c2869
-
7c2869
                         STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk,
7c2869
                                            (void *) (long) child_index,
7c2869
                                            priv->children[child_index],
d1681e
@@ -824,7 +716,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
7c2869
                                            F_SETLKW, &flock, NULL);
7c2869
 
7c2869
                 } else {
7c2869
-
7c2869
                         STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk,
7c2869
                                            (void *) (long) child_index,
7c2869
                                            priv->children[child_index],
d1681e
@@ -841,7 +732,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
7c2869
                  *and 'fd-less' children */
7c2869
 
7c2869
                 if (local->fd) {
7c2869
-
7c2869
                         STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk,
7c2869
                                            (void *) (long) cookie,
7c2869
                                            priv->children[child_index],
d1681e
@@ -850,7 +740,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
7c2869
                                            int_lock->lockee[lockee_no].basename,
7c2869
                                            ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
7c2869
                 } else {
7c2869
-
7c2869
                         STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk,
7c2869
                                            (void *) (long) cookie,
7c2869
                                            priv->children[child_index],
d1681e
@@ -922,7 +811,6 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
7c2869
         local    = frame->local;
7c2869
         int_lock = &local->internal_lock;
7c2869
 
7c2869
-
7c2869
 	LOCK (&frame->lock);
7c2869
 	{
7c2869
 		if (op_ret < 0 ) {
d1681e
@@ -969,7 +857,7 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
7c2869
                                       "with blocking calls",
7c2869
                                       int_lock->lock_count);
7c2869
 
7c2869
-                        afr_unlock(frame, this);
7c2869
+                        afr_unlock_now(frame, this);
7c2869
                 }
7c2869
         }
7c2869
 
d1681e
@@ -1009,7 +897,7 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
7c2869
                         local->op_errno         = EINVAL;
7c2869
                         int_lock->lock_op_errno = EINVAL;
7c2869
 
7c2869
-			afr_unlock (frame, this);
7c2869
+			afr_unlock_now (frame, this);
7c2869
                         return -1;
7c2869
                 }
7c2869
 
d1681e
@@ -1021,7 +909,7 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
7c2869
                         gf_msg (this->name, GF_LOG_INFO, 0,
7c2869
                                 AFR_MSG_INFO_COMMON,
7c2869
                                 "fd not open on any subvolumes. aborting.");
7c2869
-                        afr_unlock (frame, this);
7c2869
+                        afr_unlock_now (frame, this);
7c2869
                         goto out;
7c2869
                 }
7c2869
 
d1681e
@@ -1031,7 +919,6 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
7c2869
                         index = i%copies;
7c2869
                         lockee_no = i/copies;
7c2869
                         if (local->child_up[index]) {
7c2869
-
7c2869
                                 STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk,
7c2869
                                                    (void *) (long) i,
7c2869
                                                    priv->children[index],
d1681e
@@ -1053,7 +940,6 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
7c2869
                         index = i%copies;
7c2869
                         lockee_no = i/copies;
7c2869
                         if (local->child_up[index]) {
7c2869
-
7c2869
                                 STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk,
7c2869
                                                    (void *) (long) i,
7c2869
                                                    priv->children[index],
d1681e
@@ -1077,18 +963,12 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
7c2869
                              int32_t op_ret, int32_t op_errno, dict_t *xdata)
7c2869
 {
d1681e
         afr_internal_lock_t *int_lock    = NULL;
d1681e
-        afr_inodelk_t       *inodelk     = NULL;
d1681e
         afr_local_t         *local       = NULL;
d1681e
-        afr_fd_ctx_t        *fd_ctx      = NULL;
d1681e
         int                  call_count  = 0;
d1681e
         int                  child_index = (long) cookie;
7c2869
 
7c2869
         local    = frame->local;
7c2869
         int_lock = &local->internal_lock;
7c2869
-        inodelk = afr_get_inodelk (int_lock, int_lock->domain);
7c2869
-
7c2869
-	if (local->fd)
7c2869
-		fd_ctx = afr_fd_ctx_get (local->fd, this);
7c2869
 
7c2869
         LOCK (&frame->lock);
7c2869
         {
d1681e
@@ -1105,43 +985,27 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
7c2869
 				int_lock->lock_op_errno      = op_errno;
7c2869
 				local->op_errno              = op_errno;
7c2869
 			}
7c2869
-			if (local->transaction.eager_lock)
7c2869
-				local->transaction.eager_lock[child_index] = 0;
7c2869
 		} else {
7c2869
-			inodelk->locked_nodes[child_index] |= LOCKED_YES;
7c2869
-			inodelk->lock_count++;
7c2869
-
7c2869
-			if (local->transaction.eager_lock &&
7c2869
-			    local->transaction.eager_lock[child_index] &&
7c2869
-			    local->fd) {
7c2869
-				/* piggybacked */
7c2869
-				if (op_ret == 1) {
7c2869
-					/* piggybacked */
7c2869
-				} else if (op_ret == 0) {
7c2869
-					/* lock acquired from server */
7c2869
-                                        fd_ctx->lock_acquired[child_index]++;
7c2869
-				}
7c2869
-			}
d1681e
-
d1681e
-                        if (local->transaction.type == AFR_DATA_TRANSACTION &&
d1681e
-                            op_ret == 0) {
d1681e
-                                LOCK(&local->inode->lock);
d1681e
-                                {
d1681e
-                                        local->inode_ctx->lock_count++;
d1681e
-                                }
d1681e
-                                UNLOCK (&local->inode->lock);
d1681e
-                        }
d1681e
+			int_lock->locked_nodes[child_index] |= LOCKED_YES;
d1681e
+			int_lock->lock_count++;
7c2869
 		}
7c2869
 
7c2869
                 call_count = --int_lock->lk_call_count;
d1681e
         }
d1681e
         UNLOCK (&frame->lock);
d1681e
 
d1681e
+        if (op_ret == 0 && local->transaction.type == AFR_DATA_TRANSACTION) {
d1681e
+                LOCK (&local->inode->lock);
d1681e
+                {
d1681e
+                        local->inode_ctx->lock_count++;
d1681e
+                }
d1681e
+                UNLOCK (&local->inode->lock);
d1681e
+        }
d1681e
         if (call_count == 0) {
7c2869
                 gf_msg_trace (this->name, 0,
7c2869
                               "Last inode locking reply received");
7c2869
                 /* all locks successful. Proceed to call FOP */
7c2869
-                if (inodelk->lock_count == int_lock->lk_expected_count) {
7c2869
+                if (int_lock->lock_count == int_lock->lk_expected_count) {
7c2869
                         gf_msg_trace (this->name, 0,
7c2869
                                       "All servers locked. Calling the cbk");
7c2869
                         int_lock->lock_op_ret = 0;
d1681e
@@ -1155,7 +1019,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
7c2869
                                       "Trying again with blocking calls",
7c2869
                                       int_lock->lock_count);
7c2869
 
7c2869
-                        afr_unlock(frame, this);
7c2869
+                        afr_unlock_now(frame, this);
7c2869
                 }
7c2869
         }
7c2869
 
d1681e
@@ -1166,30 +1030,17 @@ int
7c2869
 afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
7c2869
 {
7c2869
         afr_internal_lock_t *int_lock = NULL;
7c2869
-        afr_inodelk_t       *inodelk  = NULL;
7c2869
         afr_local_t         *local    = NULL;
7c2869
         afr_private_t       *priv     = NULL;
7c2869
         afr_fd_ctx_t        *fd_ctx   = NULL;
7c2869
         int32_t             call_count = 0;
7c2869
         int                 i          = 0;
7c2869
         int                 ret        = 0;
7c2869
-        struct              gf_flock flock = {0,};
7c2869
-        struct              gf_flock full_flock = {0,};
7c2869
-        struct              gf_flock *flock_use = NULL;
7c2869
-        int                 piggyback = 0;
7c2869
 
7c2869
         local    = frame->local;
7c2869
         int_lock = &local->internal_lock;
7c2869
         priv     = this->private;
7c2869
 
7c2869
-        inodelk = afr_get_inodelk (int_lock, int_lock->domain);
7c2869
-
7c2869
-        flock.l_start = inodelk->flock.l_start;
7c2869
-        flock.l_len   = inodelk->flock.l_len;
7c2869
-        flock.l_type  = inodelk->flock.l_type;
7c2869
-
7c2869
-        full_flock.l_type = inodelk->flock.l_type;
7c2869
-
7c2869
         initialize_inodelk_variables (frame, this);
7c2869
 
7c2869
         if (local->fd) {
d1681e
@@ -1205,88 +1056,48 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
7c2869
                         local->op_errno         = EINVAL;
7c2869
                         int_lock->lock_op_errno = EINVAL;
7c2869
 
7c2869
-			afr_unlock (frame, this);
7c2869
+			afr_unlock_now (frame, this);
7c2869
                         ret = -1;
7c2869
                         goto out;
7c2869
                 }
7c2869
+        }
7c2869
 
7c2869
-                call_count = internal_lock_count (frame, this);
7c2869
-                int_lock->lk_call_count = call_count;
7c2869
-                int_lock->lk_expected_count = call_count;
7c2869
-
7c2869
-                if (!call_count) {
7c2869
-                        gf_msg (this->name, GF_LOG_INFO, 0,
d1681e
-                                AFR_MSG_SUBVOLS_DOWN,
7c2869
-                                "All bricks are down, aborting.");
7c2869
-                        afr_unlock (frame, this);
7c2869
-                        goto out;
7c2869
-                }
7c2869
-
7c2869
-                /* Send non-blocking inodelk calls only on up children
7c2869
-                   and where the fd has been opened */
7c2869
-                for (i = 0; i < priv->child_count; i++) {
7c2869
-                        if (!local->child_up[i])
7c2869
-                                continue;
7c2869
-
7c2869
-                        flock_use = &flock;
7c2869
-                        if (!local->transaction.eager_lock_on) {
7c2869
-                                goto wind;
7c2869
-                        }
7c2869
-
7c2869
-                        piggyback = 0;
7c2869
-                        local->transaction.eager_lock[i] = 1;
7c2869
-
7c2869
-			afr_set_delayed_post_op (frame, this);
7c2869
+        call_count = internal_lock_count (frame, this);
7c2869
+        int_lock->lk_call_count = call_count;
7c2869
+        int_lock->lk_expected_count = call_count;
7c2869
 
7c2869
-                        LOCK (&local->fd->lock);
7c2869
-                        {
7c2869
-                                if (fd_ctx->lock_acquired[i]) {
7c2869
-                                        fd_ctx->lock_piggyback[i]++;
7c2869
-                                        piggyback = 1;
7c2869
-                                }
7c2869
-                        }
7c2869
-                        UNLOCK (&local->fd->lock);
7c2869
+        if (!call_count) {
7c2869
+                gf_msg (this->name, GF_LOG_INFO, 0,
d1681e
+                        AFR_MSG_SUBVOLS_DOWN,
7c2869
+                        "All bricks are down, aborting.");
7c2869
+                afr_unlock_now (frame, this);
7c2869
+                goto out;
7c2869
+        }
7c2869
 
7c2869
-                        if (piggyback) {
7c2869
-                                /* (op_ret == 1) => indicate piggybacked lock */
7c2869
-                                afr_nonblocking_inodelk_cbk (frame, (void *) (long) i,
7c2869
-                                                             this, 1, 0, NULL);
7c2869
-                                if (!--call_count)
7c2869
-                                        break;
7c2869
-                                continue;
7c2869
-                        }
7c2869
-                        flock_use = &full_flock;
7c2869
-                wind:
7c2869
+        /* Send non-blocking inodelk calls only on up children
7c2869
+           and where the fd has been opened */
7c2869
+        for (i = 0; i < priv->child_count; i++) {
7c2869
+                if (!local->child_up[i])
7c2869
+                        continue;
7c2869
 
7c2869
+                if (local->fd) {
7c2869
                         STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk,
7c2869
                                            (void *) (long) i,
7c2869
                                            priv->children[i],
7c2869
                                            priv->children[i]->fops->finodelk,
7c2869
                                            int_lock->domain, local->fd,
7c2869
-                                           F_SETLK, flock_use, NULL);
7c2869
-
7c2869
-                        if (!--call_count)
7c2869
-                                break;
7c2869
-                }
7c2869
-        } else {
7c2869
-                call_count = internal_lock_count (frame, this);
7c2869
-                int_lock->lk_call_count = call_count;
7c2869
-                int_lock->lk_expected_count = call_count;
7c2869
-
7c2869
-                for (i = 0; i < priv->child_count; i++) {
7c2869
-                        if (!local->child_up[i])
7c2869
-                                continue;
7c2869
+                                           F_SETLK, &int_lock->flock, NULL);
7c2869
+                } else {
7c2869
 
7c2869
                         STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk,
7c2869
                                            (void *) (long) i,
7c2869
                                            priv->children[i],
7c2869
                                            priv->children[i]->fops->inodelk,
7c2869
                                            int_lock->domain, &local->loc,
7c2869
-                                           F_SETLK, &flock, NULL);
7c2869
-
7c2869
-                        if (!--call_count)
7c2869
-                                break;
7c2869
+                                           F_SETLK, &int_lock->flock, NULL);
7c2869
                 }
7c2869
+                if (!--call_count)
7c2869
+                        break;
7c2869
         }
7c2869
 out:
7c2869
         return ret;
d1681e
@@ -1296,13 +1107,32 @@ int32_t
7c2869
 afr_unlock (call_frame_t *frame, xlator_t *this)
7c2869
 {
7c2869
         afr_local_t *local = NULL;
7c2869
+        afr_lock_t  *lock  = NULL;
7c2869
 
7c2869
         local = frame->local;
7c2869
 
7c2869
-        if (afr_is_inodelk_transaction(local->transaction.type))
7c2869
-                afr_unlock_inodelk (frame, this);
7c2869
-        else
7c2869
-                afr_unlock_entrylk (frame, this);
7c2869
+        if (!local->transaction.eager_lock_on)
7c2869
+                goto out;
7c2869
+        lock = &local->inode_ctx->lock[local->transaction.type];
7c2869
+        LOCK (&local->inode->lock);
7c2869
+        {
7c2869
+                list_del_init (&local->transaction.owner_list);
7c2869
+                if (list_empty (&lock->owners) && list_empty (&lock->post_op)) {
7c2869
+                        local->transaction.do_eager_unlock = _gf_true;
7c2869
+        /*TODO: Need to get metadata use on_disk and inherit/uninherit
7c2869
+         *GF_ASSERT (!local->inode_ctx->on_disk[local->transaction.type]);
7c2869
+         *GF_ASSERT (!local->inode_ctx->inherited[local->transaction.type]);
7c2869
+        */
7c2869
+                        GF_ASSERT (lock->release);
7c2869
+                }
7c2869
+        }
7c2869
+        UNLOCK (&local->inode->lock);
7c2869
+        if (!local->transaction.do_eager_unlock) {
7c2869
+                local->internal_lock.lock_cbk (frame, this);
7c2869
+                return 0;
7c2869
+        }
7c2869
 
7c2869
+out:
7c2869
+        afr_unlock_now (frame, this);
7c2869
         return 0;
7c2869
 }
7c2869
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
d1681e
index f61b237..32fd24a 100644
7c2869
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
7c2869
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
d1681e
@@ -2463,6 +2463,7 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid)
7c2869
         int           data_ret          = 1;
7c2869
         int           or_ret            = 0;
7c2869
         inode_t      *inode             = NULL;
7c2869
+        fd_t         *fd                = NULL;
7c2869
 	gf_boolean_t  data_selfheal     = _gf_false;
7c2869
 	gf_boolean_t  metadata_selfheal = _gf_false;
7c2869
 	gf_boolean_t  entry_selfheal    = _gf_false;
d1681e
@@ -2487,8 +2488,16 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid)
7c2869
                 goto out;
7c2869
         }
7c2869
 
7c2869
+        if (inode->ia_type == IA_IFREG) {
7c2869
+                ret = afr_selfheal_data_open (this, inode, &fd);
7c2869
+                if (!fd) {
7c2869
+                        ret = -EIO;
7c2869
+                        goto out;
7c2869
+                }
7c2869
+        }
7c2869
+
7c2869
 	if (data_selfheal && dataheal_enabled)
7c2869
-                data_ret = afr_selfheal_data (frame, this, inode);
7c2869
+                data_ret = afr_selfheal_data (frame, this, fd);
7c2869
 
7c2869
 	if (metadata_selfheal && priv->metadata_self_heal)
7c2869
                 metadata_ret = afr_selfheal_metadata (frame, this, inode);
d1681e
@@ -2510,6 +2519,8 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid)
7c2869
 out:
7c2869
         if (inode)
7c2869
                 inode_unref (inode);
7c2869
+        if (fd)
7c2869
+                fd_unref (fd);
7c2869
         return ret;
7c2869
 }
7c2869
 /*
7c2869
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
d1681e
index bcd0dec..f872a98 100644
7c2869
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
7c2869
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
d1681e
@@ -856,22 +856,15 @@ out:
7c2869
 }
7c2869
 
7c2869
 int
7c2869
-afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode)
7c2869
+afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd)
7c2869
 {
7c2869
 	afr_private_t *priv = NULL;
7c2869
 	unsigned char *locked_on = NULL;
7c2869
 	int ret = 0;
7c2869
-	fd_t *fd = NULL;
7c2869
+        inode_t *inode = fd->inode;
7c2869
 
7c2869
 	priv = this->private;
7c2869
 
7c2869
-	ret = afr_selfheal_data_open (this, inode, &fd);
7c2869
-	if (!fd) {
7c2869
-                gf_msg_debug (this->name, -ret, "%s: Failed to open",
7c2869
-                              uuid_utoa (inode->gfid));
7c2869
-                return -EIO;
7c2869
-        }
7c2869
-
7c2869
 	locked_on = alloca0 (priv->child_count);
7c2869
 
7c2869
 	ret = afr_selfheal_tie_breaker_inodelk (frame, this, inode,
d1681e
@@ -898,8 +891,5 @@ unlock:
7c2869
 	afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0,
7c2869
 	                        locked_on);
7c2869
 
7c2869
-	if (fd)
7c2869
-		fd_unref (fd);
7c2869
-
7c2869
 	return ret;
7c2869
 }
7c2869
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
d1681e
index 188a334..b015976 100644
7c2869
--- a/xlators/cluster/afr/src/afr-self-heal.h
7c2869
+++ b/xlators/cluster/afr/src/afr-self-heal.h
d1681e
@@ -102,7 +102,7 @@ afr_selfheal_name (xlator_t *this, uuid_t gfid, const char *name,
d1681e
                    void *gfid_req, dict_t *xdata);
7c2869
 
7c2869
 int
7c2869
-afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode);
7c2869
+afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd);
7c2869
 
7c2869
 int
7c2869
 afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode);
7c2869
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
d1681e
index acbfe1a..993029d 100644
7c2869
--- a/xlators/cluster/afr/src/afr-transaction.c
7c2869
+++ b/xlators/cluster/afr/src/afr-transaction.c
7c2869
@@ -25,6 +25,18 @@ typedef enum {
7c2869
         AFR_TRANSACTION_POST_OP,
7c2869
 } afr_xattrop_type_t;
7c2869
 
7c2869
+static void
7c2869
+afr_lock_resume_shared (struct list_head *list);
7c2869
+
7c2869
+void
7c2869
+__afr_transaction_wake_shared (afr_local_t *local, struct list_head *shared);
7c2869
+
7c2869
+void
7c2869
+afr_changelog_post_op (call_frame_t *frame, xlator_t *this);
7c2869
+
7c2869
+int
7c2869
+afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this);
7c2869
+
7c2869
 gf_boolean_t
7c2869
 afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this);
7c2869
 
7c2869
@@ -168,13 +180,14 @@ afr_transaction_fop (call_frame_t *frame, xlator_t *this)
7c2869
         return 0;
7c2869
 }
7c2869
 
7c2869
-
7c2869
 int
7c2869
 afr_transaction_done (call_frame_t *frame, xlator_t *this)
7c2869
 {
7c2869
-        afr_local_t *local = NULL;
7c2869
-        afr_private_t *priv = NULL;
7c2869
-        gf_boolean_t unwind = _gf_false;
7c2869
+        afr_local_t   *local      = NULL;
7c2869
+        afr_private_t *priv       = NULL;
7c2869
+        gf_boolean_t  unwind      = _gf_false;
7c2869
+        afr_lock_t    *lock       = NULL;
7c2869
+        afr_local_t   *lock_local = NULL;
7c2869
 
7c2869
         priv  = this->private;
7c2869
         local = frame->local;
7c2869
@@ -188,6 +201,31 @@ afr_transaction_done (call_frame_t *frame, xlator_t *this)
7c2869
                 if (unwind)/*It definitely did post-op*/
7c2869
                         afr_zero_fill_stat (local);
7c2869
         }
7c2869
+
7c2869
+        if (local->transaction.do_eager_unlock) {
7c2869
+                lock = &local->inode_ctx->lock[local->transaction.type];
7c2869
+                LOCK (&local->inode->lock);
7c2869
+                {
7c2869
+                        lock->acquired = _gf_false;
7c2869
+                        lock->release = _gf_false;
7c2869
+                        list_splice_init (&lock->frozen,
7c2869
+                                          &lock->waiting);
7c2869
+                        if (list_empty (&lock->waiting))
7c2869
+                                goto unlock;
7c2869
+                        lock_local = list_entry (lock->waiting.next,
7c2869
+                                                 afr_local_t,
7c2869
+                                                transaction.wait_list);
7c2869
+                        list_del_init (&lock_local->transaction.wait_list);
7c2869
+                        list_add (&lock_local->transaction.owner_list,
7c2869
+                                  &lock->owners);
7c2869
+                }
7c2869
+unlock:
7c2869
+                UNLOCK (&local->inode->lock);
7c2869
+        }
7c2869
+        if (lock_local) {
7c2869
+                afr_lock (lock_local->transaction.frame,
7c2869
+                          lock_local->transaction.frame->this);
7c2869
+        }
7c2869
         local->transaction.unwind (frame, this);
7c2869
 
7c2869
         AFR_STACK_DESTROY (frame);
7c2869
@@ -195,6 +233,52 @@ afr_transaction_done (call_frame_t *frame, xlator_t *this)
7c2869
         return 0;
7c2869
 }
7c2869
 
7c2869
+static void
7c2869
+afr_lock_fail_shared (afr_local_t *local, struct list_head *list)
7c2869
+{
7c2869
+        afr_local_t *each = NULL;
7c2869
+        /* Fail every txn queued on 'list' with local's errno and complete it. */
7c2869
+        while (!list_empty(list)) {
7c2869
+                each = list_entry (list->next, afr_local_t,
7c2869
+                                   transaction.wait_list);
7c2869
+                list_del_init(&each->transaction.wait_list); /* unlink first */
7c2869
+                each->op_ret = -1;
7c2869
+                each->op_errno = local->op_errno; /* inherit the failure cause */
7c2869
+                afr_transaction_done (each->transaction.frame,
7c2869
+                                      each->transaction.frame->this);
7c2869
+        }
7c2869
+}
7c2869
+
7c2869
+static void
7c2869
+afr_handle_lock_acquire_failure (afr_local_t *local, gf_boolean_t locked)
7c2869
+{
7c2869
+        struct list_head shared;
7c2869
+        afr_lock_t *lock = NULL;
7c2869
+        /* Fail this txn and, when eager-lock is on, every txn waiting behind it. */
7c2869
+        if (!local->transaction.eager_lock_on)
7c2869
+                goto out;
7c2869
+
7c2869
+        lock = &local->inode_ctx->lock[local->transaction.type];
7c2869
+        /* Detach the waiters under the inode lock; complete them outside it. */
7c2869
+        INIT_LIST_HEAD (&shared);
7c2869
+        LOCK (&local->inode->lock);
7c2869
+        {
7c2869
+                list_splice_init (&lock->waiting, &shared);
7c2869
+        }
7c2869
+        UNLOCK (&local->inode->lock);
7c2869
+
7c2869
+        afr_lock_fail_shared (local, &shared);
7c2869
+        local->transaction.do_eager_unlock = _gf_true; /* read by afr_transaction_done */
7c2869
+out:
7c2869
+        if (locked) { /* 'locked': caller says locks are held, so unlock first */
7c2869
+                local->internal_lock.lock_cbk = afr_transaction_done; /* presumably run by afr_unlock when done */
7c2869
+                afr_unlock (local->transaction.frame,
7c2869
+                            local->transaction.frame->this);
7c2869
+        } else {
7c2869
+                afr_transaction_done (local->transaction.frame,
7c2869
+                                      local->transaction.frame->this);
7c2869
+        }
7c2869
+}
7c2869
 
7c2869
 call_frame_t*
7c2869
 afr_transaction_detach_fop_frame (call_frame_t *frame)
d1681e
@@ -334,6 +418,7 @@ afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this)
7c2869
         afr_local_t *local = NULL;
7c2869
         afr_private_t *priv = NULL;
7c2869
         int pre_op_sources_count = 0;
7c2869
+        int i = 0;
7c2869
 
7c2869
         priv = this->private;
7c2869
         local = frame->local;
d1681e
@@ -345,11 +430,11 @@ afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this)
7c2869
         /* If arbiter is the only source, do not proceed. */
7c2869
         if (pre_op_sources_count < 2 &&
7c2869
             local->transaction.pre_op_sources[ARBITER_BRICK_INDEX]) {
7c2869
-                local->internal_lock.lock_cbk = afr_transaction_done;
7c2869
                 local->op_ret = -1;
7c2869
                 local->op_errno =  ENOTCONN;
7c2869
-                afr_restore_lk_owner (frame);
7c2869
-                afr_unlock (frame, this);
7c2869
+                for (i = 0; i < priv->child_count; i++)
7c2869
+                        local->transaction.failed_subvols[i] = 1;
7c2869
+                afr_changelog_post_op (frame, this);/*uninherit should happen*/
7c2869
         } else {
7c2869
                 afr_transaction_fop (frame, this);
7c2869
         }
d1681e
@@ -362,14 +447,16 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
7c2869
 {
d1681e
         afr_local_t   *local = NULL;
d1681e
         afr_private_t *priv  = NULL;
d1681e
-        fd_t          *fd    = NULL;
d1681e
         int           i      = 0;
d1681e
         int           ret    = 0;
7c2869
+        int     failure_count = 0;
7c2869
+        struct list_head shared;
7c2869
+        afr_lock_t *lock = NULL;
7c2869
 
7c2869
         local = frame->local;
7c2869
         priv = this->private;
7c2869
-        fd    = local->fd;
7c2869
 
7c2869
+        INIT_LIST_HEAD (&shared);
d1681e
         if (local->transaction.type == AFR_DATA_TRANSACTION &&
d1681e
             !local->transaction.inherited) {
d1681e
                 ret = afr_write_subvol_set (frame, this);
d1681e
@@ -394,22 +481,31 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
7c2869
 		   just now, before OP */
7c2869
 		afr_changelog_pre_op_update (frame, this);
7c2869
 
7c2869
-        /* The wake up needs to happen independent of
7c2869
-           what type of fop arrives here. If it was
7c2869
-           a write, then it has already inherited the
7c2869
-           lock and changelog. If it was not a write,
7c2869
-           then the presumption of the optimization (of
7c2869
-           optimizing for successive write operations)
7c2869
-           fails.
7c2869
-        */
7c2869
-        if (fd)
7c2869
-                afr_delayed_changelog_wake_up (this, fd);
7c2869
+        if (!local->transaction.eager_lock_on ||
7c2869
+            local->transaction.inherited)
7c2869
+                goto fop;
7c2869
+        failure_count = AFR_COUNT (local->transaction.failed_subvols,
7c2869
+                                   priv->child_count);
7c2869
+        if (failure_count == priv->child_count) { /* pre-op failed on every subvolume */
7c2869
+                afr_handle_lock_acquire_failure (local, _gf_true); return 0; /* txn is being finished and its frame destroyed: never fall through to 'fop' */
7c2869
+        } else {
7c2869
+                lock = &local->inode_ctx->lock[local->transaction.type];
7c2869
+                LOCK (&local->inode->lock);
7c2869
+                {
7c2869
+                        lock->acquired = _gf_true;
7c2869
+                        __afr_transaction_wake_shared (local, &shared);
7c2869
+                }
7c2869
+                UNLOCK (&local->inode->lock);
7c2869
+        }
7c2869
+
7c2869
+fop:
7c2869
         if (priv->arbiter_count == 1) {
7c2869
                 afr_txn_arbitrate_fop (frame, this);
7c2869
         } else {
7c2869
                 afr_transaction_fop (frame, this);
7c2869
         }
7c2869
 
7c2869
+        afr_lock_resume_shared (&shared);
7c2869
 	return 0;
7c2869
 }
7c2869
 
d1681e
@@ -486,30 +582,14 @@ afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this)
7c2869
 }
7c2869
 
7c2869
 
7c2869
-afr_inodelk_t*
7c2869
-afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom)
7c2869
-{
7c2869
-        afr_inodelk_t *inodelk = NULL;
7c2869
-        int           i = 0;
7c2869
-
7c2869
-        for (i = 0; int_lock->inodelk[i].domain; i++) {
7c2869
-                inodelk = &int_lock->inodelk[i];
7c2869
-                if (strcmp (dom, inodelk->domain) == 0)
7c2869
-                        return inodelk;
7c2869
-        }
7c2869
-        return NULL;
7c2869
-}
7c2869
-
7c2869
 unsigned char*
7c2869
 afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock)
7c2869
 {
7c2869
         unsigned char *locked_nodes = NULL;
7c2869
-        afr_inodelk_t *inodelk = NULL;
7c2869
         switch (type) {
7c2869
         case AFR_DATA_TRANSACTION:
7c2869
         case AFR_METADATA_TRANSACTION:
7c2869
-                inodelk = afr_get_inodelk (int_lock, int_lock->domain);
7c2869
-                locked_nodes = inodelk->locked_nodes;
7c2869
+                locked_nodes = int_lock->locked_nodes;
7c2869
         break;
7c2869
 
7c2869
         case AFR_ENTRY_TRANSACTION:
d1681e
@@ -834,27 +914,19 @@ afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this)
7c2869
 {
7c2869
 	afr_local_t *local = NULL;
7c2869
 	afr_private_t *priv = NULL;
7c2869
-	fd_t *fd = NULL;
7c2869
+        afr_inode_ctx_t *ctx = NULL;
7c2869
 	int i = 0;
7c2869
 	gf_boolean_t ret = _gf_false;
7c2869
-	afr_fd_ctx_t *fd_ctx = NULL;
7c2869
 	int type = 0;
7c2869
 
7c2869
 	local = frame->local;
7c2869
 	priv = this->private;
7c2869
-	fd = local->fd;
7c2869
+        ctx = local->inode_ctx;
7c2869
 
7c2869
 	type = afr_index_for_transaction_type (local->transaction.type);
7c2869
 	if (type != AFR_DATA_TRANSACTION)
7c2869
 		return !local->transaction.dirtied;
7c2869
 
7c2869
-	if (!fd)
7c2869
-		return !local->transaction.dirtied;
7c2869
-
7c2869
-	fd_ctx = afr_fd_ctx_get (fd, this);
7c2869
-	if (!fd_ctx)
7c2869
-		return _gf_false;
7c2869
-
7c2869
 	if (local->transaction.no_uninherit)
7c2869
 		return _gf_false;
7c2869
 
d1681e
@@ -868,34 +940,34 @@ afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this)
7c2869
 	if (local->transaction.uninherit_done)
7c2869
 		return local->transaction.uninherit_value;
7c2869
 
7c2869
-	LOCK(&fd->lock);
7c2869
+	LOCK(&local->inode->lock);
7c2869
 	{
7c2869
 		for (i = 0; i < priv->child_count; i++) {
7c2869
 			if (local->transaction.pre_op[i] !=
7c2869
-			    fd_ctx->pre_op_done[type][i]) {
7c2869
+			    ctx->pre_op_done[type][i]) {
7c2869
 				ret = !local->transaction.dirtied;
7c2869
 				goto unlock;
7c2869
 			}
7c2869
 		}
7c2869
 
7c2869
-		if (fd_ctx->inherited[type]) {
7c2869
+		if (ctx->inherited[type]) {
7c2869
 			ret = _gf_true;
7c2869
-			fd_ctx->inherited[type]--;
7c2869
-		} else if (fd_ctx->on_disk[type]) {
7c2869
+			ctx->inherited[type]--;
7c2869
+		} else if (ctx->on_disk[type]) {
7c2869
 			ret = _gf_false;
7c2869
-			fd_ctx->on_disk[type]--;
7c2869
+			ctx->on_disk[type]--;
7c2869
 		} else {
7c2869
 			/* ASSERT */
7c2869
 			ret = _gf_false;
7c2869
 		}
7c2869
 
7c2869
-		if (!fd_ctx->inherited[type] && !fd_ctx->on_disk[type]) {
7c2869
+		if (!ctx->inherited[type] && !ctx->on_disk[type]) {
7c2869
 			for (i = 0; i < priv->child_count; i++)
7c2869
-				fd_ctx->pre_op_done[type][i] = 0;
7c2869
+				ctx->pre_op_done[type][i] = 0;
7c2869
 		}
7c2869
 	}
7c2869
 unlock:
7c2869
-	UNLOCK(&fd->lock);
7c2869
+	UNLOCK(&local->inode->lock);
7c2869
 
7c2869
 	local->transaction.uninherit_done = _gf_true;
7c2869
 	local->transaction.uninherit_value = ret;
d1681e
@@ -909,31 +981,21 @@ afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this)
7c2869
 {
7c2869
 	afr_local_t *local = NULL;
7c2869
 	afr_private_t *priv = NULL;
7c2869
-	fd_t *fd = NULL;
7c2869
 	int i = 0;
7c2869
 	gf_boolean_t ret = _gf_false;
7c2869
-	afr_fd_ctx_t *fd_ctx = NULL;
7c2869
 	int type = 0;
7c2869
 
7c2869
 	local = frame->local;
7c2869
 	priv = this->private;
7c2869
-	fd = local->fd;
7c2869
 
7c2869
 	if (local->transaction.type != AFR_DATA_TRANSACTION)
7c2869
 		return _gf_false;
7c2869
 
7c2869
 	type = afr_index_for_transaction_type (local->transaction.type);
7c2869
 
7c2869
-	if (!fd)
7c2869
-		return _gf_false;
7c2869
-
7c2869
-	fd_ctx = afr_fd_ctx_get (fd, this);
7c2869
-	if (!fd_ctx)
7c2869
-		return _gf_false;
7c2869
-
7c2869
-	LOCK(&fd->lock);
7c2869
+	LOCK(&local->inode->lock);
7c2869
 	{
7c2869
-		if (!fd_ctx->on_disk[type]) {
7c2869
+		if (!local->inode_ctx->on_disk[type]) {
7c2869
 			/* nothing to inherit yet */
7c2869
 			ret = _gf_false;
7c2869
 			goto unlock;
d1681e
@@ -941,21 +1003,21 @@ afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this)
7c2869
 
7c2869
 		for (i = 0; i < priv->child_count; i++) {
7c2869
 			if (local->transaction.pre_op[i] !=
7c2869
-			    fd_ctx->pre_op_done[type][i]) {
7c2869
+			    local->inode_ctx->pre_op_done[type][i]) {
7c2869
 				/* either inherit exactly, or don't */
7c2869
 				ret = _gf_false;
7c2869
 				goto unlock;
7c2869
 			}
7c2869
 		}
7c2869
 
7c2869
-		fd_ctx->inherited[type]++;
7c2869
+		local->inode_ctx->inherited[type]++;
7c2869
 
7c2869
 		ret = _gf_true;
7c2869
 
7c2869
 		local->transaction.inherited = _gf_true;
7c2869
 	}
7c2869
 unlock:
7c2869
-	UNLOCK(&fd->lock);
7c2869
+	UNLOCK(&local->inode->lock);
7c2869
 
7c2869
 	return ret;
7c2869
 }
d1681e
@@ -966,22 +1028,16 @@ afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this)
7c2869
 {
7c2869
 	afr_local_t *local = NULL;
7c2869
 	afr_private_t *priv = NULL;
7c2869
-	fd_t *fd = NULL;
7c2869
-	afr_fd_ctx_t *fd_ctx = NULL;
7c2869
 	int i = 0;
7c2869
 	gf_boolean_t ret = _gf_false;
7c2869
 	int type = 0;
7c2869
 
7c2869
 	local = frame->local;
7c2869
 	priv = this->private;
7c2869
-	fd = local->fd;
7c2869
 
7c2869
-	if (!fd)
7c2869
-		return _gf_false;
7c2869
-
7c2869
-	fd_ctx = afr_fd_ctx_get (fd, this);
7c2869
-	if (!fd_ctx)
7c2869
-		return _gf_false;
7c2869
+        if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
7c2869
+            local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION)
7c2869
+                return _gf_false;
7c2869
 
7c2869
 	if (local->transaction.inherited)
7c2869
 		/* was already inherited in afr_changelog_pre_op */
d1681e
@@ -997,26 +1053,26 @@ afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this)
7c2869
 
7c2869
 	ret = _gf_false;
7c2869
 
7c2869
-	LOCK(&fd->lock);
7c2869
+	LOCK(&local->inode->lock);
7c2869
 	{
7c2869
-		if (!fd_ctx->on_disk[type]) {
7c2869
+		if (!local->inode_ctx->on_disk[type]) {
7c2869
 			for (i = 0; i < priv->child_count; i++)
7c2869
-				fd_ctx->pre_op_done[type][i] =
7c2869
+				local->inode_ctx->pre_op_done[type][i] =
7c2869
                                         (!local->transaction.failed_subvols[i]);
7c2869
 		} else {
7c2869
 			for (i = 0; i < priv->child_count; i++)
7c2869
-				if (fd_ctx->pre_op_done[type][i] !=
7c2869
+				if (local->inode_ctx->pre_op_done[type][i] !=
7c2869
 				    (!local->transaction.failed_subvols[i])) {
7c2869
 					local->transaction.no_uninherit = 1;
7c2869
 					goto unlock;
7c2869
 				}
7c2869
 		}
7c2869
-		fd_ctx->on_disk[type]++;
7c2869
+		local->inode_ctx->on_disk[type]++;
7c2869
 
7c2869
 		ret = _gf_true;
7c2869
 	}
7c2869
 unlock:
7c2869
-	UNLOCK(&fd->lock);
7c2869
+	UNLOCK(&local->inode->lock);
7c2869
 
7c2869
 	return ret;
7c2869
 }
d1681e
@@ -1324,6 +1380,9 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
7c2869
 
7c2869
         afr_init_optimistic_changelog_for_txn (this, local);
7c2869
 
7c2869
+        if (afr_changelog_pre_op_inherit (frame, this))
7c2869
+                goto next;
7c2869
+
7c2869
         /* This condition should not be met with present code, as
7c2869
          * transaction.done will be called if locks are not acquired on even a
7c2869
          * single node.
d1681e
@@ -1349,9 +1408,6 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
7c2869
 		goto err;
7c2869
 	}
7c2869
 
7c2869
-	if (afr_changelog_pre_op_inherit (frame, this))
7c2869
-		goto next;
7c2869
-
7c2869
         if (call_count < priv->child_count)
7c2869
                 pre_nop = _gf_false;
7c2869
 
d1681e
@@ -1408,7 +1464,7 @@ err:
7c2869
 	local->op_ret = -1;
7c2869
 	local->op_errno = op_errno;
7c2869
 
7c2869
-	afr_unlock (frame, this);
7c2869
+        afr_handle_lock_acquire_failure (local, _gf_true);
7c2869
 
7c2869
 	if (xdata_req)
7c2869
 		dict_unref (xdata_req);
d1681e
@@ -1418,31 +1474,6 @@ err:
7c2869
 
7c2869
 
7c2869
 int
7c2869
-afr_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
7c2869
-{
7c2869
-        afr_internal_lock_t *int_lock = NULL;
7c2869
-        afr_local_t         *local    = NULL;
7c2869
-
7c2869
-        local    = frame->local;
7c2869
-        int_lock = &local->internal_lock;
7c2869
-
7c2869
-        if (int_lock->lock_op_ret < 0) {
7c2869
-                gf_msg (this->name, GF_LOG_INFO,
7c2869
-                        0, AFR_MSG_BLOCKING_LKS_FAILED,
7c2869
-                        "Blocking inodelks failed.");
7c2869
-                afr_transaction_done (frame, this);
7c2869
-        } else {
7c2869
-
7c2869
-                gf_msg_debug (this->name, 0,
7c2869
-                              "Blocking inodelks done. Proceeding to FOP");
7c2869
-                afr_internal_lock_finish (frame, this);
7c2869
-        }
7c2869
-
7c2869
-        return 0;
7c2869
-}
7c2869
-
7c2869
-
7c2869
-int
7c2869
 afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
7c2869
 {
7c2869
         afr_internal_lock_t *int_lock = NULL;
d1681e
@@ -1455,7 +1486,7 @@ afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
7c2869
         if (int_lock->lock_op_ret < 0) {
7c2869
                 gf_msg_debug (this->name, 0,
7c2869
                               "Non blocking inodelks failed. Proceeding to blocking");
7c2869
-                int_lock->lock_cbk = afr_post_blocking_inodelk_cbk;
7c2869
+                int_lock->lock_cbk = afr_internal_lock_finish;
7c2869
                 afr_blocking_lock (frame, this);
7c2869
         } else {
7c2869
 
d1681e
@@ -1469,31 +1500,6 @@ afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
7c2869
 
7c2869
 
7c2869
 int
7c2869
-afr_post_blocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
7c2869
-{
7c2869
-        afr_internal_lock_t *int_lock = NULL;
7c2869
-        afr_local_t         *local    = NULL;
7c2869
-
7c2869
-        local    = frame->local;
7c2869
-        int_lock = &local->internal_lock;
7c2869
-
7c2869
-        if (int_lock->lock_op_ret < 0) {
7c2869
-                gf_msg (this->name, GF_LOG_INFO, 0,
7c2869
-                        AFR_MSG_BLOCKING_LKS_FAILED,
7c2869
-                        "Blocking entrylks failed.");
7c2869
-                afr_transaction_done (frame, this);
7c2869
-        } else {
7c2869
-
7c2869
-                gf_msg_debug (this->name, 0,
7c2869
-                             "Blocking entrylks done. Proceeding to FOP");
7c2869
-                afr_internal_lock_finish (frame, this);
7c2869
-        }
7c2869
-
7c2869
-        return 0;
7c2869
-}
7c2869
-
7c2869
-
7c2869
-int
7c2869
 afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
7c2869
 {
7c2869
         afr_internal_lock_t *int_lock = NULL;
d1681e
@@ -1506,7 +1512,7 @@ afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
7c2869
         if (int_lock->lock_op_ret < 0) {
7c2869
                 gf_msg_debug (this->name, 0,
7c2869
                               "Non blocking entrylks failed. Proceeding to blocking");
7c2869
-                int_lock->lock_cbk = afr_post_blocking_entrylk_cbk;
7c2869
+                int_lock->lock_cbk = afr_internal_lock_finish;
7c2869
                 afr_blocking_lock (frame, this);
7c2869
         } else {
7c2869
 
d1681e
@@ -1567,29 +1573,28 @@ int
7c2869
 afr_set_transaction_flock (xlator_t *this, afr_local_t *local)
7c2869
 {
7c2869
         afr_internal_lock_t *int_lock = NULL;
7c2869
-        afr_inodelk_t       *inodelk  = NULL;
7c2869
         afr_private_t       *priv     = NULL;
7c2869
 
7c2869
         int_lock = &local->internal_lock;
7c2869
-        inodelk = afr_get_inodelk (int_lock, int_lock->domain);
7c2869
         priv = this->private;
7c2869
 
d1681e
-        if ((priv->arbiter_count || priv->full_lock) &&
d1681e
+        if ((priv->arbiter_count || local->transaction.eager_lock_on ||
d1681e
+             priv->full_lock) &&
7c2869
             local->transaction.type == AFR_DATA_TRANSACTION) {
7c2869
                 /*Lock entire file to avoid network split brains.*/
7c2869
-                inodelk->flock.l_len   = 0;
7c2869
-                inodelk->flock.l_start = 0;
7c2869
+                int_lock->flock.l_len   = 0;
7c2869
+                int_lock->flock.l_start = 0;
7c2869
         } else {
7c2869
-                inodelk->flock.l_len   = local->transaction.len;
7c2869
-                inodelk->flock.l_start = local->transaction.start;
7c2869
+                int_lock->flock.l_len   = local->transaction.len;
7c2869
+                int_lock->flock.l_start = local->transaction.start;
7c2869
         }
7c2869
-        inodelk->flock.l_type  = F_WRLCK;
7c2869
+        int_lock->flock.l_type  = F_WRLCK;
7c2869
 
7c2869
         return 0;
7c2869
 }
7c2869
 
7c2869
 int
7c2869
-afr_lock_rec (call_frame_t *frame, xlator_t *this)
7c2869
+afr_lock (call_frame_t *frame, xlator_t *this)
7c2869
 {
7c2869
         afr_internal_lock_t *int_lock = NULL;
7c2869
         afr_local_t         *local    = NULL;
d1681e
@@ -1630,74 +1635,153 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this)
7c2869
         return 0;
7c2869
 }
7c2869
 
7c2869
+static gf_boolean_t
7c2869
+afr_locals_overlap (afr_local_t *local1, afr_local_t *local2)
7c2869
+{
7c2869
+        uint64_t start1 = local1->transaction.start;
7c2869
+        uint64_t start2 = local2->transaction.start;
7c2869
+        uint64_t end1 = 0;
7c2869
+        uint64_t end2 = 0;
7c2869
+        /* Return true when the inclusive ranges [start, end] of both txns intersect. */
7c2869
+        if (local1->transaction.len)
7c2869
+                end1 = start1 + local1->transaction.len - 1;
7c2869
+        else
7c2869
+                end1 = ULLONG_MAX; /* len == 0: range is treated as open-ended */
7c2869
+
7c2869
+        if (local2->transaction.len)
7c2869
+                end2 = start2 + local2->transaction.len - 1;
7c2869
+        else
7c2869
+                end2 = ULLONG_MAX; /* len == 0: range is treated as open-ended */
7c2869
 
7c2869
-int
7c2869
-afr_lock (call_frame_t *frame, xlator_t *this)
7c2869
+        return ((end1 >= start2) && (end2 >= start1));
7c2869
+}
7c2869
+
7c2869
+gf_boolean_t
7c2869
+afr_has_lock_conflict (afr_local_t *local, gf_boolean_t waitlist_check)
7c2869
 {
7c2869
-        afr_set_lock_number (frame, this);
7c2869
+        afr_local_t     *each = NULL;
7c2869
+        afr_lock_t      *lock = NULL;
7c2869
 
7c2869
-        return afr_lock_rec (frame, this);
7c2869
+        lock = &local->inode_ctx->lock[local->transaction.type];
7c2869
+        /* Report whether 'local' overlaps a txn already queued on this lock.
7c2869
+         * Once the full-file lock is acquired in the eager-lock phase,
7c2869
+         * overlapping writes do not compete for inode locks; the lock is
7c2869
+         * instead handed over to the next writes, so nothing orders them.
7c2869
+         * This can cause inconsistencies in replication.
7c2869
+         * Example:
7c2869
+         * Two overlapping writes w1, w2 are sent in parallel on the same fd
7c2869
+         * in two threads t1, t2.
7c2869
+         * Both threads can execute afr_writev_wind in the following order:
7c2869
+         * t1 winds w1 on brick-0
7c2869
+         * t2 winds w2 on brick-0
7c2869
+         * t2 winds w2 on brick-1
7c2869
+         * t1 winds w1 on brick-1
7c2869
+         *
7c2869
+         * This check makes sure the lock is not handed over between
7c2869
+         * overlapping writes.
7c2869
+         */
7c2869
+        list_for_each_entry (each, &lock->owners, transaction.owner_list) {
7c2869
+                if (afr_locals_overlap (each, local)) {
7c2869
+                        return _gf_true;
7c2869
+                }
7c2869
+        }
7c2869
+        /* Overlap with a still-waiting txn counts only when the caller asks. */
7c2869
+        if (!waitlist_check)
7c2869
+                return _gf_false;
7c2869
+        list_for_each_entry (each, &lock->waiting, transaction.wait_list) {
7c2869
+                if (afr_locals_overlap (each, local)) {
7c2869
+                        return _gf_true;
7c2869
+                }
7c2869
+        }
7c2869
+        return _gf_false;
7c2869
 }
7c2869
 
7c2869
 
7c2869
 /* }}} */
7c2869
-
7c2869
-int
7c2869
-afr_internal_lock_finish (call_frame_t *frame, xlator_t *this)
7c2869
+static void
7c2869
+afr_copy_inodelk_vars (afr_internal_lock_t *dst, afr_internal_lock_t *src,
7c2869
+                       xlator_t *this) /* copy src's inodelk state into dst */
7c2869
 {
7c2869
-        afr_changelog_pre_op (frame, this);
7c2869
+        afr_private_t *priv = this->private; /* needed for child_count below */
7c2869
 
7c2869
-        return 0;
7c2869
+        dst->domain = src->domain; /* pointer is shared, not duplicated */
7c2869
+        dst->flock.l_len  = src->flock.l_len;
7c2869
+        dst->flock.l_start  = src->flock.l_start;
7c2869
+        dst->flock.l_type  = src->flock.l_type;
7c2869
+        dst->lock_count = src->lock_count;
7c2869
+        memcpy (dst->locked_nodes, src->locked_nodes, /* assumes both hold child_count entries - TODO confirm */
7c2869
+                priv->child_count * sizeof (*dst->locked_nodes));
7c2869
 }
7c2869
 
7c2869
-
7c2869
 void
7c2869
-afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this)
7c2869
+__afr_transaction_wake_shared (afr_local_t *local, struct list_head *shared)
7c2869
 {
7c2869
-        afr_local_t    *local = NULL;
7c2869
-        afr_private_t  *priv = NULL;
7c2869
+        gf_boolean_t conflict = _gf_false; /* set once a waiter overlaps an owner */
7c2869
+        afr_local_t *each = NULL; /* NOTE(review): afr_transaction_perform_fop calls this with inode->lock held - confirm for other callers */
7c2869
+        afr_lock_t *lock = &local->inode_ctx->lock[local->transaction.type];
7c2869
 
7c2869
-        /* call this function from any of the related optimizations
7c2869
-           which benefit from delaying post op are enabled, namely:
7c2869
-
7c2869
-           - changelog piggybacking
7c2869
-           - eager locking
7c2869
-        */
7c2869
+        while (!conflict) { /* promote waiters to owners, in queue order */
7c2869
+                if (list_empty (&lock->waiting))
7c2869
+                        return;
7c2869
+                each = list_entry(lock->waiting.next, afr_local_t,
7c2869
+                                  transaction.wait_list);
7c2869
+                if (afr_has_lock_conflict (each, _gf_false)) { /* owners only */
7c2869
+                        conflict = _gf_true;
7c2869
+                }
7c2869
+                if (conflict && !list_empty (&lock->owners))
7c2869
+                        return; /* head waiter stays queued behind the owners */
7c2869
+                afr_copy_inodelk_vars (&each->internal_lock, /* reuse local's lock state */
7c2869
+                                       &local->internal_lock,
7c2869
+                                       each->transaction.frame->this);
7c2869
+                list_move_tail (&each->transaction.wait_list, shared); /* handed back to the caller */
7c2869
+                list_add_tail(&each->transaction.owner_list, &lock->owners);
7c2869
+        }
7c2869
+}
7c2869
 
7c2869
-        priv = this->private;
7c2869
-        if (!priv)
7c2869
-                return;
7c2869
+static void
7c2869
+afr_lock_resume_shared (struct list_head *list)
7c2869
+{
7c2869
+        afr_local_t *each = NULL; /* txn currently being resumed */
7c2869
 
7c2869
-        if (!priv->post_op_delay_secs)
7c2869
-                return;
7c2869
+        while (!list_empty(list)) { /* drain 'list', starting pre-op for each txn */
7c2869
+                each = list_entry(list->next, afr_local_t,
7c2869
+                                  transaction.wait_list);
7c2869
+                list_del_init(&each->transaction.wait_list); /* unlink first */
7c2869
+                afr_changelog_pre_op (each->transaction.frame,
7c2869
+                                      each->transaction.frame->this);
7c2869
+        }
7c2869
+}
7c2869
 
7c2869
-        local = frame->local;
7c2869
-        if (!local)
7c2869
-                return;
7c2869
+int
7c2869
+afr_internal_lock_finish (call_frame_t *frame, xlator_t *this)
7c2869
+{
7c2869
+        afr_local_t *local = frame->local; /* Lock phase is over: go on to pre-op, or fail the txn. */
7c2869
+        afr_lock_t   *lock  = NULL;
7c2869
 
7c2869
-        if (!local->transaction.eager_lock_on)
7c2869
-                return;
7c2869
 
7c2869
-        if (!local->fd)
7c2869
-                return;
7c2869
+        local->internal_lock.lock_cbk = NULL;
7c2869
+        if (!local->transaction.eager_lock_on) { /* plain txn */
7c2869
+                if (local->internal_lock.lock_op_ret < 0) {
7c2869
+                        afr_transaction_done (frame, this); /* locking failed: finish now */
7c2869
+                        return 0;
7c2869
+                }
7c2869
+                afr_changelog_pre_op (frame, this);
7c2869
+        } else { /* eager-lock txn: a failure must also fail the queued waiters */
7c2869
+                lock = &local->inode_ctx->lock[local->transaction.type];
7c2869
+                if (local->internal_lock.lock_op_ret < 0) {
7c2869
+                        afr_handle_lock_acquire_failure (local, _gf_false); /* _gf_false: skip afr_unlock */
7c2869
+                } else {
7c2869
+                        lock->event_generation = local->event_generation; /* record the generation that took the lock */
7c2869
+                        afr_changelog_pre_op (frame, this);
7c2869
+                }
7c2869
+        }
7c2869
 
7c2869
-        if (local->op == GF_FOP_WRITE)
7c2869
-                local->delayed_post_op = _gf_true;
7c2869
+        return 0;
7c2869
 }
7c2869
 
7c2869
 gf_boolean_t
7c2869
-afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this)
7c2869
+afr_are_multiple_fds_opened (afr_local_t *local, xlator_t *this)
7c2869
 {
7c2869
-        afr_fd_ctx_t *fd_ctx = NULL;
7c2869
-
7c2869
-        if (!fd) {
7c2869
-                /* If false is returned, it may keep on taking eager-lock
7c2869
-                 * which may lead to starvation, so return true to avoid that.
7c2869
-                 */
7c2869
-                gf_msg_callingfn (this->name, GF_LOG_ERROR, EBADF,
7c2869
-                                  AFR_MSG_INVALID_ARG, "Invalid fd");
7c2869
-                return _gf_true;
7c2869
-        }
7c2869
         /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock
7c2869
          * is taken mount2 opened the same file, it won't be able to
7c2869
          * perform any data operations until mount1 releases eager-lock.
d1681e
@@ -1705,11 +1789,7 @@ afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this)
7c2869
          * if open-fd-count is > 1
7c2869
          */
7c2869
 
7c2869
-        fd_ctx = afr_fd_ctx_get (fd, this);
7c2869
-        if (!fd_ctx)
7c2869
-                return _gf_true;
7c2869
-
7c2869
-        if (fd_ctx->open_fd_count > 1)
7c2869
+        if (local->inode_ctx->open_fd_count > 1)
7c2869
                 return _gf_true;
7c2869
 
7c2869
         return _gf_false;
d1681e
@@ -1717,24 +1797,45 @@ afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this)
7c2869
 
7c2869
 
7c2869
 gf_boolean_t
7c2869
-is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this)
7c2869
+afr_is_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this,
7c2869
+                                         int delay)
7c2869
 {
7c2869
-        afr_local_t      *local = NULL;
7c2869
-        gf_boolean_t      res = _gf_false;
7c2869
+        afr_local_t  *local = NULL;
7c2869
+        afr_lock_t   *lock  = NULL;
7c2869
+        gf_boolean_t res    = _gf_false;
7c2869
 
7c2869
         local = frame->local;
7c2869
-        if (!local)
7c2869
+        lock = &local->inode_ctx->lock[local->transaction.type];
7c2869
+
7c2869
+        if (!afr_txn_nothing_failed (frame, this)) {
7c2869
+                lock->release = _gf_true;
7c2869
                 goto out;
7c2869
+        }
7c2869
 
7c2869
-        if (!local->delayed_post_op)
7c2869
+        if (afr_are_multiple_fds_opened (local, this)) {
7c2869
+                lock->release = _gf_true;
7c2869
                 goto out;
7c2869
+        }
7c2869
 
7c2869
-        //Mark pending changelog ASAP
7c2869
-        if (!afr_txn_nothing_failed (frame, this))
7c2869
+        if (!list_empty (&lock->owners))
d1681e
+                goto out;
7c2869
+        else
7c2869
+                GF_ASSERT (list_empty (&lock->waiting));
7c2869
+
7c2869
+        if (lock->release) {
7c2869
+                goto out;
7c2869
+        }
7c2869
+
7c2869
+        if (!delay) {
d1681e
                 goto out;
7c2869
+        }
7c2869
 
7c2869
-        if (local->fd && afr_are_multiple_fds_opened (local->fd, this))
7c2869
+        if ((local->op != GF_FOP_WRITE) &&
7c2869
+            (local->op != GF_FOP_FXATTROP)) {
7c2869
+                /*Only allow writes but shard does [f]xattrops on writes, so
7c2869
+                 * they are fine too*/
7c2869
                 goto out;
7c2869
+        }
7c2869
 
7c2869
         res = _gf_true;
7c2869
 out:
d1681e
@@ -1745,50 +1846,61 @@ out:
7c2869
 void
7c2869
 afr_delayed_changelog_wake_up_cbk (void *data)
7c2869
 {
7c2869
-        fd_t           *fd = NULL;
7c2869
+        afr_lock_t  *lock  = NULL;
7c2869
+        afr_local_t *local = data;
7c2869
+        afr_local_t *timer_local = NULL;
7c2869
+        struct list_head shared;
7c2869
 
d1681e
-        fd = data;
d1681e
-
7c2869
-        afr_delayed_changelog_wake_up (THIS, fd);
7c2869
+        INIT_LIST_HEAD (&shared);
7c2869
+        lock = &local->inode_ctx->lock[local->transaction.type];
7c2869
+        LOCK (&local->inode->lock);
7c2869
+        {
7c2869
+                timer_local = list_entry(lock->post_op.next,
7c2869
+                                         afr_local_t,
7c2869
+                                        transaction.owner_list);
7c2869
+                if (list_empty (&lock->owners) && (local == timer_local)) {
7c2869
+                        GF_ASSERT (list_empty (&lock->waiting));
7c2869
+                        /*Last owner*/
7c2869
+                        lock->release = _gf_true;
7c2869
+                        lock->delay_timer = NULL;
7c2869
+                }
7c2869
+        }
7c2869
+        UNLOCK (&local->inode->lock);
7c2869
+        afr_changelog_post_op_now (local->transaction.frame,
7c2869
+                                   local->transaction.frame->this);
7c2869
 }
7c2869
 
7c2869
 
7c2869
 /* SET operation */
7c2869
 int
7c2869
-afr_fd_report_unstable_write (xlator_t *this, fd_t *fd)
7c2869
+afr_fd_report_unstable_write (xlator_t *this, afr_local_t *local)
7c2869
 {
7c2869
-        afr_fd_ctx_t *fdctx = NULL;
7c2869
-
7c2869
-        fdctx = afr_fd_ctx_get (fd, this);
7c2869
-
7c2869
-        LOCK(&fd->lock);
7c2869
+        LOCK(&local->inode->lock);
7c2869
         {
7c2869
-                fdctx->witnessed_unstable_write = _gf_true;
7c2869
+                local->inode_ctx->witnessed_unstable_write = _gf_true;
7c2869
         }
7c2869
-        UNLOCK(&fd->lock);
7c2869
+        UNLOCK(&local->inode->lock);
7c2869
 
7c2869
         return 0;
7c2869
 }
7c2869
 
7c2869
 /* TEST and CLEAR operation */
7c2869
 gf_boolean_t
7c2869
-afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd)
7c2869
+afr_fd_has_witnessed_unstable_write (xlator_t *this, inode_t *inode)
7c2869
 {
7c2869
-        afr_fd_ctx_t *fdctx = NULL;
7c2869
+        afr_inode_ctx_t *ctx = NULL;
7c2869
         gf_boolean_t witness = _gf_false;
7c2869
 
7c2869
-        fdctx = afr_fd_ctx_get (fd, this);
7c2869
-        if (!fdctx)
7c2869
-                return _gf_true;
7c2869
-
7c2869
-        LOCK(&fd->lock);
7c2869
+        LOCK(&inode->lock);
7c2869
         {
7c2869
-                if (fdctx->witnessed_unstable_write) {
7c2869
+                (void)__afr_inode_ctx_get (this, inode, &ctx);
7c2869
+
7c2869
+                if (ctx->witnessed_unstable_write) {
7c2869
                         witness = _gf_true;
7c2869
-                        fdctx->witnessed_unstable_write = _gf_false;
7c2869
+                        ctx->witnessed_unstable_write = _gf_false;
7c2869
                 }
7c2869
         }
7c2869
-        UNLOCK (&fd->lock);
7c2869
+        UNLOCK (&inode->lock);
7c2869
 
7c2869
         return witness;
7c2869
 }
d1681e
@@ -1931,7 +2043,7 @@ afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this)
7c2869
            mark a flag in the fdctx whenever an unstable write is witnessed.
7c2869
            */
7c2869
 
7c2869
-        if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) {
7c2869
+        if (!afr_fd_has_witnessed_unstable_write (this, local->inode)) {
7c2869
                 afr_changelog_post_op_now (frame, this);
7c2869
                 return 0;
7c2869
         }
d1681e
@@ -1949,87 +2061,64 @@ afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this)
7c2869
         return 0;
7c2869
 }
7c2869
 
7c2869
-
7c2869
 void
7c2869
-afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd,
7c2869
-                               call_stub_t *stub)
7c2869
+afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
7c2869
 {
7c2869
-	afr_fd_ctx_t      *fd_ctx = NULL;
7c2869
-	call_frame_t      *prev_frame = NULL;
7c2869
-	struct timespec    delta = {0, };
7c2869
-	afr_private_t     *priv = NULL;
7c2869
-	afr_local_t       *local = NULL;
7c2869
+	struct timespec delta   = {0, };
7c2869
+	afr_private_t   *priv   = NULL;
7c2869
+	afr_local_t     *local  = frame->local;
7c2869
+        afr_lock_t      *lock   = NULL;
7c2869
+        gf_boolean_t    post_op = _gf_true;
7c2869
+        struct list_head  shared;
7c2869
 
7c2869
 	priv = this->private;
7c2869
-
7c2869
-	fd_ctx = afr_fd_ctx_get (fd, this);
7c2869
-	if (!fd_ctx)
7c2869
-                goto out;
7c2869
-
7c2869
 	delta.tv_sec = priv->post_op_delay_secs;
7c2869
 	delta.tv_nsec = 0;
7c2869
 
7c2869
-	pthread_mutex_lock (&fd_ctx->delay_lock);
7c2869
-	{
7c2869
-		prev_frame = fd_ctx->delay_frame;
7c2869
-		fd_ctx->delay_frame = NULL;
7c2869
-		if (fd_ctx->delay_timer)
7c2869
-			gf_timer_call_cancel (this->ctx, fd_ctx->delay_timer);
7c2869
-		fd_ctx->delay_timer = NULL;
7c2869
-		if (!frame)
7c2869
-			goto unlock;
7c2869
-		fd_ctx->delay_timer = gf_timer_call_after (this->ctx, delta,
7c2869
-							   afr_delayed_changelog_wake_up_cbk,
7c2869
-							   fd);
7c2869
-		fd_ctx->delay_frame = frame;
7c2869
-	}
7c2869
-unlock:
7c2869
-	pthread_mutex_unlock (&fd_ctx->delay_lock);
7c2869
-
7c2869
-out:
7c2869
-	if (prev_frame) {
7c2869
-		local = prev_frame->local;
7c2869
-		local->transaction.resume_stub = stub;
7c2869
-		afr_changelog_post_op_now (prev_frame, this);
7c2869
-	} else if (stub) {
7c2869
-		call_resume (stub);
7c2869
-	}
7c2869
-}
7c2869
-
7c2869
-
7c2869
-void
7c2869
-afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
7c2869
-{
7c2869
-        afr_local_t  *local = NULL;
7c2869
-
7c2869
-        local = frame->local;
7c2869
-
7c2869
-        if (is_afr_delayed_changelog_post_op_needed (frame, this))
7c2869
-                afr_delayed_changelog_post_op (this, frame, local->fd, NULL);
7c2869
-        else
7c2869
-                afr_changelog_post_op_safe (frame, this);
7c2869
-}
7c2869
-
7c2869
+        INIT_LIST_HEAD (&shared);
7c2869
+        if (!local->transaction.eager_lock_on)
7c2869
+                goto out;
7c2869
 
7c2869
+        lock = &local->inode_ctx->lock[local->transaction.type];
7c2869
+        LOCK (&local->inode->lock);
7c2869
+	{
7c2869
+                list_del_init (&local->transaction.owner_list);
7c2869
+                list_add (&local->transaction.owner_list, &lock->post_op);
7c2869
+                __afr_transaction_wake_shared (local, &shared);
7c2869
+
7c2869
+                if (!afr_is_delayed_changelog_post_op_needed (frame, this,
7c2869
+                                                              delta.tv_sec)) {
7c2869
+                        if (list_empty (&lock->owners))
7c2869
+                                lock->release = _gf_true;
7c2869
+                        goto unlock;
7c2869
+                }
7c2869
 
7c2869
-/* Wake up the sleeping/delayed post-op, and also register
7c2869
-   a stub to have it resumed after this transaction
7c2869
-   completely finishes.
7c2869
+                GF_ASSERT (lock->delay_timer == NULL);
7c2869
+		lock->delay_timer = gf_timer_call_after (this->ctx, delta,
7c2869
+                                              afr_delayed_changelog_wake_up_cbk,
7c2869
+                                              local);
7c2869
+                if (!lock->delay_timer) {
7c2869
+                        lock->release = _gf_true;
7c2869
+                } else {
7c2869
+                        post_op = _gf_false;
7c2869
+                }
7c2869
 
7c2869
-   The @stub gets saved in @local and gets resumed in
7c2869
-   afr_local_cleanup()
7c2869
-   */
7c2869
-void
7c2869
-afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub)
7c2869
-{
7c2869
-        afr_delayed_changelog_post_op (this, NULL, fd, stub);
7c2869
-}
7c2869
+	}
7c2869
+unlock:
7c2869
+        UNLOCK (&local->inode->lock);
7c2869
 
7c2869
+        if (!list_empty (&shared)) {
7c2869
+                afr_lock_resume_shared (&shared);
7c2869
+        }
7c2869
 
7c2869
-void
7c2869
-afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd)
7c2869
-{
7c2869
-        afr_delayed_changelog_post_op (this, NULL, fd, NULL);
7c2869
+out:
7c2869
+        if (post_op) {
7c2869
+                if (!local->transaction.eager_lock_on || lock->release) {
7c2869
+                        afr_changelog_post_op_safe (frame, this);
7c2869
+                } else {
7c2869
+                        afr_changelog_post_op_now (frame, this);
7c2869
+                }
7c2869
+        }
7c2869
 }
7c2869
 
7c2869
 int
d1681e
@@ -2039,13 +2128,6 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this)
7c2869
 
7c2869
         local    = frame->local;
7c2869
 
7c2869
-        if (local->transaction.eager_lock_on) {
7c2869
-                /* We don't need to retain "local" in the
7c2869
-                   fd list anymore, writes to all subvols
7c2869
-                   are finished by now */
7c2869
-                afr_remove_eager_lock_stub (local);
7c2869
-        }
7c2869
-
7c2869
         afr_restore_lk_owner (frame);
7c2869
 
7c2869
         afr_handle_symmetric_errors (frame, this);
d1681e
@@ -2076,114 +2158,149 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
7c2869
 	local->transaction.failed_subvols[child_index] = 1;
7c2869
 }
7c2869
 
7c2869
-
7c2869
-
7c2869
 static gf_boolean_t
7c2869
-afr_locals_overlap (afr_local_t *local1, afr_local_t *local2)
7c2869
+__need_previous_lock_unlocked (afr_local_t *local)
7c2869
 {
7c2869
-        uint64_t start1 = local1->transaction.start;
7c2869
-        uint64_t start2 = local2->transaction.start;
7c2869
-        uint64_t end1 = 0;
7c2869
-        uint64_t end2 = 0;
7c2869
-
7c2869
-        if (local1->transaction.len)
7c2869
-                end1 = start1 + local1->transaction.len - 1;
7c2869
-        else
7c2869
-                end1 = ULLONG_MAX;
7c2869
+        afr_lock_t      *lock = NULL;
7c2869
 
7c2869
-        if (local2->transaction.len)
7c2869
-                end2 = start2 + local2->transaction.len - 1;
7c2869
-        else
7c2869
-                end2 = ULLONG_MAX;
7c2869
+        if (!local->transaction.eager_lock_on)
7c2869
+                return _gf_true;
7c2869
 
7c2869
-        return ((end1 >= start2) && (end2 >= start1));
7c2869
+        lock = &local->inode_ctx->lock[local->transaction.type];
7c2869
+        if (!lock->acquired)
7c2869
+                return _gf_false;
7c2869
+        if (lock->acquired && lock->event_generation != local->event_generation)
7c2869
+                return _gf_true;
7c2869
+        return _gf_false;
7c2869
 }
7c2869
 
7c2869
 void
7c2869
-afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this)
7c2869
+__afr_eager_lock_handle (afr_local_t *local, gf_boolean_t *take_lock,
7c2869
+                         gf_boolean_t *do_pre_op, afr_local_t **timer_local)
7c2869
 {
7c2869
-        afr_private_t *priv = NULL;
7c2869
-        afr_fd_ctx_t  *fdctx = NULL;
7c2869
-        afr_local_t   *each = NULL;
7c2869
+        afr_lock_t      *lock = NULL;
7c2869
+        afr_local_t     *owner_local = NULL;
7c2869
+        xlator_t        *this = local->transaction.frame->this;
7c2869
 
d1681e
-        priv = this->private;
d1681e
-
d1681e
-        if (!local->fd)
d1681e
-                return;
d1681e
-
7c2869
-        if (local->transaction.type != AFR_DATA_TRANSACTION)
7c2869
-                return;
7c2869
+        if (local->fd && !afr_are_multiple_fds_opened (local, this)) {
7c2869
+                local->transaction.eager_lock_on = _gf_true;
7c2869
+        }
7c2869
 
7c2869
-        if (!priv->eager_lock)
7c2869
-                return;
7c2869
+        lock = &local->inode_ctx->lock[local->transaction.type];
7c2869
+        if (__need_previous_lock_unlocked (local)) {
7c2869
+                if (!list_empty (&lock->owners)) {
7c2869
+                        lock->release = _gf_true;
7c2869
+                } else if (lock->delay_timer) {
7c2869
+                        lock->release = _gf_true;
7c2869
+                        if (gf_timer_call_cancel (this->ctx,
7c2869
+                                                  lock->delay_timer)) {
7c2869
+                                /* It will be put in frozen list
7c2869
+                                 * in the code flow below*/
7c2869
+                        } else {
7c2869
+                                *timer_local = list_entry(lock->post_op.next,
7c2869
+                                                          afr_local_t,
7c2869
+                                                        transaction.owner_list);
7c2869
+                                lock->delay_timer = NULL;
7c2869
+                        }
7c2869
+                }
7c2869
+                if (!local->transaction.eager_lock_on)
7c2869
+                        goto out;
7c2869
+        }
7c2869
 
7c2869
-        fdctx = afr_fd_ctx_get (local->fd, this);
7c2869
-        if (!fdctx)
7c2869
-                return;
7c2869
+        if (lock->release) {
7c2869
+                list_add_tail (&local->transaction.wait_list,
7c2869
+                               &lock->frozen);
7c2869
+                *take_lock = _gf_false;
7c2869
+                goto out;
7c2869
+        }
7c2869
 
7c2869
-        if (afr_are_multiple_fds_opened (local->fd, this))
7c2869
-                return;
7c2869
-        /*
7c2869
-         * Once full file lock is acquired in eager-lock phase, overlapping
7c2869
-         * writes do not compete for inode-locks, instead are transferred to the
7c2869
-         * next writes. Because of this overlapping writes are not ordered.
7c2869
-         * This can cause inconsistencies in replication.
7c2869
-         * Example:
7c2869
-         * Two overlapping writes w1, w2 are sent in parallel on same fd
7c2869
-         * in two threads t1, t2.
7c2869
-         * Both threads can execute afr_writev_wind in the following manner.
7c2869
-         * t1 winds w1 on brick-0
7c2869
-         * t2 winds w2 on brick-0
7c2869
-         * t2 winds w2 on brick-1
7c2869
-         * t1 winds w1 on brick-1
7c2869
-         *
7c2869
-         * This check makes sure the locks are not transferred for
7c2869
-         * overlapping writes.
7c2869
-         */
7c2869
-        LOCK (&local->fd->lock);
7c2869
-        {
7c2869
-                list_for_each_entry (each, &fdctx->eager_locked,
7c2869
-                                     transaction.eager_locked) {
7c2869
-                        if (afr_locals_overlap (each, local)) {
7c2869
-                                local->transaction.eager_lock_on = _gf_false;
7c2869
-                                goto unlock;
7c2869
-                        }
7c2869
+        if (lock->delay_timer) {
7c2869
+                *take_lock = _gf_false;
7c2869
+                if (gf_timer_call_cancel (this->ctx,
7c2869
+                                          lock->delay_timer)) {
7c2869
+                        list_add_tail (&local->transaction.wait_list,
7c2869
+                                       &lock->frozen);
7c2869
+                } else {
7c2869
+                        *timer_local = list_entry(lock->post_op.next,
7c2869
+                                                  afr_local_t,
7c2869
+                                                  transaction.owner_list);
7c2869
+                        afr_copy_inodelk_vars (&local->internal_lock,
7c2869
+                                               &(*timer_local)->internal_lock,
7c2869
+                                               this);
7c2869
+                        lock->delay_timer = NULL;
7c2869
+                        *do_pre_op = _gf_true;
7c2869
+                        list_add_tail (&local->transaction.owner_list,
7c2869
+                                       &lock->owners);
7c2869
                 }
7c2869
+                goto out;
7c2869
+        }
7c2869
 
7c2869
-                local->transaction.eager_lock_on = _gf_true;
7c2869
-                list_add_tail (&local->transaction.eager_locked,
7c2869
-                               &fdctx->eager_locked);
7c2869
+        if (!list_empty (&lock->owners)) {
7c2869
+                if (!lock->acquired ||
7c2869
+                    afr_has_lock_conflict (local, _gf_true)) {
7c2869
+                        list_add_tail (&local->transaction.wait_list,
7c2869
+                                       &lock->waiting);
7c2869
+                        *take_lock = _gf_false;
7c2869
+                        goto out;
7c2869
+                }
7c2869
+                owner_local = list_entry (lock->owners.next,
7c2869
+                                          afr_local_t,
7c2869
+                                          transaction.owner_list);
7c2869
+                afr_copy_inodelk_vars (&local->internal_lock,
7c2869
+                                       &owner_local->internal_lock,
7c2869
+                                       this);
7c2869
+                *take_lock = _gf_false;
7c2869
+                *do_pre_op = _gf_true;
7c2869
         }
7c2869
-unlock:
7c2869
-        UNLOCK (&local->fd->lock);
7c2869
+
7c2869
+        if (lock->acquired)
7c2869
+                GF_ASSERT (!(*take_lock));
7c2869
+        list_add_tail (&local->transaction.owner_list, &lock->owners);
7c2869
+out:
7c2869
+        return;
7c2869
 }
7c2869
 
7c2869
 void
7c2869
-afr_transaction_start (call_frame_t *frame, xlator_t *this)
7c2869
+afr_transaction_start (afr_local_t *local, xlator_t *this)
7c2869
 {
7c2869
-        afr_local_t   *local = frame->local;
7c2869
-        fd_t          *fd    = NULL;
7c2869
+        afr_private_t   *priv = NULL;
7c2869
+        gf_boolean_t    take_lock  = _gf_true;
7c2869
+        gf_boolean_t    do_pre_op  = _gf_false;
7c2869
+        afr_local_t     *timer_local = NULL;
7c2869
 
7c2869
-        afr_transaction_eager_lock_init (local, this);
7c2869
+        priv = this->private;
7c2869
 
7c2869
-        if (local->fd && local->transaction.eager_lock_on)
7c2869
-                afr_set_lk_owner (frame, this, local->fd);
7c2869
-        else
7c2869
-                afr_set_lk_owner (frame, this, frame->root);
7c2869
+        if (local->transaction.type != AFR_DATA_TRANSACTION &&
7c2869
+            local->transaction.type != AFR_METADATA_TRANSACTION)
7c2869
+                goto lock_phase;
7c2869
 
7c2869
-        if (!local->transaction.eager_lock_on && local->loc.inode) {
7c2869
-                fd = fd_lookup (local->loc.inode, frame->root->pid);
7c2869
-                if (fd == NULL)
7c2869
-                        fd = fd_lookup_anonymous (local->loc.inode,
7c2869
-                                                  GF_ANON_FD_FLAGS);
7c2869
+        if (!priv->eager_lock)
7c2869
+                goto lock_phase;
7c2869
 
7c2869
-                if (fd) {
7c2869
-                        afr_delayed_changelog_wake_up (this, fd);
7c2869
-                        fd_unref (fd);
7c2869
-                }
7c2869
+        LOCK (&local->inode->lock);
7c2869
+        {
7c2869
+                __afr_eager_lock_handle (local, &take_lock, &do_pre_op,
7c2869
+                                         &timer_local);
7c2869
         }
7c2869
+        UNLOCK (&local->inode->lock);
7c2869
+lock_phase:
7c2869
+        if (!local->transaction.eager_lock_on) {
7c2869
+                afr_set_lk_owner (local->transaction.frame, this,
7c2869
+                                  local->transaction.frame->root);
7c2869
+        } else {
7c2869
+                afr_set_lk_owner (local->transaction.frame, this, local->inode);
7c2869
+        }
7c2869
+
7c2869
 
7c2869
-        afr_lock (frame, this);
7c2869
+        if (take_lock) {
7c2869
+                afr_lock (local->transaction.frame, this);
7c2869
+        } else if (do_pre_op) {
7c2869
+                afr_changelog_pre_op (local->transaction.frame, this);
7c2869
+        }
7c2869
+        /*Always call delayed_changelog_wake_up_cbk after calling pre-op above
7c2869
+         * so that any inheriting can happen*/
7c2869
+        if (timer_local)
7c2869
+                afr_delayed_changelog_wake_up_cbk (timer_local);
7c2869
 }
7c2869
 
7c2869
 int
d1681e
@@ -2196,7 +2313,7 @@ afr_write_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
7c2869
                 goto fail;
7c2869
         }
7c2869
 
7c2869
-        afr_transaction_start (frame, this);
7c2869
+        afr_transaction_start (local, this);
7c2869
         return 0;
7c2869
 fail:
7c2869
         local->transaction.unwind (frame, this);
d1681e
@@ -2214,6 +2331,7 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
7c2869
 
7c2869
         local = frame->local;
7c2869
         priv  = this->private;
7c2869
+        local->transaction.frame = frame;
7c2869
 
7c2869
         local->transaction.type   = type;
7c2869
 
d1681e
@@ -2226,11 +2344,9 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
7c2869
         if (ret < 0)
7c2869
                 goto out;
7c2869
 
d1681e
-        if (type == AFR_ENTRY_TRANSACTION ||
d1681e
-            type == AFR_ENTRY_RENAME_TRANSACTION) {
7c2869
-                afr_transaction_start (frame, this);
7c2869
-                ret = 0;
7c2869
-                goto out;
d1681e
+
d1681e
+        if (type != AFR_METADATA_TRANSACTION) {
7c2869
+                goto txn_start;
7c2869
         }
7c2869
 
7c2869
         ret = afr_inode_get_readable (frame, local->inode, this,
d1681e
@@ -2240,10 +2356,13 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
7c2869
                                                   event_generation)) {
7c2869
                 afr_inode_refresh (frame, this, local->inode, local->loc.gfid,
7c2869
                                    afr_write_txn_refresh_done);
7c2869
-        } else {
7c2869
-                afr_transaction_start (frame, this);
7c2869
+                ret = 0;
7c2869
+                goto out;
7c2869
         }
7c2869
+
7c2869
+txn_start:
7c2869
         ret = 0;
7c2869
+        afr_transaction_start (local, this);
7c2869
 out:
7c2869
         return ret;
7c2869
 }
7c2869
diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h
7c2869
index ddcb1eb..a27e9a3 100644
7c2869
--- a/xlators/cluster/afr/src/afr-transaction.h
7c2869
+++ b/xlators/cluster/afr/src/afr-transaction.h
7c2869
@@ -17,12 +17,6 @@ void
7c2869
 afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
7c2869
 			    int child_index);
7c2869
 
7c2869
-int
7c2869
-afr_lock_server_count (afr_private_t *priv, afr_transaction_type type);
7c2869
-
7c2869
-afr_inodelk_t*
7c2869
-afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom);
7c2869
-
7c2869
 int32_t
7c2869
 afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type);
7c2869
 
7c2869
@@ -30,9 +24,6 @@ int
7c2869
 afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending);
7c2869
 
7c2869
 void
7c2869
-afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this);
7c2869
-
7c2869
-void
7c2869
 afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd);
7c2869
 
7c2869
 void
7c2869
@@ -57,4 +48,8 @@ afr_pick_error_xdata (afr_local_t *local, afr_private_t *priv,
7c2869
                       inode_t *inode2, unsigned char *readable2);
7c2869
 int
7c2869
 afr_transaction_resume (call_frame_t *frame, xlator_t *this);
7c2869
+int
7c2869
+afr_lock (call_frame_t *frame, xlator_t *this);
7c2869
+void
7c2869
+afr_delayed_changelog_wake_up_cbk (void *data);
7c2869
 #endif /* __TRANSACTION_H__ */
7c2869
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
d1681e
index 5ff57c0..6be59dc 100644
7c2869
--- a/xlators/cluster/afr/src/afr.h
7c2869
+++ b/xlators/cluster/afr/src/afr.h
d1681e
@@ -230,19 +230,12 @@ int
7c2869
 afr_entry_lockee_cmp (const void *l1, const void *l2);
7c2869
 
7c2869
 typedef struct {
7c2869
-        char    *domain; /* Domain on which inodelk is taken */
7c2869
-        struct gf_flock flock;
7c2869
-        unsigned char *locked_nodes;
7c2869
-        int32_t lock_count;
7c2869
-} afr_inodelk_t;
7c2869
-
7c2869
-typedef struct {
7c2869
         loc_t *lk_loc;
7c2869
 
7c2869
         int                     lockee_count;
7c2869
         afr_entry_lockee_t      lockee[AFR_LOCKEE_COUNT_MAX];
7c2869
 
7c2869
-        afr_inodelk_t       inodelk[AFR_DOM_COUNT_MAX];
7c2869
+        struct gf_flock flock;
7c2869
         const char *lk_basename;
7c2869
         const char *lower_basename;
7c2869
         const char *higher_basename;
d1681e
@@ -255,7 +248,6 @@ typedef struct {
7c2869
         int32_t lock_count;
7c2869
         int32_t entrylk_lock_count;
7c2869
 
7c2869
-        uint64_t lock_number;
7c2869
         int32_t lk_call_count;
7c2869
         int32_t lk_expected_count;
7c2869
         int32_t lk_attempted_count;
d1681e
@@ -292,37 +284,9 @@ typedef enum {
7c2869
 } afr_fd_open_status_t;
7c2869
 
7c2869
 typedef struct {
7c2869
-        unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS];
7c2869
-	int inherited[AFR_NUM_CHANGE_LOGS];
7c2869
-	int on_disk[AFR_NUM_CHANGE_LOGS];
7c2869
         afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */
7c2869
-
7c2869
-        unsigned int *lock_piggyback;
7c2869
-        unsigned int *lock_acquired;
7c2869
-
7c2869
         int flags;
7c2869
 
7c2869
-	/* used for delayed-post-op optimization */
7c2869
-	pthread_mutex_t    delay_lock;
7c2869
-	gf_timer_t        *delay_timer;
7c2869
-	call_frame_t      *delay_frame;
7c2869
-
7c2869
-	/* set if any write on this fd was a non stable write
7c2869
-	   (i.e, without O_SYNC or O_DSYNC)
7c2869
-	*/
7c2869
-	gf_boolean_t      witnessed_unstable_write;
7c2869
-
7c2869
-	/* @open_fd_count:
7c2869
-	   Number of open FDs queried from the server, as queried through
7c2869
-	   xdata in FOPs. Currently, used to decide if eager-locking must be
7c2869
-	   temporarily disabled.
7c2869
-	*/
7c2869
-        uint32_t        open_fd_count;
7c2869
-
7c2869
-
7c2869
-	/* list of frames currently in progress */
7c2869
-	struct list_head  eager_locked;
7c2869
-
7c2869
 	/* the subvolume on which the latest sequence of readdirs (starting
7c2869
 	   at offset 0) has begun. Till the next readdir request with 0 offset
7c2869
 	   arrives, we continue to read off this subvol.
d1681e
@@ -336,6 +300,20 @@ typedef enum {
7c2869
         AFR_FOP_LOCK_QUORUM_FAILED,
7c2869
 } afr_fop_lock_state_t;
7c2869
 
7c2869
+typedef struct _afr_inode_lock_t {
7c2869
+        unsigned int event_generation;
7c2869
+        gf_boolean_t    release;
7c2869
+        gf_boolean_t    acquired;
7c2869
+        gf_timer_t        *delay_timer;
7c2869
+        struct list_head  owners; /*Transactions that are performing fop*/
7c2869
+        struct list_head  post_op;/*Transactions that are done with the fop
7c2869
+                                   *So can not conflict with the fops*/
7c2869
+        struct list_head waiting;/*Transaction that are waiting for
7c2869
+                                   *conflicting transactions to complete*/
7c2869
+        struct list_head frozen;/*Transactions that need to go as part of
7c2869
+                                 * next batch of eager-lock*/
7c2869
+} afr_lock_t;
7c2869
+
d1681e
 typedef struct _afr_inode_ctx {
d1681e
         uint64_t        read_subvol;
d1681e
         uint64_t        write_subvol;
d1681e
@@ -343,6 +321,23 @@ typedef struct _afr_inode_ctx {
d1681e
         int             spb_choice;
d1681e
         gf_timer_t      *timer;
d1681e
         gf_boolean_t    need_refresh;
7c2869
+        unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS];
7c2869
+        int inherited[AFR_NUM_CHANGE_LOGS];
7c2869
+        int on_disk[AFR_NUM_CHANGE_LOGS];
7c2869
+
7c2869
+        /* set if any write on this fd was a non stable write
7c2869
+           (i.e, without O_SYNC or O_DSYNC)
7c2869
+        */
7c2869
+        gf_boolean_t      witnessed_unstable_write;
7c2869
+
7c2869
+        /* @open_fd_count:
7c2869
+           Number of open FDs queried from the server, as queried through
7c2869
+           xdata in FOPs. Currently, used to decide if eager-locking must be
7c2869
+           temporarily disabled.
7c2869
+        */
7c2869
+        uint32_t        open_fd_count;
7c2869
+        /*Only 2 types of transactions support eager-locks now. DATA/METADATA*/
7c2869
+        afr_lock_t lock[2];
d1681e
 } afr_inode_ctx_t;
d1681e
 
d1681e
 
d1681e
@@ -457,7 +452,6 @@ typedef struct _afr_local {
7c2869
         dict_t  *dict;
7c2869
 
7c2869
         int      optimistic_change_log;
7c2869
-	gf_boolean_t      delayed_post_op;
7c2869
 
7c2869
 	/* Is the current writev() going to perform a stable write?
7c2869
 	   i.e, is fd->flags or @flags writev param have O_SYNC or
d1681e
@@ -693,7 +687,7 @@ typedef struct _afr_local {
7c2869
                 off_t start, len;
7c2869
 
7c2869
                 gf_boolean_t    eager_lock_on;
7c2869
-                int *eager_lock;
7c2869
+                gf_boolean_t    do_eager_unlock;
7c2869
 
7c2869
                 char *basename;
7c2869
                 char *new_basename;
d1681e
@@ -707,7 +701,8 @@ typedef struct _afr_local {
7c2869
 		   of the transaction frame */
7c2869
 		call_stub_t      *resume_stub;
7c2869
 
7c2869
-		struct list_head  eager_locked;
7c2869
+		struct list_head  owner_list;
7c2869
+                struct list_head  wait_list;
7c2869
 
7c2869
                 unsigned char   *pre_op;
7c2869
 
d1681e
@@ -768,7 +763,8 @@ typedef struct _afr_local {
7c2869
 		*/
7c2869
 		afr_changelog_resume_t changelog_resume;
7c2869
 
7c2869
-                call_frame_t *main_frame;
7c2869
+                call_frame_t *main_frame; /*Fop frame*/
7c2869
+                call_frame_t *frame; /*Transaction frame*/
7c2869
 
7c2869
                 int (*wind) (call_frame_t *frame, xlator_t *this, int subvol);
7c2869
 
d1681e
@@ -1009,7 +1005,7 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd);
7c2869
 		afr_local_cleanup (frame->local, THIS);		       \
7c2869
 		mem_put (frame->local);				       \
7c2869
 		frame->local = NULL; };				       \
7c2869
-	frame->local;})
7c2869
+	frame->local; })
7c2869
 
7c2869
 #define AFR_STACK_RESET(frame)                                         \
7c2869
         do {                                                           \
d1681e
@@ -1096,22 +1092,10 @@ afr_filter_xattrs (dict_t *xattr);
7c2869
 #define AFR_QUORUM_AUTO INT_MAX
7c2869
 
7c2869
 int
7c2869
-afr_fd_report_unstable_write (xlator_t *this, fd_t *fd);
7c2869
+afr_fd_report_unstable_write (xlator_t *this, afr_local_t *local);
7c2869
 
7c2869
 gf_boolean_t
7c2869
-afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd);
7c2869
-
7c2869
-void
7c2869
-afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub);
7c2869
-
7c2869
-int
7c2869
-afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count);
7c2869
-
7c2869
-void
7c2869
-afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this);
7c2869
-
7c2869
-void
7c2869
-afr_remove_eager_lock_stub (afr_local_t *local);
7c2869
+afr_fd_has_witnessed_unstable_write (xlator_t *this, inode_t *inode);
7c2869
 
7c2869
 void
d1681e
 afr_reply_wipe (struct afr_reply *reply);
7c2869
-- 
7c2869
1.8.3.1
7c2869