a3470f
From c098fa2192eedbfaad7ac850d0fb152695a3becf Mon Sep 17 00:00:00 2001
a3470f
From: Pranith Kumar K <pkarampu@redhat.com>
a3470f
Date: Sun, 25 Jun 2017 16:34:01 +0530
a3470f
Subject: [PATCH 079/128] cluster/ec: Allow parallel writes in EC if possible
a3470f
a3470f
Problem:
a3470f
EC currently sends modification fops one after another, so if some of
a3470f
the disks become slow for a while, the wait time for the writes that
a3470f
are waiting in the queue becomes really bad.
a3470f
a3470f
Fix:
a3470f
Allow parallel writes when possible. For this we need to make 3 changes.
a3470f
1) Each fop now carries range parameters for the region it will update.
a3470f
2) Xattrop is changed to handle parallel xattrop requests where some
a3470f
   would be modifying just dirty xattr.
a3470f
3) Fops that refer to size now take locks and update the locks.
a3470f
a3470f
upstream patch: https://review.gluster.org/#/c/17625/
a3470f
>Fixes #251
a3470f
>Change-Id: Ibc3c15372f91bbd6fb617f0d99399b3149fa64b2
a3470f
>Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
a3470f
a3470f
Note:
a3470f
There is a delta compared to upstream patch:
a3470f
a3470f
For the "disperse.parallel-writes" key we have reverted the flags
a3470f
to reflect the old type. Added a new OP_VERSION for 3.13.0 in globals.h.
a3470f
a3470f
BUG: 1459101
a3470f
Change-Id: Ibc3c15372f91bbd6fb617f0d99399b3149fa64b2
a3470f
Signed-off-by: Sunil Kumar Acharya <sheggodu@redhat.com>
a3470f
Reviewed-on: https://code.engineering.redhat.com/gerrit/123561
a3470f
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
a3470f
Tested-by: RHGS Build Bot <nigelb@redhat.com>
a3470f
Reviewed-by: Ashish Pandey <aspandey@redhat.com>
a3470f
---
a3470f
 libglusterfs/src/globals.h                      |   4 +-
a3470f
 xlators/cluster/ec/src/ec-common.c              | 191 ++++++++++++++++--------
a3470f
 xlators/cluster/ec/src/ec-common.h              |  10 +-
a3470f
 xlators/cluster/ec/src/ec-dir-read.c            |   6 +-
a3470f
 xlators/cluster/ec/src/ec-generic.c             |  12 +-
a3470f
 xlators/cluster/ec/src/ec-inode-read.c          |  22 ++-
a3470f
 xlators/cluster/ec/src/ec-inode-write.c         | 124 +++++++++------
a3470f
 xlators/cluster/ec/src/ec-types.h               |   8 +-
a3470f
 xlators/cluster/ec/src/ec.c                     |  51 ++++---
a3470f
 xlators/mgmt/glusterd/src/glusterd-volume-set.c |   6 +
a3470f
 10 files changed, 291 insertions(+), 143 deletions(-)
a3470f
a3470f
diff --git a/libglusterfs/src/globals.h b/libglusterfs/src/globals.h
a3470f
index bd7cffe..c627cfe 100644
a3470f
--- a/libglusterfs/src/globals.h
a3470f
+++ b/libglusterfs/src/globals.h
a3470f
@@ -43,7 +43,7 @@
a3470f
  */
a3470f
 #define GD_OP_VERSION_MIN  1 /* MIN is the fresh start op-version, mostly
a3470f
                                 should not change */
a3470f
-#define GD_OP_VERSION_MAX  GD_OP_VERSION_3_12_2 /* MAX VERSION is the maximum
a3470f
+#define GD_OP_VERSION_MAX  GD_OP_VERSION_3_13_0 /* MAX VERSION is the maximum
a3470f
                                                   count in VME table, should
a3470f
                                                   keep changing with
a3470f
                                                   introduction of newer
a3470f
@@ -101,6 +101,8 @@
a3470f
 
a3470f
 #define GD_OP_VERSION_3_12_2   31202 /* Op-version for GlusterFS 3.12.2 */
a3470f
 
a3470f
+#define GD_OP_VERSION_3_13_0   31300 /* Op-version for GlusterFS 3.13.0 */
a3470f
+
a3470f
 #include "xlator.h"
a3470f
 
a3470f
 /* THIS */
a3470f
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
a3470f
index 6963907..f86ecf8 100644
a3470f
--- a/xlators/cluster/ec/src/ec-common.c
a3470f
+++ b/xlators/cluster/ec/src/ec-common.c
a3470f
@@ -25,6 +25,40 @@
a3470f
                                    EC_FLAG_WAITING_DATA_DIRTY |\
a3470f
                                    EC_FLAG_WAITING_METADATA_DIRTY)
a3470f
 
a3470f
+off_t
a3470f
+ec_range_end_get (off_t fl_start, size_t fl_size)
a3470f
+{
a3470f
+        off_t fl_end = 0;
a3470f
+        switch (fl_size) {
a3470f
+        case 0:
a3470f
+                return fl_start;
a3470f
+        case LLONG_MAX: /*Infinity*/
a3470f
+                return LLONG_MAX;
a3470f
+        default:
a3470f
+                fl_end = fl_start + fl_size - 1;
a3470f
+                if (fl_end < 0) /*over-flow*/
a3470f
+                        return LLONG_MAX;
a3470f
+                else
a3470f
+                        return fl_end;
a3470f
+        }
a3470f
+}
a3470f
+
a3470f
+static gf_boolean_t
a3470f
+ec_is_range_conflict (ec_lock_link_t *l1, ec_lock_link_t *l2)
a3470f
+{
a3470f
+        return ((l1->fl_end >= l2->fl_start) && (l2->fl_end >= l1->fl_start));
a3470f
+}
a3470f
+
a3470f
+static gf_boolean_t
a3470f
+ec_lock_conflict (ec_lock_link_t *l1, ec_lock_link_t *l2)
a3470f
+{
a3470f
+        if ((l1->fop->flags & EC_FLAG_LOCK_SHARED) &&
a3470f
+            (l2->fop->flags & EC_FLAG_LOCK_SHARED))
a3470f
+                return _gf_false;
a3470f
+
a3470f
+        return ec_is_range_conflict (l1, l2);
a3470f
+}
a3470f
+
a3470f
 uint32_t
a3470f
 ec_select_first_by_read_policy (ec_t *ec, ec_fop_data_t *fop)
a3470f
 {
a3470f
@@ -724,7 +758,7 @@ int32_t ec_lock_compare(ec_lock_t * lock1, ec_lock_t * lock2)
a3470f
 }
a3470f
 
a3470f
 void ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, uint32_t flags,
a3470f
-                    loc_t *base)
a3470f
+                    loc_t *base, off_t fl_start, size_t fl_size)
a3470f
 {
a3470f
     ec_lock_link_t *link;
a3470f
 
a3470f
@@ -758,12 +792,15 @@ void ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, uint32_t flags,
a3470f
     link->update[EC_DATA_TXN] = (flags & EC_UPDATE_DATA) != 0;
a3470f
     link->update[EC_METADATA_TXN] = (flags & EC_UPDATE_META) != 0;
a3470f
     link->base = base;
a3470f
+    link->fl_start = fl_start;
a3470f
+    link->fl_end = ec_range_end_get (fl_start, fl_size);
a3470f
 
a3470f
     lock->refs_pending++;
a3470f
 }
a3470f
 
a3470f
 void ec_lock_prepare_inode_internal(ec_fop_data_t *fop, loc_t *loc,
a3470f
-                                    uint32_t flags, loc_t *base)
a3470f
+                                    uint32_t flags, loc_t *base,
a3470f
+                                    off_t fl_start, size_t fl_size)
a3470f
 {
a3470f
     ec_lock_t *lock = NULL;
a3470f
     ec_inode_t *ctx;
a3470f
@@ -824,16 +861,17 @@ void ec_lock_prepare_inode_internal(ec_fop_data_t *fop, loc_t *loc,
a3470f
     ctx->inode_lock = lock;
a3470f
 
a3470f
 insert:
a3470f
-    ec_lock_insert(fop, lock, flags, base);
a3470f
+    ec_lock_insert(fop, lock, flags, base, fl_start, fl_size);
a3470f
 update_query:
a3470f
     lock->query |= (flags & EC_QUERY_INFO) != 0;
a3470f
 unlock:
a3470f
     UNLOCK(&loc->inode->lock);
a3470f
 }
a3470f
 
a3470f
-void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags)
a3470f
+void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags,
a3470f
+                           off_t fl_start, size_t fl_size)
a3470f
 {
a3470f
-    ec_lock_prepare_inode_internal(fop, loc, flags, NULL);
a3470f
+    ec_lock_prepare_inode_internal(fop, loc, flags, NULL, fl_start, fl_size);
a3470f
 }
a3470f
 
a3470f
 void ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc, loc_t *base,
a3470f
@@ -859,12 +897,13 @@ void ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc, loc_t *base,
a3470f
             base = NULL;
a3470f
     }
a3470f
 
a3470f
-    ec_lock_prepare_inode_internal(fop, &tmp, flags, base);
a3470f
+    ec_lock_prepare_inode_internal(fop, &tmp, flags, base, 0, LLONG_MAX);
a3470f
 
a3470f
     loc_wipe(&tmp);
a3470f
 }
a3470f
 
a3470f
-void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags)
a3470f
+void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags,
a3470f
+                        off_t fl_start, size_t fl_size)
a3470f
 {
a3470f
     loc_t loc;
a3470f
     int32_t err;
a3470f
@@ -880,7 +919,7 @@ void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags)
a3470f
         return;
a3470f
     }
a3470f
 
a3470f
-    ec_lock_prepare_inode_internal(fop, &loc, flags, NULL);
a3470f
+    ec_lock_prepare_inode_internal(fop, &loc, flags, NULL, fl_start, fl_size);
a3470f
 
a3470f
     loc_wipe(&loc);
a3470f
 }
a3470f
@@ -1314,17 +1353,16 @@ out:
a3470f
     }
a3470f
 }
a3470f
 
a3470f
-gf_boolean_t ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
a3470f
-                               uint64_t *size)
a3470f
+gf_boolean_t
a3470f
+__ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
a3470f
+                    uint64_t *size)
a3470f
 {
a3470f
     ec_inode_t *ctx;
a3470f
     gf_boolean_t found = _gf_false;
a3470f
 
a3470f
-    LOCK(&inode->lock);
a3470f
-
a3470f
     ctx = __ec_inode_get(inode, fop->xl);
a3470f
     if (ctx == NULL) {
a3470f
-        goto unlock;
a3470f
+        goto out;
a3470f
     }
a3470f
 
a3470f
     if (ctx->have_size) {
a3470f
@@ -1332,23 +1370,35 @@ gf_boolean_t ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
a3470f
         found = _gf_true;
a3470f
     }
a3470f
 
a3470f
-unlock:
a3470f
+out:
a3470f
+    return found;
a3470f
+}
a3470f
+
a3470f
+gf_boolean_t
a3470f
+ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
a3470f
+                  uint64_t *size)
a3470f
+{
a3470f
+    gf_boolean_t found = _gf_false;
a3470f
+
a3470f
+    LOCK(&inode->lock);
a3470f
+    {
a3470f
+            found = __ec_get_inode_size (fop, inode, size);
a3470f
+    }
a3470f
     UNLOCK(&inode->lock);
a3470f
 
a3470f
     return found;
a3470f
 }
a3470f
 
a3470f
-gf_boolean_t ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode,
a3470f
-                               uint64_t size)
a3470f
+gf_boolean_t
a3470f
+__ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode,
a3470f
+                    uint64_t size)
a3470f
 {
a3470f
     ec_inode_t *ctx;
a3470f
     gf_boolean_t found = _gf_false;
a3470f
 
a3470f
-    LOCK(&inode->lock);
a3470f
-
a3470f
     ctx = __ec_inode_get(inode, fop->xl);
a3470f
     if (ctx == NULL) {
a3470f
-        goto unlock;
a3470f
+        goto out;
a3470f
     }
a3470f
 
a3470f
     /* Normal fops always have ctx->have_size set. However self-heal calls this
a3470f
@@ -1363,8 +1413,21 @@ gf_boolean_t ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode,
a3470f
 
a3470f
     found = _gf_true;
a3470f
 
a3470f
-unlock:
a3470f
-    UNLOCK(&inode->lock);
a3470f
+out:
a3470f
+    return found;
a3470f
+}
a3470f
+
a3470f
+gf_boolean_t
a3470f
+ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode,
a3470f
+                  uint64_t size)
a3470f
+{
a3470f
+    gf_boolean_t found = _gf_false;
a3470f
+
a3470f
+    LOCK (&inode->lock);
a3470f
+    {
a3470f
+            found = __ec_set_inode_size (fop, inode, size);
a3470f
+    }
a3470f
+    UNLOCK (&inode->lock);
a3470f
 
a3470f
     return found;
a3470f
 }
a3470f
@@ -1471,34 +1534,47 @@ ec_lock_update_fd(ec_lock_t *lock, ec_fop_data_t *fop)
a3470f
     }
a3470f
 }
a3470f
 
a3470f
+static gf_boolean_t
a3470f
+ec_link_has_lock_conflict (ec_lock_link_t *link, struct list_head *owners)
a3470f
+{
a3470f
+        ec_lock_link_t *owner_link = NULL;
a3470f
+        ec_t           *ec = link->fop->xl->private;
a3470f
+
a3470f
+        if (!ec->parallel_writes)
a3470f
+                return _gf_true;
a3470f
+
a3470f
+        list_for_each_entry (owner_link, owners, owner_list) {
a3470f
+                if (ec_lock_conflict (owner_link, link))
a3470f
+                        return _gf_true;
a3470f
+        }
a3470f
+        return _gf_false;
a3470f
+}
a3470f
+
a3470f
 static void
a3470f
 ec_lock_wake_shared(ec_lock_t *lock, struct list_head *list)
a3470f
 {
a3470f
     ec_fop_data_t *fop;
a3470f
     ec_lock_link_t *link;
a3470f
-    gf_boolean_t exclusive = _gf_false;
a3470f
+    gf_boolean_t conflict = _gf_false;
a3470f
 
a3470f
-    while (!exclusive && !list_empty(&lock->waiting)) {
a3470f
+    while (!conflict && !list_empty(&lock->waiting)) {
a3470f
         link = list_entry(lock->waiting.next, ec_lock_link_t, wait_list);
a3470f
         fop = link->fop;
a3470f
 
a3470f
         /* If lock is not acquired, at most one fop can be assigned as owner.
a3470f
          * The following fops will need to wait in the lock->waiting queue
a3470f
          * until the lock has been fully acquired. */
a3470f
-        exclusive = !lock->acquired;
a3470f
+        conflict = !lock->acquired;
a3470f
 
a3470f
         /* If the fop is not shareable, only this fop can be assigned as owner.
a3470f
          * Other fops will need to wait until this one finishes. */
a3470f
-        if ((fop->flags & EC_FLAG_LOCK_SHARED) == 0) {
a3470f
-            exclusive = _gf_true;
a3470f
-
a3470f
-            /* Avoid other requests to be assigned as owners. */
a3470f
-            lock->exclusive = 1;
a3470f
+        if (ec_link_has_lock_conflict (link, &lock->owners)) {
a3470f
+            conflict = _gf_true;
a3470f
         }
a3470f
 
a3470f
         /* If only one fop is allowed, it can be assigned as the owner of the
a3470f
          * lock only if there weren't any other owner. */
a3470f
-        if (exclusive && !list_empty(&lock->owners)) {
a3470f
+        if (conflict && !list_empty(&lock->owners)) {
a3470f
             break;
a3470f
         }
a3470f
 
a3470f
@@ -1565,9 +1641,7 @@ void ec_lock_acquired(ec_lock_link_t *link)
a3470f
     lock->acquired = _gf_true;
a3470f
 
a3470f
     ec_lock_update_fd(lock, fop);
a3470f
-    if ((fop->flags & EC_FLAG_LOCK_SHARED) != 0) {
a3470f
-        ec_lock_wake_shared(lock, &list);
a3470f
-    }
a3470f
+    ec_lock_wake_shared(lock, &list);
a3470f
 
a3470f
     UNLOCK(&lock->loc.inode->lock);
a3470f
 
a3470f
@@ -1678,11 +1752,11 @@ ec_lock_assign_owner(ec_lock_link_t *link)
a3470f
         /* We are trying to acquire a lock that has an unlock timer active.
a3470f
          * This means that the lock must be idle, i.e. no fop can be in the
a3470f
          * owner, waiting or frozen lists. It also means that the lock cannot
a3470f
-         * have been marked as being released (this is done without timers)
a3470f
-         * and it must not be exclusive. There should only be one owner
a3470f
-         * reference, but it's possible that some fops are being prepared to
a3470f
-         * use this lock. */
a3470f
-        GF_ASSERT ((lock->exclusive == 0) && (lock->refs_owners == 1) &&
a3470f
+         * have been marked as being released (this is done without timers).
a3470f
+         * There should only be one owner reference, but it's possible that
a3470f
+         * some fops are being prepared to use this lock.
a3470f
+         */
a3470f
+        GF_ASSERT ((lock->refs_owners == 1) &&
a3470f
                    list_empty(&lock->owners) && list_empty(&lock->waiting));
a3470f
 
a3470f
         /* We take the timer_link before cancelling the timer, since a
a3470f
@@ -1730,13 +1804,15 @@ ec_lock_assign_owner(ec_lock_link_t *link)
a3470f
         lock->timer = NULL;
a3470f
     }
a3470f
 
a3470f
-    lock->exclusive |= (fop->flags & EC_FLAG_LOCK_SHARED) == 0;
a3470f
-
a3470f
     if (!list_empty(&lock->owners)) {
a3470f
         /* There are other owners of this lock. We can only take ownership if
a3470f
-         * the lock is already acquired and can be shared. Otherwise we need
a3470f
-         * to wait. */
a3470f
-        if (!lock->acquired || (lock->exclusive != 0)) {
a3470f
+         * the lock is already acquired and doesn't have conflict with existing
a3470f
+         * owners, or waiters(to prevent starvation).
a3470f
+         * Otherwise we need to wait.
a3470f
+         */
a3470f
+        if (!lock->acquired ||
a3470f
+            ec_link_has_lock_conflict (link, &lock->owners) ||
a3470f
+            ec_link_has_lock_conflict (link, &lock->waiting)) {
a3470f
             ec_trace("LOCK_QUEUE_WAIT", fop, "lock=%p", lock);
a3470f
 
a3470f
             list_add_tail(&link->wait_list, &lock->waiting);
a3470f
@@ -1814,10 +1890,7 @@ ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk,
a3470f
     }
a3470f
     ec_lock_update_good(lock, fop);
a3470f
 
a3470f
-    lock->exclusive -= (fop->flags & EC_FLAG_LOCK_SHARED) == 0;
a3470f
-    if (list_empty(&lock->owners)) {
a3470f
-        ec_lock_wake_shared(lock, &list);
a3470f
-    }
a3470f
+    ec_lock_wake_shared(lock, &list);
a3470f
 
a3470f
     UNLOCK(&lock->loc.inode->lock);
a3470f
 
a3470f
@@ -1871,11 +1944,11 @@ ec_lock_unfreeze(ec_lock_link_t *link)
a3470f
     lock->acquired = _gf_false;
a3470f
 
a3470f
     /* We are unfreezing a lock. This means that the lock has already been
a3470f
-     * released. In this state it shouldn't be exclusive nor have a pending
a3470f
-     * timer nor have any owner, and the waiting list should be empty. Only
a3470f
-     * the frozen list can contain some fop. */
a3470f
-    GF_ASSERT((lock->exclusive == 0) && (lock->timer == NULL) &&
a3470f
-              list_empty(&lock->waiting) && list_empty(&lock->owners));
a3470f
+     * released. In this state it shouldn't have a pending timer nor have any
a3470f
+     * owner, and the waiting list should be empty. Only the frozen list can
a3470f
+     * contain some fop. */
a3470f
+    GF_ASSERT((lock->timer == NULL) && list_empty(&lock->waiting) &&
a3470f
+              list_empty(&lock->owners));
a3470f
 
a3470f
     /* We move all frozen fops to the waiting list. */
a3470f
     list_splice_init(&lock->frozen, &lock->waiting);
a3470f
@@ -2008,7 +2081,7 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
a3470f
     ec_fop_data_t *fop;
a3470f
     ec_lock_t *lock;
a3470f
     ec_inode_t *ctx;
a3470f
-    dict_t * dict;
a3470f
+    dict_t *dict = NULL;
a3470f
     uintptr_t   update_on = 0;
a3470f
 
a3470f
     int32_t err = -ENOMEM;
a3470f
@@ -2198,12 +2271,12 @@ ec_unlock_timer_del(ec_lock_link_t *link)
a3470f
                 ec_trace("UNLOCK_DELAYED", link->fop, "lock=%p", lock);
a3470f
 
a3470f
                 /* The unlock timer has expired without anyone cancelling it.
a3470f
-                 * This means that it shouldn't have any owner, and the
a3470f
-                 * waiting and frozen lists should be empty. It shouldn't have
a3470f
-                 * been marked as release nor be exclusive either. It must have
a3470f
-                 * only one owner reference, but there can be fops being
a3470f
-                 * prepared though. */
a3470f
-                GF_ASSERT(!lock->release && (lock->exclusive == 0) &&
a3470f
+                 * This means that it shouldn't have any owner, and the waiting
a3470f
+                 * and frozen lists should be empty.  It must have only one
a3470f
+                 * owner reference, but there can be fops being prepared
a3470f
+                 * though.
a3470f
+                 * */
a3470f
+                GF_ASSERT(!lock->release &&
a3470f
                           (lock->refs_owners == 1) &&
a3470f
                           list_empty(&lock->owners) &&
a3470f
                           list_empty(&lock->waiting) &&
a3470f
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
a3470f
index 8f5d20a..1a947cc 100644
a3470f
--- a/xlators/cluster/ec/src/ec-common.h
a3470f
+++ b/xlators/cluster/ec/src/ec-common.h
a3470f
@@ -91,18 +91,24 @@ ec_fop_prepare_answer(ec_fop_data_t *fop, gf_boolean_t ro);
a3470f
 gf_boolean_t
a3470f
 ec_cbk_set_error(ec_cbk_data_t *cbk, int32_t error, gf_boolean_t ro);
a3470f
 
a3470f
-void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags);
a3470f
+void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags,
a3470f
+                           off_t fl_start, size_t fl_size);
a3470f
 void ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc, loc_t *base,
a3470f
                                   uint32_t flags);
a3470f
-void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags);
a3470f
+void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags,
a3470f
+                        off_t fl_start, size_t fl_size);
a3470f
 void ec_lock(ec_fop_data_t * fop);
a3470f
 void ec_lock_reuse(ec_fop_data_t *fop);
a3470f
 void ec_unlock(ec_fop_data_t * fop);
a3470f
 
a3470f
 gf_boolean_t ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
a3470f
                                uint64_t *size);
a3470f
+gf_boolean_t __ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
a3470f
+                                 uint64_t *size);
a3470f
 gf_boolean_t ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode,
a3470f
                                uint64_t size);
a3470f
+gf_boolean_t __ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode,
a3470f
+                                 uint64_t size);
a3470f
 void ec_clear_inode_info(ec_fop_data_t *fop, inode_t *inode);
a3470f
 
a3470f
 void ec_flush_size_version(ec_fop_data_t * fop);
a3470f
diff --git a/xlators/cluster/ec/src/ec-dir-read.c b/xlators/cluster/ec/src/ec-dir-read.c
a3470f
index 4fe82e3..48afe54 100644
a3470f
--- a/xlators/cluster/ec/src/ec-dir-read.c
a3470f
+++ b/xlators/cluster/ec/src/ec-dir-read.c
a3470f
@@ -141,7 +141,8 @@ int32_t ec_manager_opendir(ec_fop_data_t * fop, int32_t state)
a3470f
             /* Fall through */
a3470f
 
a3470f
         case EC_STATE_LOCK:
a3470f
-            ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO);
a3470f
+            ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO, 0,
a3470f
+                                  LLONG_MAX);
a3470f
             ec_lock(fop);
a3470f
 
a3470f
             return EC_STATE_DISPATCH;
a3470f
@@ -432,7 +433,8 @@ int32_t ec_manager_readdir(ec_fop_data_t * fop, int32_t state)
a3470f
                 }
a3470f
                 fop->mask &= 1ULL << idx;
a3470f
             } else {
a3470f
-                    ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO);
a3470f
+                    ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0,
a3470f
+                                       LLONG_MAX);
a3470f
                     ec_lock(fop);
a3470f
             }
a3470f
 
a3470f
diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c
a3470f
index ddb90ce..a5f986e 100644
a3470f
--- a/xlators/cluster/ec/src/ec-generic.c
a3470f
+++ b/xlators/cluster/ec/src/ec-generic.c
a3470f
@@ -85,7 +85,7 @@ int32_t ec_manager_flush(ec_fop_data_t * fop, int32_t state)
a3470f
     {
a3470f
         case EC_STATE_INIT:
a3470f
         case EC_STATE_LOCK:
a3470f
-            ec_lock_prepare_fd(fop, fop->fd, 0);
a3470f
+            ec_lock_prepare_fd(fop, fop->fd, 0, 0, LLONG_MAX);
a3470f
             ec_lock(fop);
a3470f
 
a3470f
             return EC_STATE_DISPATCH;
a3470f
@@ -300,7 +300,7 @@ int32_t ec_manager_fsync(ec_fop_data_t * fop, int32_t state)
a3470f
     {
a3470f
         case EC_STATE_INIT:
a3470f
         case EC_STATE_LOCK:
a3470f
-            ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO);
a3470f
+            ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0, LLONG_MAX);
a3470f
             ec_lock(fop);
a3470f
 
a3470f
             return EC_STATE_DISPATCH;
a3470f
@@ -501,7 +501,7 @@ int32_t ec_manager_fsyncdir(ec_fop_data_t * fop, int32_t state)
a3470f
     {
a3470f
         case EC_STATE_INIT:
a3470f
         case EC_STATE_LOCK:
a3470f
-            ec_lock_prepare_fd(fop, fop->fd, 0);
a3470f
+            ec_lock_prepare_fd(fop, fop->fd, 0, 0, LLONG_MAX);
a3470f
             ec_lock(fop);
a3470f
 
a3470f
             return EC_STATE_DISPATCH;
a3470f
@@ -1220,9 +1220,11 @@ int32_t ec_manager_xattrop(ec_fop_data_t * fop, int32_t state)
a3470f
         case EC_STATE_INIT:
a3470f
         case EC_STATE_LOCK:
a3470f
             if (fop->fd == NULL) {
a3470f
-                ec_lock_prepare_inode(fop, &fop->loc[0], EC_UPDATE_META);
a3470f
+                ec_lock_prepare_inode(fop, &fop->loc[0], EC_UPDATE_META, 0,
a3470f
+                                      LLONG_MAX);
a3470f
             } else {
a3470f
-                ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_META);
a3470f
+                ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_META, 0,
a3470f
+                                   LLONG_MAX);
a3470f
             }
a3470f
             ec_lock(fop);
a3470f
 
a3470f
diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c
a3470f
index 829f47f..33fd7f5 100644
a3470f
--- a/xlators/cluster/ec/src/ec-inode-read.c
a3470f
+++ b/xlators/cluster/ec/src/ec-inode-read.c
a3470f
@@ -72,7 +72,8 @@ ec_manager_access(ec_fop_data_t *fop, int32_t state)
a3470f
         switch (state) {
a3470f
         case EC_STATE_INIT:
a3470f
         case EC_STATE_LOCK:
a3470f
-            ec_lock_prepare_inode (fop, &fop->loc[0], EC_QUERY_INFO);
a3470f
+            ec_lock_prepare_inode (fop, &fop->loc[0], EC_QUERY_INFO, 0,
a3470f
+                                   LLONG_MAX);
a3470f
             ec_lock (fop);
a3470f
 
a3470f
             return EC_STATE_DISPATCH;
a3470f
@@ -311,9 +312,11 @@ int32_t ec_manager_getxattr(ec_fop_data_t * fop, int32_t state)
a3470f
                 (strncmp(fop->str[0], GF_XATTR_CLRLK_CMD,
a3470f
                          strlen(GF_XATTR_CLRLK_CMD)) != 0)) {
a3470f
                 if (fop->fd == NULL) {
a3470f
-                    ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO);
a3470f
+                    ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO,
a3470f
+                                          0, LLONG_MAX);
a3470f
                 } else {
a3470f
-                    ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO);
a3470f
+                    ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0,
a3470f
+                                       LLONG_MAX);
a3470f
                 }
a3470f
                 ec_lock(fop);
a3470f
             }
a3470f
@@ -1029,7 +1032,8 @@ int32_t ec_manager_readlink(ec_fop_data_t * fop, int32_t state)
a3470f
     {
a3470f
         case EC_STATE_INIT:
a3470f
         case EC_STATE_LOCK:
a3470f
-            ec_lock_prepare_inode (fop, &fop->loc[0], EC_QUERY_INFO);
a3470f
+            ec_lock_prepare_inode (fop, &fop->loc[0], EC_QUERY_INFO, 0,
a3470f
+                                   LLONG_MAX);
a3470f
             ec_lock (fop);
a3470f
             return EC_STATE_DISPATCH;
a3470f
 
a3470f
@@ -1364,7 +1368,8 @@ int32_t ec_manager_readv(ec_fop_data_t * fop, int32_t state)
a3470f
         /* Fall through */
a3470f
 
a3470f
         case EC_STATE_LOCK:
a3470f
-            ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO);
a3470f
+            ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, fop->offset,
a3470f
+                               fop->size);
a3470f
             ec_lock(fop);
a3470f
 
a3470f
             return EC_STATE_DISPATCH;
a3470f
@@ -1568,7 +1573,7 @@ int32_t ec_manager_seek(ec_fop_data_t *fop, int32_t state)
a3470f
     /* Fall through */
a3470f
 
a3470f
     case EC_STATE_LOCK:
a3470f
-        ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO);
a3470f
+        ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, fop->offset, LLONG_MAX);
a3470f
         ec_lock(fop);
a3470f
 
a3470f
         return EC_STATE_DISPATCH;
a3470f
@@ -1788,9 +1793,10 @@ int32_t ec_manager_stat(ec_fop_data_t * fop, int32_t state)
a3470f
         case EC_STATE_INIT:
a3470f
         case EC_STATE_LOCK:
a3470f
             if (fop->fd == NULL) {
a3470f
-                ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO);
a3470f
+                ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO, 0,
a3470f
+                                      LLONG_MAX);
a3470f
             } else {
a3470f
-                ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO);
a3470f
+                ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0, LLONG_MAX);
a3470f
             }
a3470f
             ec_lock(fop);
a3470f
 
a3470f
diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c
a3470f
index 3ed9b2a..e6a67cf 100644
a3470f
--- a/xlators/cluster/ec/src/ec-inode-write.c
a3470f
+++ b/xlators/cluster/ec/src/ec-inode-write.c
a3470f
@@ -127,10 +127,12 @@ ec_manager_xattr (ec_fop_data_t *fop, int32_t state)
a3470f
         case EC_STATE_LOCK:
a3470f
                 if (fop->fd == NULL) {
a3470f
                         ec_lock_prepare_inode(fop, &fop->loc[0],
a3470f
-                                              EC_UPDATE_META | EC_QUERY_INFO);
a3470f
+                                              EC_UPDATE_META | EC_QUERY_INFO,
a3470f
+                                              0, LLONG_MAX);
a3470f
                 } else {
a3470f
                         ec_lock_prepare_fd(fop, fop->fd,
a3470f
-                                           EC_UPDATE_META | EC_QUERY_INFO);
a3470f
+                                           EC_UPDATE_META | EC_QUERY_INFO,
a3470f
+                                           0, LLONG_MAX);
a3470f
                 }
a3470f
                 ec_lock(fop);
a3470f
 
a3470f
@@ -369,10 +371,11 @@ int32_t ec_manager_setattr(ec_fop_data_t * fop, int32_t state)
a3470f
         case EC_STATE_LOCK:
a3470f
             if (fop->fd == NULL) {
a3470f
                 ec_lock_prepare_inode(fop, &fop->loc[0],
a3470f
-                                      EC_UPDATE_META | EC_QUERY_INFO);
a3470f
+                                      EC_UPDATE_META | EC_QUERY_INFO,
a3470f
+                                      0, LLONG_MAX);
a3470f
             } else {
a3470f
-                ec_lock_prepare_fd(fop, fop->fd,
a3470f
-                                   EC_UPDATE_META | EC_QUERY_INFO);
a3470f
+                ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_META | EC_QUERY_INFO,
a3470f
+                                   0, LLONG_MAX);
a3470f
             }
a3470f
             ec_lock(fop);
a3470f
 
a3470f
@@ -879,8 +882,8 @@ int32_t ec_manager_fallocate(ec_fop_data_t *fop, int32_t state)
a3470f
 
a3470f
     case EC_STATE_LOCK:
a3470f
         ec_lock_prepare_fd(fop, fop->fd,
a3470f
-                           EC_UPDATE_DATA | EC_UPDATE_META |
a3470f
-                           EC_QUERY_INFO);
a3470f
+                           EC_UPDATE_DATA | EC_UPDATE_META | EC_QUERY_INFO,
a3470f
+                           fop->offset, fop->size);
a3470f
         ec_lock(fop);
a3470f
 
a3470f
         return EC_STATE_DISPATCH;
a3470f
@@ -898,24 +901,28 @@ int32_t ec_manager_fallocate(ec_fop_data_t *fop, int32_t state)
a3470f
                             cbk->count);
a3470f
 
a3470f
                 /* This shouldn't fail because we have the inode locked. */
a3470f
-                GF_ASSERT(ec_get_inode_size(fop, fop->locks[0].lock->loc.inode,
a3470f
-                                        &cbk->iatt[0].ia_size));
a3470f
+                LOCK(&fop->locks[0].lock->loc.inode->lock);
a3470f
+                {
a3470f
+                        GF_ASSERT(__ec_get_inode_size(fop,
a3470f
+                                                fop->locks[0].lock->loc.inode,
a3470f
+                                                &cbk->iatt[0].ia_size));
a3470f
 
a3470f
-                /*If mode has FALLOC_FL_KEEP_SIZE keep the size */
a3470f
-                if (fop->int32 & FALLOC_FL_KEEP_SIZE) {
a3470f
-                        cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
a3470f
-                } else if (fop->user_size > cbk->iatt[0].ia_size) {
a3470f
-                        cbk->iatt[1].ia_size = fop->user_size;
a3470f
-
a3470f
-                        /* This shouldn't fail because we have the inode
a3470f
-                         * locked. */
a3470f
-                        GF_ASSERT(ec_set_inode_size(fop,
a3470f
-                                  fop->locks[0].lock->loc.inode,
a3470f
-                                            cbk->iatt[1].ia_size));
a3470f
-                } else {
a3470f
-                        cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
a3470f
+                        /*If mode has FALLOC_FL_KEEP_SIZE keep the size */
a3470f
+                        if (fop->int32 & FALLOC_FL_KEEP_SIZE) {
a3470f
+                                cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
a3470f
+                        } else if (fop->user_size > cbk->iatt[0].ia_size) {
a3470f
+                                cbk->iatt[1].ia_size = fop->user_size;
a3470f
+
a3470f
+                                /* This shouldn't fail because we have the inode
a3470f
+                                 * locked. */
a3470f
+                                GF_ASSERT(__ec_set_inode_size(fop,
a3470f
+                                          fop->locks[0].lock->loc.inode,
a3470f
+                                                    cbk->iatt[1].ia_size));
a3470f
+                        } else {
a3470f
+                                cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
a3470f
+                        }
a3470f
                 }
a3470f
-
a3470f
+                UNLOCK(&fop->locks[0].lock->loc.inode->lock);
a3470f
         }
a3470f
 
a3470f
         return EC_STATE_REPORT;
a3470f
@@ -1155,11 +1162,11 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state)
a3470f
             if (fop->id == GF_FOP_TRUNCATE) {
a3470f
                 ec_lock_prepare_inode(fop, &fop->loc[0],
a3470f
                                       EC_UPDATE_DATA | EC_UPDATE_META |
a3470f
-                                      EC_QUERY_INFO);
a3470f
+                                      EC_QUERY_INFO, fop->offset, LLONG_MAX);
a3470f
             } else {
a3470f
                 ec_lock_prepare_fd(fop, fop->fd,
a3470f
                                    EC_UPDATE_DATA | EC_UPDATE_META |
a3470f
-                                   EC_QUERY_INFO);
a3470f
+                                   EC_QUERY_INFO, fop->offset, LLONG_MAX);
a3470f
             }
a3470f
             ec_lock(fop);
a3470f
 
a3470f
@@ -1179,6 +1186,9 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state)
a3470f
                                 cbk->count);
a3470f
 
a3470f
                 /* This shouldn't fail because we have the inode locked. */
a3470f
+                /* Inode size doesn't need to be updated under locks, because
a3470f
+                 * conflicting operations won't be in-flight
a3470f
+                 */
a3470f
                 GF_ASSERT(ec_get_inode_size(fop, fop->locks[0].lock->loc.inode,
a3470f
                                             &cbk->iatt[0].ia_size));
a3470f
                 cbk->iatt[1].ia_size = fop->user_size;
a3470f
@@ -1582,6 +1592,9 @@ void ec_writev_start(ec_fop_data_t *fop)
a3470f
     ctx = ec_fd_get(fop->fd, fop->xl);
a3470f
     if (ctx != NULL) {
a3470f
         if ((ctx->flags & O_APPEND) != 0) {
a3470f
+            /* Appending writes take full locks so size won't change because
a3470f
+             * of any parallel operations
a3470f
+             */
a3470f
             fop->offset = current;
a3470f
         }
a3470f
     }
a3470f
@@ -1601,6 +1614,10 @@ void ec_writev_start(ec_fop_data_t *fop)
a3470f
     }
a3470f
     tail = fop->size - fop->user_size - fop->head;
a3470f
     if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) {
a3470f
+            /* Current locking scheme will make sure the 'current' below will
a3470f
+             * never decrease while the fop is in progress, so the checks will
a3470f
+             * work as expected
a3470f
+             */
a3470f
         if (current > fop->offset + fop->head + fop->user_size) {
a3470f
             if (ec_make_internal_fop_xdata (&xdata)) {
a3470f
                     err = -ENOMEM;
a3470f
@@ -1678,14 +1695,32 @@ ec_writev_encode(ec_fop_data_t *fop)
a3470f
 int32_t ec_manager_writev(ec_fop_data_t *fop, int32_t state)
a3470f
 {
a3470f
     ec_cbk_data_t *cbk;
a3470f
+    ec_fd_t *ctx = NULL;
a3470f
+    ec_t    *ec = fop->xl->private;
a3470f
+    off_t fl_start = 0;
a3470f
+    size_t fl_size = LLONG_MAX;
a3470f
 
a3470f
     switch (state)
a3470f
     {
a3470f
         case EC_STATE_INIT:
a3470f
         case EC_STATE_LOCK:
a3470f
+                ctx = ec_fd_get(fop->fd, fop->xl);
a3470f
+                if (ctx != NULL) {
a3470f
+                    if ((ctx->flags & O_APPEND) == 0) {
a3470f
+                            off_t user_size = 0;
a3470f
+                            off_t head = 0;
a3470f
+
a3470f
+                            fl_start = fop->offset;
a3470f
+                            user_size = iov_length(fop->vector, fop->int32);
a3470f
+                            head = ec_adjust_offset_down(ec, &fl_start,
a3470f
+                                                         _gf_true);
a3470f
+                            fl_size = user_size + head;
a3470f
+                            ec_adjust_size_up(ec, &fl_size, _gf_true);
a3470f
+                    }
a3470f
+                }
a3470f
             ec_lock_prepare_fd(fop, fop->fd,
a3470f
                                EC_UPDATE_DATA | EC_UPDATE_META |
a3470f
-                               EC_QUERY_INFO);
a3470f
+                               EC_QUERY_INFO, fl_start, fl_size);
a3470f
             ec_lock(fop);
a3470f
 
a3470f
             return EC_STATE_DISPATCH;
a3470f
@@ -1717,23 +1752,28 @@ int32_t ec_manager_writev(ec_fop_data_t *fop, int32_t state)
a3470f
                                 cbk->count);
a3470f
 
a3470f
                 /* This shouldn't fail because we have the inode locked. */
a3470f
-                GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode,
a3470f
-                                            &cbk->iatt[0].ia_size));
a3470f
-                cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
a3470f
-                size = fop->offset + fop->head + fop->user_size;
a3470f
-                if (size > cbk->iatt[0].ia_size) {
a3470f
-                    /* Only update inode size if this is a top level fop.
a3470f
-                     * Otherwise this is an internal write and the top
a3470f
-                     * level fop should take care of the real inode size.
a3470f
-                     */
a3470f
-                    if (fop->parent == NULL) {
a3470f
-                        /* This shouldn't fail because we have the inode
a3470f
-                         * locked. */
a3470f
-                        GF_ASSERT(ec_set_inode_size(fop, fop->fd->inode,
a3470f
-                                                    size));
a3470f
-                    }
a3470f
-                    cbk->iatt[1].ia_size = size;
a3470f
+                LOCK(&fop->fd->inode->lock);
a3470f
+                {
a3470f
+                        GF_ASSERT(__ec_get_inode_size(fop, fop->fd->inode,
a3470f
+                                                    &cbk->iatt[0].ia_size));
a3470f
+                        cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
a3470f
+                        size = fop->offset + fop->head + fop->user_size;
a3470f
+                        if (size > cbk->iatt[0].ia_size) {
a3470f
+                           /* Only update inode size if this is a top level fop.
a3470f
+                            * Otherwise this is an internal write and the top
a3470f
+                            * level fop should take care of the real inode size.
a3470f
+                            */
a3470f
+                            if (fop->parent == NULL) {
a3470f
+                                /* This shouldn't fail because we have the inode
a3470f
+                                 * locked. */
a3470f
+                                GF_ASSERT(__ec_set_inode_size(fop,
a3470f
+                                                        fop->fd->inode, size));
a3470f
+                            }
a3470f
+                            cbk->iatt[1].ia_size = size;
a3470f
+                        }
a3470f
                 }
a3470f
+                UNLOCK(&fop->fd->inode->lock);
a3470f
+
a3470f
                 if (fop->error == 0) {
a3470f
                     cbk->op_ret *= ec->fragments;
a3470f
                     if (cbk->op_ret < fop->head) {
a3470f
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
a3470f
index 5601f96..354b4ed 100644
a3470f
--- a/xlators/cluster/ec/src/ec-types.h
a3470f
+++ b/xlators/cluster/ec/src/ec-types.h
a3470f
@@ -211,8 +211,8 @@ struct _ec_lock {
a3470f
     struct list_head   owners;
a3470f
 
a3470f
     /* List of fops waiting to be an owner of the lock. Fops are added to this
a3470f
-     * list when the current owner has an incompatible access (shared vs
a3470f
-     * exclusive) or the lock is not acquired yet. */
a3470f
+     * list when the current owner has an incompatible access (conflicting lock)
a3470f
+     * or the lock is not acquired yet. */
a3470f
     struct list_head   waiting;
a3470f
 
a3470f
     /* List of fops that will wait until the next unlock/lock cycle. This
a3470f
@@ -221,7 +221,6 @@ struct _ec_lock {
a3470f
      * after the lock is reacquired. */
a3470f
     struct list_head   frozen;
a3470f
 
a3470f
-    int32_t            exclusive;
a3470f
     uintptr_t          mask;
a3470f
     uintptr_t          good_mask;
a3470f
     uintptr_t          healing;
a3470f
@@ -251,6 +250,8 @@ struct _ec_lock_link {
a3470f
     loc_t            *base;
a3470f
     uint64_t          size;
a3470f
     uint32_t          waiting_flags;
a3470f
+    off_t             fl_start;
a3470f
+    off_t             fl_end;
a3470f
 };
a3470f
 
a3470f
 struct _ec_fop_data {
a3470f
@@ -564,6 +565,7 @@ struct _ec {
a3470f
     gf_boolean_t       shutdown;
a3470f
     gf_boolean_t       eager_lock;
a3470f
     gf_boolean_t       optimistic_changelog;
a3470f
+    gf_boolean_t       parallel_writes;
a3470f
     uint32_t           background_heals;
a3470f
     uint32_t           heal_wait_qlen;
a3470f
     uint32_t           self_heal_window_size; /* max size of read/writes */
a3470f
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
a3470f
index c32f4ef..856d60c 100644
a3470f
--- a/xlators/cluster/ec/src/ec.c
a3470f
+++ b/xlators/cluster/ec/src/ec.c
a3470f
@@ -295,6 +295,8 @@ reconfigure (xlator_t *this, dict_t *options)
a3470f
 
a3470f
         GF_OPTION_RECONF ("optimistic-change-log", ec->optimistic_changelog,
a3470f
                           options, bool, failed);
a3470f
+        GF_OPTION_RECONF ("parallel-writes", ec->parallel_writes,
a3470f
+                          options, bool, failed);
a3470f
         ret = 0;
a3470f
         if (ec_assign_read_policy (ec, read_policy)) {
a3470f
                 ret = -1;
a3470f
@@ -665,6 +667,7 @@ init (xlator_t *this)
a3470f
     GF_OPTION_INIT ("shd-max-threads", ec->shd.max_threads, uint32, failed);
a3470f
     GF_OPTION_INIT ("shd-wait-qlength", ec->shd.wait_qlength, uint32, failed);
a3470f
     GF_OPTION_INIT ("optimistic-change-log", ec->optimistic_changelog, bool, failed);
a3470f
+    GF_OPTION_INIT ("parallel-writes", ec->parallel_writes, bool, failed);
a3470f
 
a3470f
     this->itable = inode_table_new (EC_SHD_INODE_LRU_LIMIT, this);
a3470f
     if (!this->itable)
a3470f
@@ -1466,28 +1469,34 @@ struct volume_options options[] =
a3470f
                        "galois field computations."
a3470f
     },
a3470f
     { .key  = {"self-heal-window-size"},
a3470f
-        .type = GF_OPTION_TYPE_INT,
a3470f
-        .min  = 1,
a3470f
-        .max  = 1024,
a3470f
-        .default_value = "1",
a3470f
-        .description = "Maximum number blocks(128KB) per file for which "
a3470f
-                       "self-heal process would be applied simultaneously."
a3470f
+      .type = GF_OPTION_TYPE_INT,
a3470f
+      .min  = 1,
a3470f
+      .max  = 1024,
a3470f
+      .default_value = "1",
a3470f
+      .description = "Maximum number blocks(128KB) per file for which "
a3470f
+                     "self-heal process would be applied simultaneously."
a3470f
     },
a3470f
-    {   .key = {"optimistic-change-log"},
a3470f
-        .type = GF_OPTION_TYPE_BOOL,
a3470f
-        .default_value = "on",
a3470f
-        .description =  "Set/Unset dirty flag for every update fop at the start"
a3470f
-                        "of the fop. If OFF, this option impacts performance of"
a3470f
-                        "entry  operations or metadata operations as it will"
a3470f
-                        "set dirty flag at the start and unset it at the end of"
a3470f
-                        "ALL update fop. If ON and all the bricks are good,"
a3470f
-                        "dirty flag will be set at the start only for file fops"
a3470f
-                        "For metadata and entry fops dirty flag will not be set"
a3470f
-                        "at the start, if all the bricks are good. This does"
a3470f
-                        "not impact performance for metadata operations and"
a3470f
-                        "entry operation but has a very small window to miss"
a3470f
-                        "marking entry as dirty in case it is required to be"
a3470f
-                        "healed"
a3470f
+    { .key = {"optimistic-change-log"},
a3470f
+      .type = GF_OPTION_TYPE_BOOL,
a3470f
+      .default_value = "on",
a3470f
+      .description =  "Set/Unset dirty flag for every update fop at the start "
a3470f
+                      "of the fop. If OFF, this option impacts performance of "
a3470f
+                      "entry operations or metadata operations as it will "
a3470f
+                      "set dirty flag at the start and unset it at the end of "
a3470f
+                      "ALL update fop. If ON and all the bricks are good, "
a3470f
+                      "dirty flag will be set at the start only for file fops. "
a3470f
+                      "For metadata and entry fops dirty flag will not be set "
a3470f
+                      "at the start, if all the bricks are good. This does "
a3470f
+                      "not impact performance for metadata operations and "
a3470f
+                      "entry operation but has a very small window to miss "
a3470f
+                      "marking entry as dirty in case it is required to be "
a3470f
+                      "healed."
a3470f
+    },
a3470f
+    { .key = {"parallel-writes"},
a3470f
+      .type = GF_OPTION_TYPE_BOOL,
a3470f
+      .default_value = "on",
a3470f
+      .description = "This controls if writes can be wound in parallel as long "
a3470f
+                     "as they don't modify the same stripes"
a3470f
     },
a3470f
     { .key = {NULL} }
a3470f
 };
a3470f
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
a3470f
index 7fe76e5..b15a5af 100644
a3470f
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
a3470f
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
a3470f
@@ -3510,6 +3510,12 @@ struct volopt_map_entry glusterd_volopt_map[] = {
a3470f
           .op_version  = GD_OP_VERSION_3_12_0,
a3470f
           .validate_fn = validate_boolean
a3470f
         },
a3470f
+        { .key        = "disperse.parallel-writes",
a3470f
+          .voltype    = "cluster/disperse",
a3470f
+          .type       = NO_DOC,
a3470f
+          .op_version = GD_OP_VERSION_3_13_0,
a3470f
+          .flags      = OPT_FLAG_CLIENT_OPT
a3470f
+        },
a3470f
         { .key         = NULL
a3470f
         }
a3470f
 };
a3470f
-- 
a3470f
1.8.3.1
a3470f