|
|
d1681e |
From c098fa2192eedbfaad7ac850d0fb152695a3becf Mon Sep 17 00:00:00 2001
|
|
|
d1681e |
From: Pranith Kumar K <pkarampu@redhat.com>
|
|
|
d1681e |
Date: Sun, 25 Jun 2017 16:34:01 +0530
|
|
|
d1681e |
Subject: [PATCH 079/128] cluster/ec: Allow parallel writes in EC if possible
|
|
|
d1681e |
|
|
|
d1681e |
Problem:
|
|
|
d1681e |
EC at the moment sends one modification fop after another, so if some of
|
|
|
d1681e |
the disks become slow for a while, then the wait time for the writes that
|
|
|
d1681e |
are waiting in the queue becomes really bad.
|
|
|
d1681e |
|
|
|
d1681e |
Fix:
|
|
|
d1681e |
Allow parallel writes when possible. For this we need to make 3 changes.
|
|
|
d1681e |
1) Each fop now has the range parameters it will be updating.
|
|
|
d1681e |
2) Xattrop is changed to handle parallel xattrop requests where some
|
|
|
d1681e |
would be modifying just dirty xattr.
|
|
|
d1681e |
3) Fops that refer to size now take locks and update the locks.
|
|
|
d1681e |
|
|
|
d1681e |
upstream patch: https://review.gluster.org/#/c/17625/
|
|
|
d1681e |
>Fixes #251
|
|
|
d1681e |
>Change-Id: Ibc3c15372f91bbd6fb617f0d99399b3149fa64b2
|
|
|
d1681e |
>Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
|
|
|
d1681e |
|
|
|
d1681e |
Note:
|
|
|
d1681e |
There is a delta compared to upstream patch:
|
|
|
d1681e |
|
|
|
d1681e |
For "disperse.parallel-writes" key we have reverted the flags
|
|
|
d1681e |
to reflect old type. Added New OP_VERSION for 3.13.0 in globals.h.
|
|
|
d1681e |
|
|
|
d1681e |
BUG: 1459101
|
|
|
d1681e |
Change-Id: Ibc3c15372f91bbd6fb617f0d99399b3149fa64b2
|
|
|
d1681e |
Signed-off-by: Sunil Kumar Acharya <sheggodu@redhat.com>
|
|
|
d1681e |
Reviewed-on: https://code.engineering.redhat.com/gerrit/123561
|
|
|
d1681e |
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
|
|
|
d1681e |
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
|
|
d1681e |
Reviewed-by: Ashish Pandey <aspandey@redhat.com>
|
|
|
d1681e |
---
|
|
|
d1681e |
libglusterfs/src/globals.h | 4 +-
|
|
|
d1681e |
xlators/cluster/ec/src/ec-common.c | 191 ++++++++++++++++--------
|
|
|
d1681e |
xlators/cluster/ec/src/ec-common.h | 10 +-
|
|
|
d1681e |
xlators/cluster/ec/src/ec-dir-read.c | 6 +-
|
|
|
d1681e |
xlators/cluster/ec/src/ec-generic.c | 12 +-
|
|
|
d1681e |
xlators/cluster/ec/src/ec-inode-read.c | 22 ++-
|
|
|
d1681e |
xlators/cluster/ec/src/ec-inode-write.c | 124 +++++++++------
|
|
|
d1681e |
xlators/cluster/ec/src/ec-types.h | 8 +-
|
|
|
d1681e |
xlators/cluster/ec/src/ec.c | 51 ++++---
|
|
|
d1681e |
xlators/mgmt/glusterd/src/glusterd-volume-set.c | 6 +
|
|
|
d1681e |
10 files changed, 291 insertions(+), 143 deletions(-)
|
|
|
d1681e |
|
|
|
d1681e |
diff --git a/libglusterfs/src/globals.h b/libglusterfs/src/globals.h
|
|
|
d1681e |
index bd7cffe..c627cfe 100644
|
|
|
d1681e |
--- a/libglusterfs/src/globals.h
|
|
|
d1681e |
+++ b/libglusterfs/src/globals.h
|
|
|
d1681e |
@@ -43,7 +43,7 @@
|
|
|
d1681e |
*/
|
|
|
d1681e |
#define GD_OP_VERSION_MIN 1 /* MIN is the fresh start op-version, mostly
|
|
|
d1681e |
should not change */
|
|
|
d1681e |
-#define GD_OP_VERSION_MAX GD_OP_VERSION_3_12_2 /* MAX VERSION is the maximum
|
|
|
d1681e |
+#define GD_OP_VERSION_MAX GD_OP_VERSION_3_13_0 /* MAX VERSION is the maximum
|
|
|
d1681e |
count in VME table, should
|
|
|
d1681e |
keep changing with
|
|
|
d1681e |
introduction of newer
|
|
|
d1681e |
@@ -101,6 +101,8 @@
|
|
|
d1681e |
|
|
|
d1681e |
#define GD_OP_VERSION_3_12_2 31202 /* Op-version for GlusterFS 3.12.2 */
|
|
|
d1681e |
|
|
|
d1681e |
+#define GD_OP_VERSION_3_13_0 31300 /* Op-version for GlusterFS 3.13.0 */
|
|
|
d1681e |
+
|
|
|
d1681e |
#include "xlator.h"
|
|
|
d1681e |
|
|
|
d1681e |
/* THIS */
|
|
|
d1681e |
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
|
|
|
d1681e |
index 6963907..f86ecf8 100644
|
|
|
d1681e |
--- a/xlators/cluster/ec/src/ec-common.c
|
|
|
d1681e |
+++ b/xlators/cluster/ec/src/ec-common.c
|
|
|
d1681e |
@@ -25,6 +25,40 @@
|
|
|
d1681e |
EC_FLAG_WAITING_DATA_DIRTY |\
|
|
|
d1681e |
EC_FLAG_WAITING_METADATA_DIRTY)
|
|
|
d1681e |
|
|
|
d1681e |
+off_t
|
|
|
d1681e |
+ec_range_end_get (off_t fl_start, size_t fl_size)
|
|
|
d1681e |
+{
|
|
|
d1681e |
+ off_t fl_end = 0;
|
|
|
d1681e |
+ switch (fl_size) {
|
|
|
d1681e |
+ case 0:
|
|
|
d1681e |
+ return fl_start;
|
|
|
d1681e |
+ case LLONG_MAX: /*Infinity*/
|
|
|
d1681e |
+ return LLONG_MAX;
|
|
|
d1681e |
+ default:
|
|
|
d1681e |
+ fl_end = fl_start + fl_size - 1;
|
|
|
d1681e |
+ if (fl_end < 0) /*over-flow*/
|
|
|
d1681e |
+ return LLONG_MAX;
|
|
|
d1681e |
+ else
|
|
|
d1681e |
+ return fl_end;
|
|
|
d1681e |
+ }
|
|
|
d1681e |
+}
|
|
|
d1681e |
+
|
|
|
d1681e |
+static gf_boolean_t
|
|
|
d1681e |
+ec_is_range_conflict (ec_lock_link_t *l1, ec_lock_link_t *l2)
|
|
|
d1681e |
+{
|
|
|
d1681e |
+ return ((l1->fl_end >= l2->fl_start) && (l2->fl_end >= l1->fl_start));
|
|
|
d1681e |
+}
|
|
|
d1681e |
+
|
|
|
d1681e |
+static gf_boolean_t
|
|
|
d1681e |
+ec_lock_conflict (ec_lock_link_t *l1, ec_lock_link_t *l2)
|
|
|
d1681e |
+{
|
|
|
d1681e |
+ if ((l1->fop->flags & EC_FLAG_LOCK_SHARED) &&
|
|
|
d1681e |
+ (l2->fop->flags & EC_FLAG_LOCK_SHARED))
|
|
|
d1681e |
+ return _gf_false;
|
|
|
d1681e |
+
|
|
|
d1681e |
+ return ec_is_range_conflict (l1, l2);
|
|
|
d1681e |
+}
|
|
|
d1681e |
+
|
|
|
d1681e |
uint32_t
|
|
|
d1681e |
ec_select_first_by_read_policy (ec_t *ec, ec_fop_data_t *fop)
|
|
|
d1681e |
{
|
|
|
d1681e |
@@ -724,7 +758,7 @@ int32_t ec_lock_compare(ec_lock_t * lock1, ec_lock_t * lock2)
|
|
|
d1681e |
}
|
|
|
d1681e |
|
|
|
d1681e |
void ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, uint32_t flags,
|
|
|
d1681e |
- loc_t *base)
|
|
|
d1681e |
+ loc_t *base, off_t fl_start, size_t fl_size)
|
|
|
d1681e |
{
|
|
|
d1681e |
ec_lock_link_t *link;
|
|
|
d1681e |
|
|
|
d1681e |
@@ -758,12 +792,15 @@ void ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, uint32_t flags,
|
|
|
d1681e |
link->update[EC_DATA_TXN] = (flags & EC_UPDATE_DATA) != 0;
|
|
|
d1681e |
link->update[EC_METADATA_TXN] = (flags & EC_UPDATE_META) != 0;
|
|
|
d1681e |
link->base = base;
|
|
|
d1681e |
+ link->fl_start = fl_start;
|
|
|
d1681e |
+ link->fl_end = ec_range_end_get (fl_start, fl_size);
|
|
|
d1681e |
|
|
|
d1681e |
lock->refs_pending++;
|
|
|
d1681e |
}
|
|
|
d1681e |
|
|
|
d1681e |
void ec_lock_prepare_inode_internal(ec_fop_data_t *fop, loc_t *loc,
|
|
|
d1681e |
- uint32_t flags, loc_t *base)
|
|
|
d1681e |
+ uint32_t flags, loc_t *base,
|
|
|
d1681e |
+ off_t fl_start, size_t fl_size)
|
|
|
d1681e |
{
|
|
|
d1681e |
ec_lock_t *lock = NULL;
|
|
|
d1681e |
ec_inode_t *ctx;
|
|
|
d1681e |
@@ -824,16 +861,17 @@ void ec_lock_prepare_inode_internal(ec_fop_data_t *fop, loc_t *loc,
|
|
|
d1681e |
ctx->inode_lock = lock;
|
|
|
d1681e |
|
|
|
d1681e |
insert:
|
|
|
d1681e |
- ec_lock_insert(fop, lock, flags, base);
|
|
|
d1681e |
+ ec_lock_insert(fop, lock, flags, base, fl_start, fl_size);
|
|
|
d1681e |
update_query:
|
|
|
d1681e |
lock->query |= (flags & EC_QUERY_INFO) != 0;
|
|
|
d1681e |
unlock:
|
|
|
d1681e |
UNLOCK(&loc->inode->lock);
|
|
|
d1681e |
}
|
|
|
d1681e |
|
|
|
d1681e |
-void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags)
|
|
|
d1681e |
+void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags,
|
|
|
d1681e |
+ off_t fl_start, size_t fl_size)
|
|
|
d1681e |
{
|
|
|
d1681e |
- ec_lock_prepare_inode_internal(fop, loc, flags, NULL);
|
|
|
d1681e |
+ ec_lock_prepare_inode_internal(fop, loc, flags, NULL, fl_start, fl_size);
|
|
|
d1681e |
}
|
|
|
d1681e |
|
|
|
d1681e |
void ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc, loc_t *base,
|
|
|
d1681e |
@@ -859,12 +897,13 @@ void ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc, loc_t *base,
|
|
|
d1681e |
base = NULL;
|
|
|
d1681e |
}
|
|
|
d1681e |
|
|
|
d1681e |
- ec_lock_prepare_inode_internal(fop, &tmp, flags, base);
|
|
|
d1681e |
+ ec_lock_prepare_inode_internal(fop, &tmp, flags, base, 0, LLONG_MAX);
|
|
|
d1681e |
|
|
|
d1681e |
loc_wipe(&tmp);
|
|
|
d1681e |
}
|
|
|
d1681e |
|
|
|
d1681e |
-void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags)
|
|
|
d1681e |
+void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags,
|
|
|
d1681e |
+ off_t fl_start, size_t fl_size)
|
|
|
d1681e |
{
|
|
|
d1681e |
loc_t loc;
|
|
|
d1681e |
int32_t err;
|
|
|
d1681e |
@@ -880,7 +919,7 @@ void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags)
|
|
|
d1681e |
return;
|
|
|
d1681e |
}
|
|
|
d1681e |
|
|
|
d1681e |
- ec_lock_prepare_inode_internal(fop, &loc, flags, NULL);
|
|
|
d1681e |
+ ec_lock_prepare_inode_internal(fop, &loc, flags, NULL, fl_start, fl_size);
|
|
|
d1681e |
|
|
|
d1681e |
loc_wipe(&loc);
|
|
|
d1681e |
}
|
|
|
d1681e |
@@ -1314,17 +1353,16 @@ out:
|
|
|
d1681e |
}
|
|
|
d1681e |
}
|
|
|
d1681e |
|
|
|
d1681e |
-gf_boolean_t ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
|
|
|
d1681e |
- uint64_t *size)
|
|
|
d1681e |
+gf_boolean_t
|
|
|
d1681e |
+__ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
|
|
|
d1681e |
+ uint64_t *size)
|
|
|
d1681e |
{
|
|
|
d1681e |
ec_inode_t *ctx;
|
|
|
d1681e |
gf_boolean_t found = _gf_false;
|
|
|
d1681e |
|
|
|
d1681e |
- LOCK(&inode->lock);
|
|
|
d1681e |
-
|
|
|
d1681e |
ctx = __ec_inode_get(inode, fop->xl);
|
|
|
d1681e |
if (ctx == NULL) {
|
|
|
d1681e |
- goto unlock;
|
|
|
d1681e |
+ goto out;
|
|
|
d1681e |
}
|
|
|
d1681e |
|
|
|
d1681e |
if (ctx->have_size) {
|
|
|
d1681e |
@@ -1332,23 +1370,35 @@ gf_boolean_t ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
|
|
|
d1681e |
found = _gf_true;
|
|
|
d1681e |
}
|
|
|
d1681e |
|
|
|
d1681e |
-unlock:
|
|
|
d1681e |
+out:
|
|
|
d1681e |
+ return found;
|
|
|
d1681e |
+}
|
|
|
d1681e |
+
|
|
|
d1681e |
+gf_boolean_t
|
|
|
d1681e |
+ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
|
|
|
d1681e |
+ uint64_t *size)
|
|
|
d1681e |
+{
|
|
|
d1681e |
+ gf_boolean_t found = _gf_false;
|
|
|
d1681e |
+
|
|
|
d1681e |
+ LOCK(&inode->lock);
|
|
|
d1681e |
+ {
|
|
|
d1681e |
+ found = __ec_get_inode_size (fop, inode, size);
|
|
|
d1681e |
+ }
|
|
|
d1681e |
UNLOCK(&inode->lock);
|
|
|
d1681e |
|
|
|
d1681e |
return found;
|
|
|
d1681e |
}
|
|
|
d1681e |
|
|
|
d1681e |
-gf_boolean_t ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode,
|
|
|
d1681e |
- uint64_t size)
|
|
|
d1681e |
+gf_boolean_t
|
|
|
d1681e |
+__ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode,
|
|
|
d1681e |
+ uint64_t size)
|
|
|
d1681e |
{
|
|
|
d1681e |
ec_inode_t *ctx;
|
|
|
d1681e |
gf_boolean_t found = _gf_false;
|
|
|
d1681e |
|
|
|
d1681e |
- LOCK(&inode->lock);
|
|
|
d1681e |
-
|
|
|
d1681e |
ctx = __ec_inode_get(inode, fop->xl);
|
|
|
d1681e |
if (ctx == NULL) {
|
|
|
d1681e |
- goto unlock;
|
|
|
d1681e |
+ goto out;
|
|
|
d1681e |
}
|
|
|
d1681e |
|
|
|
d1681e |
/* Normal fops always have ctx->have_size set. However self-heal calls this
|
|
|
d1681e |
@@ -1363,8 +1413,21 @@ gf_boolean_t ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode,
|
|
|
d1681e |
|
|
|
d1681e |
found = _gf_true;
|
|
|
d1681e |
|
|
|
d1681e |
-unlock:
|
|
|
d1681e |
- UNLOCK(&inode->lock);
|
|
|
d1681e |
+out:
|
|
|
d1681e |
+ return found;
|
|
|
d1681e |
+}
|
|
|
d1681e |
+
|
|
|
d1681e |
+gf_boolean_t
|
|
|
d1681e |
+ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode,
|
|
|
d1681e |
+ uint64_t size)
|
|
|
d1681e |
+{
|
|
|
d1681e |
+ gf_boolean_t found = _gf_false;
|
|
|
d1681e |
+
|
|
|
d1681e |
+ LOCK (&inode->lock);
|
|
|
d1681e |
+ {
|
|
|
d1681e |
+ found = __ec_set_inode_size (fop, inode, size);
|
|
|
d1681e |
+ }
|
|
|
d1681e |
+ UNLOCK (&inode->lock);
|
|
|
d1681e |
|
|
|
d1681e |
return found;
|
|
|
d1681e |
}
|
|
|
d1681e |
@@ -1471,34 +1534,47 @@ ec_lock_update_fd(ec_lock_t *lock, ec_fop_data_t *fop)
|
|
|
d1681e |
}
|
|
|
d1681e |
}
|
|
|
d1681e |
|
|
|
d1681e |
+static gf_boolean_t
|
|
|
d1681e |
+ec_link_has_lock_conflict (ec_lock_link_t *link, struct list_head *owners)
|
|
|
d1681e |
+{
|
|
|
d1681e |
+ ec_lock_link_t *owner_link = NULL;
|
|
|
d1681e |
+ ec_t *ec = link->fop->xl->private;
|
|
|
d1681e |
+
|
|
|
d1681e |
+ if (!ec->parallel_writes)
|
|
|
d1681e |
+ return _gf_true;
|
|
|
d1681e |
+
|
|
|
d1681e |
+ list_for_each_entry (owner_link, owners, owner_list) {
|
|
|
d1681e |
+ if (ec_lock_conflict (owner_link, link))
|
|
|
d1681e |
+ return _gf_true;
|
|
|
d1681e |
+ }
|
|
|
d1681e |
+ return _gf_false;
|
|
|
d1681e |
+}
|
|
|
d1681e |
+
|
|
|
d1681e |
static void
|
|
|
d1681e |
ec_lock_wake_shared(ec_lock_t *lock, struct list_head *list)
|
|
|
d1681e |
{
|
|
|
d1681e |
ec_fop_data_t *fop;
|
|
|
d1681e |
ec_lock_link_t *link;
|
|
|
d1681e |
- gf_boolean_t exclusive = _gf_false;
|
|
|
d1681e |
+ gf_boolean_t conflict = _gf_false;
|
|
|
d1681e |
|
|
|
d1681e |
- while (!exclusive && !list_empty(&lock->waiting)) {
|
|
|
d1681e |
+ while (!conflict && !list_empty(&lock->waiting)) {
|
|
|
d1681e |
link = list_entry(lock->waiting.next, ec_lock_link_t, wait_list);
|
|
|
d1681e |
fop = link->fop;
|
|
|
d1681e |
|
|
|
d1681e |
/* If lock is not acquired, at most one fop can be assigned as owner.
|
|
|
d1681e |
* The following fops will need to wait in the lock->waiting queue
|
|
|
d1681e |
* until the lock has been fully acquired. */
|
|
|
d1681e |
- exclusive = !lock->acquired;
|
|
|
d1681e |
+ conflict = !lock->acquired;
|
|
|
d1681e |
|
|
|
d1681e |
/* If the fop is not shareable, only this fop can be assigned as owner.
|
|
|
d1681e |
* Other fops will need to wait until this one finishes. */
|
|
|
d1681e |
- if ((fop->flags & EC_FLAG_LOCK_SHARED) == 0) {
|
|
|
d1681e |
- exclusive = _gf_true;
|
|
|
d1681e |
-
|
|
|
d1681e |
- /* Avoid other requests to be assigned as owners. */
|
|
|
d1681e |
- lock->exclusive = 1;
|
|
|
d1681e |
+ if (ec_link_has_lock_conflict (link, &lock->owners)) {
|
|
|
d1681e |
+ conflict = _gf_true;
|
|
|
d1681e |
}
|
|
|
d1681e |
|
|
|
d1681e |
/* If only one fop is allowed, it can be assigned as the owner of the
|
|
|
d1681e |
* lock only if there weren't any other owner. */
|
|
|
d1681e |
- if (exclusive && !list_empty(&lock->owners)) {
|
|
|
d1681e |
+ if (conflict && !list_empty(&lock->owners)) {
|
|
|
d1681e |
break;
|
|
|
d1681e |
}
|
|
|
d1681e |
|
|
|
d1681e |
@@ -1565,9 +1641,7 @@ void ec_lock_acquired(ec_lock_link_t *link)
|
|
|
d1681e |
lock->acquired = _gf_true;
|
|
|
d1681e |
|
|
|
d1681e |
ec_lock_update_fd(lock, fop);
|
|
|
d1681e |
- if ((fop->flags & EC_FLAG_LOCK_SHARED) != 0) {
|
|
|
d1681e |
- ec_lock_wake_shared(lock, &list);
|
|
|
d1681e |
- }
|
|
|
d1681e |
+ ec_lock_wake_shared(lock, &list);
|
|
|
d1681e |
|
|
|
d1681e |
UNLOCK(&lock->loc.inode->lock);
|
|
|
d1681e |
|
|
|
d1681e |
@@ -1678,11 +1752,11 @@ ec_lock_assign_owner(ec_lock_link_t *link)
|
|
|
d1681e |
/* We are trying to acquire a lock that has an unlock timer active.
|
|
|
d1681e |
* This means that the lock must be idle, i.e. no fop can be in the
|
|
|
d1681e |
* owner, waiting or frozen lists. It also means that the lock cannot
|
|
|
d1681e |
- * have been marked as being released (this is done without timers)
|
|
|
d1681e |
- * and it must not be exclusive. There should only be one owner
|
|
|
d1681e |
- * reference, but it's possible that some fops are being prepared to
|
|
|
d1681e |
- * use this lock. */
|
|
|
d1681e |
- GF_ASSERT ((lock->exclusive == 0) && (lock->refs_owners == 1) &&
|
|
|
d1681e |
+ * have been marked as being released (this is done without timers).
|
|
|
d1681e |
+ * There should only be one owner reference, but it's possible that
|
|
|
d1681e |
+ * some fops are being prepared to use this lock.
|
|
|
d1681e |
+ */
|
|
|
d1681e |
+ GF_ASSERT ((lock->refs_owners == 1) &&
|
|
|
d1681e |
list_empty(&lock->owners) && list_empty(&lock->waiting));
|
|
|
d1681e |
|
|
|
d1681e |
/* We take the timer_link before cancelling the timer, since a
|
|
|
d1681e |
@@ -1730,13 +1804,15 @@ ec_lock_assign_owner(ec_lock_link_t *link)
|
|
|
d1681e |
lock->timer = NULL;
|
|
|
d1681e |
}
|
|
|
d1681e |
|
|
|
d1681e |
- lock->exclusive |= (fop->flags & EC_FLAG_LOCK_SHARED) == 0;
|
|
|
d1681e |
-
|
|
|
d1681e |
if (!list_empty(&lock->owners)) {
|
|
|
d1681e |
/* There are other owners of this lock. We can only take ownership if
|
|
|
d1681e |
- * the lock is already acquired and can be shared. Otherwise we need
|
|
|
d1681e |
- * to wait. */
|
|
|
d1681e |
- if (!lock->acquired || (lock->exclusive != 0)) {
|
|
|
d1681e |
+ * the lock is already acquired and doesn't have conflict with existing
|
|
|
d1681e |
+ * owners, or waiters(to prevent starvation).
|
|
|
d1681e |
+ * Otherwise we need to wait.
|
|
|
d1681e |
+ */
|
|
|
d1681e |
+ if (!lock->acquired ||
|
|
|
d1681e |
+ ec_link_has_lock_conflict (link, &lock->owners) ||
|
|
|
d1681e |
+ ec_link_has_lock_conflict (link, &lock->waiting)) {
|
|
|
d1681e |
ec_trace("LOCK_QUEUE_WAIT", fop, "lock=%p", lock);
|
|
|
d1681e |
|
|
|
d1681e |
list_add_tail(&link->wait_list, &lock->waiting);
|
|
|
d1681e |
@@ -1814,10 +1890,7 @@ ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk,
|
|
|
d1681e |
}
|
|
|
d1681e |
ec_lock_update_good(lock, fop);
|
|
|
d1681e |
|
|
|
d1681e |
- lock->exclusive -= (fop->flags & EC_FLAG_LOCK_SHARED) == 0;
|
|
|
d1681e |
- if (list_empty(&lock->owners)) {
|
|
|
d1681e |
- ec_lock_wake_shared(lock, &list);
|
|
|
d1681e |
- }
|
|
|
d1681e |
+ ec_lock_wake_shared(lock, &list);
|
|
|
d1681e |
|
|
|
d1681e |
UNLOCK(&lock->loc.inode->lock);
|
|
|
d1681e |
|
|
|
d1681e |
@@ -1871,11 +1944,11 @@ ec_lock_unfreeze(ec_lock_link_t *link)
|
|
|
d1681e |
lock->acquired = _gf_false;
|
|
|
d1681e |
|
|
|
d1681e |
/* We are unfreezing a lock. This means that the lock has already been
|
|
|
d1681e |
- * released. In this state it shouldn't be exclusive nor have a pending
|
|
|
d1681e |
- * timer nor have any owner, and the waiting list should be empty. Only
|
|
|
d1681e |
- * the frozen list can contain some fop. */
|
|
|
d1681e |
- GF_ASSERT((lock->exclusive == 0) && (lock->timer == NULL) &&
|
|
|
d1681e |
- list_empty(&lock->waiting) && list_empty(&lock->owners));
|
|
|
d1681e |
+ * released. In this state it shouldn't have a pending timer nor have any
|
|
|
d1681e |
+ * owner, and the waiting list should be empty. Only the frozen list can
|
|
|
d1681e |
+ * contain some fop. */
|
|
|
d1681e |
+ GF_ASSERT((lock->timer == NULL) && list_empty(&lock->waiting) &&
|
|
|
d1681e |
+ list_empty(&lock->owners));
|
|
|
d1681e |
|
|
|
d1681e |
/* We move all frozen fops to the waiting list. */
|
|
|
d1681e |
list_splice_init(&lock->frozen, &lock->waiting);
|
|
|
d1681e |
@@ -2008,7 +2081,7 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
|
|
|
d1681e |
ec_fop_data_t *fop;
|
|
|
d1681e |
ec_lock_t *lock;
|
|
|
d1681e |
ec_inode_t *ctx;
|
|
|
d1681e |
- dict_t * dict;
|
|
|
d1681e |
+ dict_t *dict = NULL;
|
|
|
d1681e |
uintptr_t update_on = 0;
|
|
|
d1681e |
|
|
|
d1681e |
int32_t err = -ENOMEM;
|
|
|
d1681e |
@@ -2198,12 +2271,12 @@ ec_unlock_timer_del(ec_lock_link_t *link)
|
|
|
d1681e |
ec_trace("UNLOCK_DELAYED", link->fop, "lock=%p", lock);
|
|
|
d1681e |
|
|
|
d1681e |
/* The unlock timer has expired without anyone cancelling it.
|
|
|
d1681e |
- * This means that it shouldn't have any owner, and the
|
|
|
d1681e |
- * waiting and frozen lists should be empty. It shouldn't have
|
|
|
d1681e |
- * been marked as release nor be exclusive either. It must have
|
|
|
d1681e |
- * only one owner reference, but there can be fops being
|
|
|
d1681e |
- * prepared though. */
|
|
|
d1681e |
- GF_ASSERT(!lock->release && (lock->exclusive == 0) &&
|
|
|
d1681e |
+ * This means that it shouldn't have any owner, and the waiting
|
|
|
d1681e |
+ * and frozen lists should be empty. It must have only one
|
|
|
d1681e |
+ * owner reference, but there can be fops being prepared
|
|
|
d1681e |
+ * though.
|
|
|
d1681e |
+ * */
|
|
|
d1681e |
+ GF_ASSERT(!lock->release &&
|
|
|
d1681e |
(lock->refs_owners == 1) &&
|
|
|
d1681e |
list_empty(&lock->owners) &&
|
|
|
d1681e |
list_empty(&lock->waiting) &&
|
|
|
d1681e |
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
|
|
|
d1681e |
index 8f5d20a..1a947cc 100644
|
|
|
d1681e |
--- a/xlators/cluster/ec/src/ec-common.h
|
|
|
d1681e |
+++ b/xlators/cluster/ec/src/ec-common.h
|
|
|
d1681e |
@@ -91,18 +91,24 @@ ec_fop_prepare_answer(ec_fop_data_t *fop, gf_boolean_t ro);
|
|
|
d1681e |
gf_boolean_t
|
|
|
d1681e |
ec_cbk_set_error(ec_cbk_data_t *cbk, int32_t error, gf_boolean_t ro);
|
|
|
d1681e |
|
|
|
d1681e |
-void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags);
|
|
|
d1681e |
+void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags,
|
|
|
d1681e |
+ off_t fl_start, size_t fl_size);
|
|
|
d1681e |
void ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc, loc_t *base,
|
|
|
d1681e |
uint32_t flags);
|
|
|
d1681e |
-void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags);
|
|
|
d1681e |
+void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags,
|
|
|
d1681e |
+ off_t fl_start, size_t fl_size);
|
|
|
d1681e |
void ec_lock(ec_fop_data_t * fop);
|
|
|
d1681e |
void ec_lock_reuse(ec_fop_data_t *fop);
|
|
|
d1681e |
void ec_unlock(ec_fop_data_t * fop);
|
|
|
d1681e |
|
|
|
d1681e |
gf_boolean_t ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
|
|
|
d1681e |
uint64_t *size);
|
|
|
d1681e |
+gf_boolean_t __ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
|
|
|
d1681e |
+ uint64_t *size);
|
|
|
d1681e |
gf_boolean_t ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode,
|
|
|
d1681e |
uint64_t size);
|
|
|
d1681e |
+gf_boolean_t __ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode,
|
|
|
d1681e |
+ uint64_t size);
|
|
|
d1681e |
void ec_clear_inode_info(ec_fop_data_t *fop, inode_t *inode);
|
|
|
d1681e |
|
|
|
d1681e |
void ec_flush_size_version(ec_fop_data_t * fop);
|
|
|
d1681e |
diff --git a/xlators/cluster/ec/src/ec-dir-read.c b/xlators/cluster/ec/src/ec-dir-read.c
|
|
|
d1681e |
index 4fe82e3..48afe54 100644
|
|
|
d1681e |
--- a/xlators/cluster/ec/src/ec-dir-read.c
|
|
|
d1681e |
+++ b/xlators/cluster/ec/src/ec-dir-read.c
|
|
|
d1681e |
@@ -141,7 +141,8 @@ int32_t ec_manager_opendir(ec_fop_data_t * fop, int32_t state)
|
|
|
d1681e |
/* Fall through */
|
|
|
d1681e |
|
|
|
d1681e |
case EC_STATE_LOCK:
|
|
|
d1681e |
- ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO);
|
|
|
d1681e |
+ ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO, 0,
|
|
|
d1681e |
+ LLONG_MAX);
|
|
|
d1681e |
ec_lock(fop);
|
|
|
d1681e |
|
|
|
d1681e |
return EC_STATE_DISPATCH;
|
|
|
d1681e |
@@ -432,7 +433,8 @@ int32_t ec_manager_readdir(ec_fop_data_t * fop, int32_t state)
|
|
|
d1681e |
}
|
|
|
d1681e |
fop->mask &= 1ULL << idx;
|
|
|
d1681e |
} else {
|
|
|
d1681e |
- ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO);
|
|
|
d1681e |
+ ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0,
|
|
|
d1681e |
+ LLONG_MAX);
|
|
|
d1681e |
ec_lock(fop);
|
|
|
d1681e |
}
|
|
|
d1681e |
|
|
|
d1681e |
diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c
|
|
|
d1681e |
index ddb90ce..a5f986e 100644
|
|
|
d1681e |
--- a/xlators/cluster/ec/src/ec-generic.c
|
|
|
d1681e |
+++ b/xlators/cluster/ec/src/ec-generic.c
|
|
|
d1681e |
@@ -85,7 +85,7 @@ int32_t ec_manager_flush(ec_fop_data_t * fop, int32_t state)
|
|
|
d1681e |
{
|
|
|
d1681e |
case EC_STATE_INIT:
|
|
|
d1681e |
case EC_STATE_LOCK:
|
|
|
d1681e |
- ec_lock_prepare_fd(fop, fop->fd, 0);
|
|
|
d1681e |
+ ec_lock_prepare_fd(fop, fop->fd, 0, 0, LLONG_MAX);
|
|
|
d1681e |
ec_lock(fop);
|
|
|
d1681e |
|
|
|
d1681e |
return EC_STATE_DISPATCH;
|
|
|
d1681e |
@@ -300,7 +300,7 @@ int32_t ec_manager_fsync(ec_fop_data_t * fop, int32_t state)
|
|
|
d1681e |
{
|
|
|
d1681e |
case EC_STATE_INIT:
|
|
|
d1681e |
case EC_STATE_LOCK:
|
|
|
d1681e |
- ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO);
|
|
|
d1681e |
+ ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0, LLONG_MAX);
|
|
|
d1681e |
ec_lock(fop);
|
|
|
d1681e |
|
|
|
d1681e |
return EC_STATE_DISPATCH;
|
|
|
d1681e |
@@ -501,7 +501,7 @@ int32_t ec_manager_fsyncdir(ec_fop_data_t * fop, int32_t state)
|
|
|
d1681e |
{
|
|
|
d1681e |
case EC_STATE_INIT:
|
|
|
d1681e |
case EC_STATE_LOCK:
|
|
|
d1681e |
- ec_lock_prepare_fd(fop, fop->fd, 0);
|
|
|
d1681e |
+ ec_lock_prepare_fd(fop, fop->fd, 0, 0, LLONG_MAX);
|
|
|
d1681e |
ec_lock(fop);
|
|
|
d1681e |
|
|
|
d1681e |
return EC_STATE_DISPATCH;
|
|
|
d1681e |
@@ -1220,9 +1220,11 @@ int32_t ec_manager_xattrop(ec_fop_data_t * fop, int32_t state)
|
|
|
d1681e |
case EC_STATE_INIT:
|
|
|
d1681e |
case EC_STATE_LOCK:
|
|
|
d1681e |
if (fop->fd == NULL) {
|
|
|
d1681e |
- ec_lock_prepare_inode(fop, &fop->loc[0], EC_UPDATE_META);
|
|
|
d1681e |
+ ec_lock_prepare_inode(fop, &fop->loc[0], EC_UPDATE_META, 0,
|
|
|
d1681e |
+ LLONG_MAX);
|
|
|
d1681e |
} else {
|
|
|
d1681e |
- ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_META);
|
|
|
d1681e |
+ ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_META, 0,
|
|
|
d1681e |
+ LLONG_MAX);
|
|
|
d1681e |
}
|
|
|
d1681e |
ec_lock(fop);
|
|
|
d1681e |
|
|
|
d1681e |
diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c
|
|
|
d1681e |
index 829f47f..33fd7f5 100644
|
|
|
d1681e |
--- a/xlators/cluster/ec/src/ec-inode-read.c
|
|
|
d1681e |
+++ b/xlators/cluster/ec/src/ec-inode-read.c
|
|
|
d1681e |
@@ -72,7 +72,8 @@ ec_manager_access(ec_fop_data_t *fop, int32_t state)
|
|
|
d1681e |
switch (state) {
|
|
|
d1681e |
case EC_STATE_INIT:
|
|
|
d1681e |
case EC_STATE_LOCK:
|
|
|
d1681e |
- ec_lock_prepare_inode (fop, &fop->loc[0], EC_QUERY_INFO);
|
|
|
d1681e |
+ ec_lock_prepare_inode (fop, &fop->loc[0], EC_QUERY_INFO, 0,
|
|
|
d1681e |
+ LLONG_MAX);
|
|
|
d1681e |
ec_lock (fop);
|
|
|
d1681e |
|
|
|
d1681e |
return EC_STATE_DISPATCH;
|
|
|
d1681e |
@@ -311,9 +312,11 @@ int32_t ec_manager_getxattr(ec_fop_data_t * fop, int32_t state)
|
|
|
d1681e |
(strncmp(fop->str[0], GF_XATTR_CLRLK_CMD,
|
|
|
d1681e |
strlen(GF_XATTR_CLRLK_CMD)) != 0)) {
|
|
|
d1681e |
if (fop->fd == NULL) {
|
|
|
d1681e |
- ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO);
|
|
|
d1681e |
+ ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO,
|
|
|
d1681e |
+ 0, LLONG_MAX);
|
|
|
d1681e |
} else {
|
|
|
d1681e |
- ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO);
|
|
|
d1681e |
+ ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0,
|
|
|
d1681e |
+ LLONG_MAX);
|
|
|
d1681e |
}
|
|
|
d1681e |
ec_lock(fop);
|
|
|
d1681e |
}
|
|
|
d1681e |
@@ -1029,7 +1032,8 @@ int32_t ec_manager_readlink(ec_fop_data_t * fop, int32_t state)
|
|
|
d1681e |
{
|
|
|
d1681e |
case EC_STATE_INIT:
|
|
|
d1681e |
case EC_STATE_LOCK:
|
|
|
d1681e |
- ec_lock_prepare_inode (fop, &fop->loc[0], EC_QUERY_INFO);
|
|
|
d1681e |
+ ec_lock_prepare_inode (fop, &fop->loc[0], EC_QUERY_INFO, 0,
|
|
|
d1681e |
+ LLONG_MAX);
|
|
|
d1681e |
ec_lock (fop);
|
|
|
d1681e |
return EC_STATE_DISPATCH;
|
|
|
d1681e |
|
|
|
d1681e |
@@ -1364,7 +1368,8 @@ int32_t ec_manager_readv(ec_fop_data_t * fop, int32_t state)
|
|
|
d1681e |
/* Fall through */
|
|
|
d1681e |
|
|
|
d1681e |
case EC_STATE_LOCK:
|
|
|
d1681e |
- ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO);
|
|
|
d1681e |
+ ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, fop->offset,
|
|
|
d1681e |
+ fop->size);
|
|
|
d1681e |
ec_lock(fop);
|
|
|
d1681e |
|
|
|
d1681e |
return EC_STATE_DISPATCH;
|
|
|
d1681e |
@@ -1568,7 +1573,7 @@ int32_t ec_manager_seek(ec_fop_data_t *fop, int32_t state)
|
|
|
d1681e |
/* Fall through */
|
|
|
d1681e |
|
|
|
d1681e |
case EC_STATE_LOCK:
|
|
|
d1681e |
- ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO);
|
|
|
d1681e |
+ ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, fop->offset, LLONG_MAX);
|
|
|
d1681e |
ec_lock(fop);
|
|
|
d1681e |
|
|
|
d1681e |
return EC_STATE_DISPATCH;
|
|
|
d1681e |
@@ -1788,9 +1793,10 @@ int32_t ec_manager_stat(ec_fop_data_t * fop, int32_t state)
|
|
|
d1681e |
case EC_STATE_INIT:
|
|
|
d1681e |
case EC_STATE_LOCK:
|
|
|
d1681e |
if (fop->fd == NULL) {
|
|
|
d1681e |
- ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO);
|
|
|
d1681e |
+ ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO, 0,
|
|
|
d1681e |
+ LLONG_MAX);
|
|
|
d1681e |
} else {
|
|
|
d1681e |
- ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO);
|
|
|
d1681e |
+ ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0, LLONG_MAX);
|
|
|
d1681e |
}
|
|
|
d1681e |
ec_lock(fop);
|
|
|
d1681e |
|
|
|
d1681e |
diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c
|
|
|
d1681e |
index 3ed9b2a..e6a67cf 100644
|
|
|
d1681e |
--- a/xlators/cluster/ec/src/ec-inode-write.c
|
|
|
d1681e |
+++ b/xlators/cluster/ec/src/ec-inode-write.c
|
|
|
d1681e |
@@ -127,10 +127,12 @@ ec_manager_xattr (ec_fop_data_t *fop, int32_t state)
|
|
|
d1681e |
case EC_STATE_LOCK:
|
|
|
d1681e |
if (fop->fd == NULL) {
|
|
|
d1681e |
ec_lock_prepare_inode(fop, &fop->loc[0],
|
|
|
d1681e |
- EC_UPDATE_META | EC_QUERY_INFO);
|
|
|
d1681e |
+ EC_UPDATE_META | EC_QUERY_INFO,
|
|
|
d1681e |
+ 0, LLONG_MAX);
|
|
|
d1681e |
} else {
|
|
|
d1681e |
ec_lock_prepare_fd(fop, fop->fd,
|
|
|
d1681e |
- EC_UPDATE_META | EC_QUERY_INFO);
|
|
|
d1681e |
+ EC_UPDATE_META | EC_QUERY_INFO,
|
|
|
d1681e |
+ 0, LLONG_MAX);
|
|
|
d1681e |
}
|
|
|
d1681e |
ec_lock(fop);
|
|
|
d1681e |
|
|
|
d1681e |
@@ -369,10 +371,11 @@ int32_t ec_manager_setattr(ec_fop_data_t * fop, int32_t state)
|
|
|
d1681e |
case EC_STATE_LOCK:
|
|
|
d1681e |
if (fop->fd == NULL) {
|
|
|
d1681e |
ec_lock_prepare_inode(fop, &fop->loc[0],
|
|
|
d1681e |
- EC_UPDATE_META | EC_QUERY_INFO);
|
|
|
d1681e |
+ EC_UPDATE_META | EC_QUERY_INFO,
|
|
|
d1681e |
+ 0, LLONG_MAX);
|
|
|
d1681e |
} else {
|
|
|
d1681e |
- ec_lock_prepare_fd(fop, fop->fd,
|
|
|
d1681e |
- EC_UPDATE_META | EC_QUERY_INFO);
|
|
|
d1681e |
+ ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_META | EC_QUERY_INFO,
|
|
|
d1681e |
+ 0, LLONG_MAX);
|
|
|
d1681e |
}
|
|
|
d1681e |
ec_lock(fop);
|
|
|
d1681e |
|
|
|
d1681e |
@@ -879,8 +882,8 @@ int32_t ec_manager_fallocate(ec_fop_data_t *fop, int32_t state)
|
|
|
d1681e |
|
|
|
d1681e |
case EC_STATE_LOCK:
|
|
|
d1681e |
ec_lock_prepare_fd(fop, fop->fd,
|
|
|
d1681e |
- EC_UPDATE_DATA | EC_UPDATE_META |
|
|
|
d1681e |
- EC_QUERY_INFO);
|
|
|
d1681e |
+ EC_UPDATE_DATA | EC_UPDATE_META | EC_QUERY_INFO,
|
|
|
d1681e |
+ fop->offset, fop->size);
|
|
|
d1681e |
ec_lock(fop);
|
|
|
d1681e |
|
|
|
d1681e |
return EC_STATE_DISPATCH;
|
|
|
d1681e |
@@ -898,24 +901,28 @@ int32_t ec_manager_fallocate(ec_fop_data_t *fop, int32_t state)
|
|
|
d1681e |
cbk->count);
|
|
|
d1681e |
|
|
|
d1681e |
/* This shouldn't fail because we have the inode locked. */
|
|
|
d1681e |
- GF_ASSERT(ec_get_inode_size(fop, fop->locks[0].lock->loc.inode,
|
|
|
d1681e |
- &cbk->iatt[0].ia_size));
|
|
|
d1681e |
+ LOCK(&fop->locks[0].lock->loc.inode->lock);
|
|
|
d1681e |
+ {
|
|
|
d1681e |
+ GF_ASSERT(__ec_get_inode_size(fop,
|
|
|
d1681e |
+ fop->locks[0].lock->loc.inode,
|
|
|
d1681e |
+ &cbk->iatt[0].ia_size));
|
|
|
d1681e |
|
|
|
d1681e |
- /*If mode has FALLOC_FL_KEEP_SIZE keep the size */
|
|
|
d1681e |
- if (fop->int32 & FALLOC_FL_KEEP_SIZE) {
|
|
|
d1681e |
- cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
|
|
|
d1681e |
- } else if (fop->user_size > cbk->iatt[0].ia_size) {
|
|
|
d1681e |
- cbk->iatt[1].ia_size = fop->user_size;
|
|
|
d1681e |
-
|
|
|
d1681e |
- /* This shouldn't fail because we have the inode
|
|
|
d1681e |
- * locked. */
|
|
|
d1681e |
- GF_ASSERT(ec_set_inode_size(fop,
|
|
|
d1681e |
- fop->locks[0].lock->loc.inode,
|
|
|
d1681e |
- cbk->iatt[1].ia_size));
|
|
|
d1681e |
- } else {
|
|
|
d1681e |
- cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
|
|
|
d1681e |
+ /*If mode has FALLOC_FL_KEEP_SIZE keep the size */
|
|
|
d1681e |
+ if (fop->int32 & FALLOC_FL_KEEP_SIZE) {
|
|
|
d1681e |
+ cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
|
|
|
d1681e |
+ } else if (fop->user_size > cbk->iatt[0].ia_size) {
|
|
|
d1681e |
+ cbk->iatt[1].ia_size = fop->user_size;
|
|
|
d1681e |
+
|
|
|
d1681e |
+ /* This shouldn't fail because we have the inode
|
|
|
d1681e |
+ * locked. */
|
|
|
d1681e |
+ GF_ASSERT(__ec_set_inode_size(fop,
|
|
|
d1681e |
+ fop->locks[0].lock->loc.inode,
|
|
|
d1681e |
+ cbk->iatt[1].ia_size));
|
|
|
d1681e |
+ } else {
|
|
|
d1681e |
+ cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
|
|
|
d1681e |
+ }
|
|
|
d1681e |
}
|
|
|
d1681e |
-
|
|
|
d1681e |
+ UNLOCK(&fop->locks[0].lock->loc.inode->lock);
|
|
|
d1681e |
}
|
|
|
d1681e |
|
|
|
d1681e |
return EC_STATE_REPORT;
|
|
|
d1681e |
@@ -1155,11 +1162,11 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state)
|
|
|
d1681e |
if (fop->id == GF_FOP_TRUNCATE) {
|
|
|
d1681e |
ec_lock_prepare_inode(fop, &fop->loc[0],
|
|
|
d1681e |
EC_UPDATE_DATA | EC_UPDATE_META |
|
|
|
d1681e |
- EC_QUERY_INFO);
|
|
|
d1681e |
+ EC_QUERY_INFO, fop->offset, LLONG_MAX);
|
|
|
d1681e |
} else {
|
|
|
d1681e |
ec_lock_prepare_fd(fop, fop->fd,
|
|
|
d1681e |
EC_UPDATE_DATA | EC_UPDATE_META |
|
|
|
d1681e |
- EC_QUERY_INFO);
|
|
|
d1681e |
+ EC_QUERY_INFO, fop->offset, LLONG_MAX);
|
|
|
d1681e |
}
|
|
|
d1681e |
ec_lock(fop);
|
|
|
d1681e |
|
|
|
d1681e |
@@ -1179,6 +1186,9 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state)
|
|
|
d1681e |
cbk->count);
|
|
|
d1681e |
|
|
|
d1681e |
/* This shouldn't fail because we have the inode locked. */
|
|
|
d1681e |
+            /* Inode size doesn't need to be updated under inode->lock, because
|
|
|
d1681e |
+ * conflicting operations won't be in-flight
|
|
|
d1681e |
+ */
|
|
|
d1681e |
GF_ASSERT(ec_get_inode_size(fop, fop->locks[0].lock->loc.inode,
|
|
|
d1681e |
&cbk->iatt[0].ia_size));
|
|
|
d1681e |
cbk->iatt[1].ia_size = fop->user_size;
|
|
|
d1681e |
@@ -1582,6 +1592,9 @@ void ec_writev_start(ec_fop_data_t *fop)
|
|
|
d1681e |
ctx = ec_fd_get(fop->fd, fop->xl);
|
|
|
d1681e |
if (ctx != NULL) {
|
|
|
d1681e |
if ((ctx->flags & O_APPEND) != 0) {
|
|
|
d1681e |
+ /* Appending writes take full locks so size won't change because
|
|
|
d1681e |
+ * of any parallel operations
|
|
|
d1681e |
+ */
|
|
|
d1681e |
fop->offset = current;
|
|
|
d1681e |
}
|
|
|
d1681e |
}
|
|
|
d1681e |
@@ -1601,6 +1614,10 @@ void ec_writev_start(ec_fop_data_t *fop)
|
|
|
d1681e |
}
|
|
|
d1681e |
tail = fop->size - fop->user_size - fop->head;
|
|
|
d1681e |
if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) {
|
|
|
d1681e |
+ /* Current locking scheme will make sure the 'current' below will
|
|
|
d1681e |
+ * never decrease while the fop is in progress, so the checks will
|
|
|
d1681e |
+ * work as expected
|
|
|
d1681e |
+ */
|
|
|
d1681e |
if (current > fop->offset + fop->head + fop->user_size) {
|
|
|
d1681e |
if (ec_make_internal_fop_xdata (&xdata)) {
|
|
|
d1681e |
err = -ENOMEM;
|
|
|
d1681e |
@@ -1678,14 +1695,32 @@ ec_writev_encode(ec_fop_data_t *fop)
|
|
|
d1681e |
int32_t ec_manager_writev(ec_fop_data_t *fop, int32_t state)
|
|
|
d1681e |
{
|
|
|
d1681e |
ec_cbk_data_t *cbk;
|
|
|
d1681e |
+ ec_fd_t *ctx = NULL;
|
|
|
d1681e |
+ ec_t *ec = fop->xl->private;
|
|
|
d1681e |
+ off_t fl_start = 0;
|
|
|
d1681e |
+ size_t fl_size = LLONG_MAX;
|
|
|
d1681e |
|
|
|
d1681e |
switch (state)
|
|
|
d1681e |
{
|
|
|
d1681e |
case EC_STATE_INIT:
|
|
|
d1681e |
case EC_STATE_LOCK:
|
|
|
d1681e |
+ ctx = ec_fd_get(fop->fd, fop->xl);
|
|
|
d1681e |
+ if (ctx != NULL) {
|
|
|
d1681e |
+ if ((ctx->flags & O_APPEND) == 0) {
|
|
|
d1681e |
+ off_t user_size = 0;
|
|
|
d1681e |
+ off_t head = 0;
|
|
|
d1681e |
+
|
|
|
d1681e |
+ fl_start = fop->offset;
|
|
|
d1681e |
+ user_size = iov_length(fop->vector, fop->int32);
|
|
|
d1681e |
+ head = ec_adjust_offset_down(ec, &fl_start,
|
|
|
d1681e |
+ _gf_true);
|
|
|
d1681e |
+ fl_size = user_size + head;
|
|
|
d1681e |
+ ec_adjust_size_up(ec, &fl_size, _gf_true);
|
|
|
d1681e |
+ }
|
|
|
d1681e |
+ }
|
|
|
d1681e |
ec_lock_prepare_fd(fop, fop->fd,
|
|
|
d1681e |
EC_UPDATE_DATA | EC_UPDATE_META |
|
|
|
d1681e |
- EC_QUERY_INFO);
|
|
|
d1681e |
+ EC_QUERY_INFO, fl_start, fl_size);
|
|
|
d1681e |
ec_lock(fop);
|
|
|
d1681e |
|
|
|
d1681e |
return EC_STATE_DISPATCH;
|
|
|
d1681e |
@@ -1717,23 +1752,28 @@ int32_t ec_manager_writev(ec_fop_data_t *fop, int32_t state)
|
|
|
d1681e |
cbk->count);
|
|
|
d1681e |
|
|
|
d1681e |
/* This shouldn't fail because we have the inode locked. */
|
|
|
d1681e |
- GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode,
|
|
|
d1681e |
- &cbk->iatt[0].ia_size));
|
|
|
d1681e |
- cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
|
|
|
d1681e |
- size = fop->offset + fop->head + fop->user_size;
|
|
|
d1681e |
- if (size > cbk->iatt[0].ia_size) {
|
|
|
d1681e |
- /* Only update inode size if this is a top level fop.
|
|
|
d1681e |
- * Otherwise this is an internal write and the top
|
|
|
d1681e |
- * level fop should take care of the real inode size.
|
|
|
d1681e |
- */
|
|
|
d1681e |
- if (fop->parent == NULL) {
|
|
|
d1681e |
- /* This shouldn't fail because we have the inode
|
|
|
d1681e |
- * locked. */
|
|
|
d1681e |
- GF_ASSERT(ec_set_inode_size(fop, fop->fd->inode,
|
|
|
d1681e |
- size));
|
|
|
d1681e |
- }
|
|
|
d1681e |
- cbk->iatt[1].ia_size = size;
|
|
|
d1681e |
+ LOCK(&fop->fd->inode->lock);
|
|
|
d1681e |
+ {
|
|
|
d1681e |
+ GF_ASSERT(__ec_get_inode_size(fop, fop->fd->inode,
|
|
|
d1681e |
+ &cbk->iatt[0].ia_size));
|
|
|
d1681e |
+ cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
|
|
|
d1681e |
+ size = fop->offset + fop->head + fop->user_size;
|
|
|
d1681e |
+ if (size > cbk->iatt[0].ia_size) {
|
|
|
d1681e |
+ /* Only update inode size if this is a top level fop.
|
|
|
d1681e |
+ * Otherwise this is an internal write and the top
|
|
|
d1681e |
+ * level fop should take care of the real inode size.
|
|
|
d1681e |
+ */
|
|
|
d1681e |
+ if (fop->parent == NULL) {
|
|
|
d1681e |
+ /* This shouldn't fail because we have the inode
|
|
|
d1681e |
+ * locked. */
|
|
|
d1681e |
+ GF_ASSERT(__ec_set_inode_size(fop,
|
|
|
d1681e |
+ fop->fd->inode, size));
|
|
|
d1681e |
+ }
|
|
|
d1681e |
+ cbk->iatt[1].ia_size = size;
|
|
|
d1681e |
+ }
|
|
|
d1681e |
}
|
|
|
d1681e |
+ UNLOCK(&fop->fd->inode->lock);
|
|
|
d1681e |
+
|
|
|
d1681e |
if (fop->error == 0) {
|
|
|
d1681e |
cbk->op_ret *= ec->fragments;
|
|
|
d1681e |
if (cbk->op_ret < fop->head) {
|
|
|
d1681e |
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
|
|
|
d1681e |
index 5601f96..354b4ed 100644
|
|
|
d1681e |
--- a/xlators/cluster/ec/src/ec-types.h
|
|
|
d1681e |
+++ b/xlators/cluster/ec/src/ec-types.h
|
|
|
d1681e |
@@ -211,8 +211,8 @@ struct _ec_lock {
|
|
|
d1681e |
struct list_head owners;
|
|
|
d1681e |
|
|
|
d1681e |
/* List of fops waiting to be an owner of the lock. Fops are added to this
|
|
|
d1681e |
- * list when the current owner has an incompatible access (shared vs
|
|
|
d1681e |
- * exclusive) or the lock is not acquired yet. */
|
|
|
d1681e |
+ * list when the current owner has an incompatible access (conflicting lock)
|
|
|
d1681e |
+ * or the lock is not acquired yet. */
|
|
|
d1681e |
struct list_head waiting;
|
|
|
d1681e |
|
|
|
d1681e |
/* List of fops that will wait until the next unlock/lock cycle. This
|
|
|
d1681e |
@@ -221,7 +221,6 @@ struct _ec_lock {
|
|
|
d1681e |
* after the lock is reacquired. */
|
|
|
d1681e |
struct list_head frozen;
|
|
|
d1681e |
|
|
|
d1681e |
- int32_t exclusive;
|
|
|
d1681e |
uintptr_t mask;
|
|
|
d1681e |
uintptr_t good_mask;
|
|
|
d1681e |
uintptr_t healing;
|
|
|
d1681e |
@@ -251,6 +250,8 @@ struct _ec_lock_link {
|
|
|
d1681e |
loc_t *base;
|
|
|
d1681e |
uint64_t size;
|
|
|
d1681e |
uint32_t waiting_flags;
|
|
|
d1681e |
+ off_t fl_start;
|
|
|
d1681e |
+ off_t fl_end;
|
|
|
d1681e |
};
|
|
|
d1681e |
|
|
|
d1681e |
struct _ec_fop_data {
|
|
|
d1681e |
@@ -564,6 +565,7 @@ struct _ec {
|
|
|
d1681e |
gf_boolean_t shutdown;
|
|
|
d1681e |
gf_boolean_t eager_lock;
|
|
|
d1681e |
gf_boolean_t optimistic_changelog;
|
|
|
d1681e |
+ gf_boolean_t parallel_writes;
|
|
|
d1681e |
uint32_t background_heals;
|
|
|
d1681e |
uint32_t heal_wait_qlen;
|
|
|
d1681e |
uint32_t self_heal_window_size; /* max size of read/writes */
|
|
|
d1681e |
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
|
|
|
d1681e |
index c32f4ef..856d60c 100644
|
|
|
d1681e |
--- a/xlators/cluster/ec/src/ec.c
|
|
|
d1681e |
+++ b/xlators/cluster/ec/src/ec.c
|
|
|
d1681e |
@@ -295,6 +295,8 @@ reconfigure (xlator_t *this, dict_t *options)
|
|
|
d1681e |
|
|
|
d1681e |
GF_OPTION_RECONF ("optimistic-change-log", ec->optimistic_changelog,
|
|
|
d1681e |
options, bool, failed);
|
|
|
d1681e |
+ GF_OPTION_RECONF ("parallel-writes", ec->parallel_writes,
|
|
|
d1681e |
+ options, bool, failed);
|
|
|
d1681e |
ret = 0;
|
|
|
d1681e |
if (ec_assign_read_policy (ec, read_policy)) {
|
|
|
d1681e |
ret = -1;
|
|
|
d1681e |
@@ -665,6 +667,7 @@ init (xlator_t *this)
|
|
|
d1681e |
GF_OPTION_INIT ("shd-max-threads", ec->shd.max_threads, uint32, failed);
|
|
|
d1681e |
GF_OPTION_INIT ("shd-wait-qlength", ec->shd.wait_qlength, uint32, failed);
|
|
|
d1681e |
GF_OPTION_INIT ("optimistic-change-log", ec->optimistic_changelog, bool, failed);
|
|
|
d1681e |
+ GF_OPTION_INIT ("parallel-writes", ec->parallel_writes, bool, failed);
|
|
|
d1681e |
|
|
|
d1681e |
this->itable = inode_table_new (EC_SHD_INODE_LRU_LIMIT, this);
|
|
|
d1681e |
if (!this->itable)
|
|
|
d1681e |
@@ -1466,28 +1469,34 @@ struct volume_options options[] =
|
|
|
d1681e |
"galois field computations."
|
|
|
d1681e |
},
|
|
|
d1681e |
{ .key = {"self-heal-window-size"},
|
|
|
d1681e |
- .type = GF_OPTION_TYPE_INT,
|
|
|
d1681e |
- .min = 1,
|
|
|
d1681e |
- .max = 1024,
|
|
|
d1681e |
- .default_value = "1",
|
|
|
d1681e |
- .description = "Maximum number blocks(128KB) per file for which "
|
|
|
d1681e |
- "self-heal process would be applied simultaneously."
|
|
|
d1681e |
+ .type = GF_OPTION_TYPE_INT,
|
|
|
d1681e |
+ .min = 1,
|
|
|
d1681e |
+ .max = 1024,
|
|
|
d1681e |
+ .default_value = "1",
|
|
|
d1681e |
+ .description = "Maximum number blocks(128KB) per file for which "
|
|
|
d1681e |
+ "self-heal process would be applied simultaneously."
|
|
|
d1681e |
},
|
|
|
d1681e |
- { .key = {"optimistic-change-log"},
|
|
|
d1681e |
- .type = GF_OPTION_TYPE_BOOL,
|
|
|
d1681e |
- .default_value = "on",
|
|
|
d1681e |
- .description = "Set/Unset dirty flag for every update fop at the start"
|
|
|
d1681e |
- "of the fop. If OFF, this option impacts performance of"
|
|
|
d1681e |
- "entry operations or metadata operations as it will"
|
|
|
d1681e |
- "set dirty flag at the start and unset it at the end of"
|
|
|
d1681e |
- "ALL update fop. If ON and all the bricks are good,"
|
|
|
d1681e |
- "dirty flag will be set at the start only for file fops"
|
|
|
d1681e |
- "For metadata and entry fops dirty flag will not be set"
|
|
|
d1681e |
- "at the start, if all the bricks are good. This does"
|
|
|
d1681e |
- "not impact performance for metadata operations and"
|
|
|
d1681e |
- "entry operation but has a very small window to miss"
|
|
|
d1681e |
- "marking entry as dirty in case it is required to be"
|
|
|
d1681e |
- "healed"
|
|
|
d1681e |
+ { .key = {"optimistic-change-log"},
|
|
|
d1681e |
+ .type = GF_OPTION_TYPE_BOOL,
|
|
|
d1681e |
+ .default_value = "on",
|
|
|
d1681e |
+ .description = "Set/Unset dirty flag for every update fop at the start"
|
|
|
d1681e |
+ "of the fop. If OFF, this option impacts performance of"
|
|
|
d1681e |
+ "entry operations or metadata operations as it will"
|
|
|
d1681e |
+ "set dirty flag at the start and unset it at the end of"
|
|
|
d1681e |
+ "ALL update fop. If ON and all the bricks are good,"
|
|
|
d1681e |
+ "dirty flag will be set at the start only for file fops"
|
|
|
d1681e |
+ "For metadata and entry fops dirty flag will not be set"
|
|
|
d1681e |
+ "at the start, if all the bricks are good. This does"
|
|
|
d1681e |
+ "not impact performance for metadata operations and"
|
|
|
d1681e |
+ "entry operation but has a very small window to miss"
|
|
|
d1681e |
+ "marking entry as dirty in case it is required to be"
|
|
|
d1681e |
+ "healed"
|
|
|
d1681e |
+ },
|
|
|
d1681e |
+ { .key = {"parallel-writes"},
|
|
|
d1681e |
+ .type = GF_OPTION_TYPE_BOOL,
|
|
|
d1681e |
+ .default_value = "on",
|
|
|
d1681e |
+ .description = "This controls if writes can be wound in parallel as long"
|
|
|
d1681e |
+ "as it doesn't modify same stripes"
|
|
|
d1681e |
},
|
|
|
d1681e |
{ .key = {NULL} }
|
|
|
d1681e |
};
|
|
|
d1681e |
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
|
|
|
d1681e |
index 7fe76e5..b15a5af 100644
|
|
|
d1681e |
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
|
|
|
d1681e |
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
|
|
|
d1681e |
@@ -3510,6 +3510,12 @@ struct volopt_map_entry glusterd_volopt_map[] = {
|
|
|
d1681e |
.op_version = GD_OP_VERSION_3_12_0,
|
|
|
d1681e |
.validate_fn = validate_boolean
|
|
|
d1681e |
},
|
|
|
d1681e |
+ { .key = "disperse.parallel-writes",
|
|
|
d1681e |
+ .voltype = "cluster/disperse",
|
|
|
d1681e |
+ .type = NO_DOC,
|
|
|
d1681e |
+ .op_version = GD_OP_VERSION_3_13_0,
|
|
|
d1681e |
+ .flags = OPT_FLAG_CLIENT_OPT
|
|
|
d1681e |
+ },
|
|
|
d1681e |
{ .key = NULL
|
|
|
d1681e |
}
|
|
|
d1681e |
};
|
|
|
d1681e |
--
|
|
|
d1681e |
1.8.3.1
|
|
|
d1681e |
|