|
|
12a457 |
From 13baca9f9e2e86a66713db708d06fa4927d298b5 Mon Sep 17 00:00:00 2001
|
|
|
12a457 |
From: Xavier Hernandez <xhernandez@datalab.es>
|
|
|
12a457 |
Date: Thu, 28 Apr 2016 08:42:40 +0200
|
|
|
12a457 |
Subject: [PATCH 173/178] cluster/ec: Fix issues with eager locking
|
|
|
12a457 |
|
|
|
12a457 |
Due to a race in timer cancellation, in some cases it was possible
|
|
|
12a457 |
to unlock the lock while another concurrent fop that needed it
|
|
|
12a457 |
continues execution as if it were not released.
|
|
|
12a457 |
|
|
|
12a457 |
This patch also fixes an issue that caused a lock to not be released
|
|
|
12a457 |
if an error was found while preparing ec_update_size_version().
|
|
|
12a457 |
|
|
|
12a457 |
master -
|
|
|
12a457 |
http://review.gluster.org/#/c/14112/
|
|
|
12a457 |
|
|
|
12a457 |
release-3.7 -
|
|
|
12a457 |
http://review.gluster.org/#/c/14174/
|
|
|
12a457 |
|
|
|
12a457 |
release-3.8 -
|
|
|
12a457 |
http://review.gluster.org/#/c/14206/
|
|
|
12a457 |
|
|
|
12a457 |
Change-Id: I1344a3f5ecfc333f05a09e62653838264c9c26b1
|
|
|
12a457 |
BUG: 1330997
|
|
|
12a457 |
Signed-off-by: Xavier Hernandez <xhernandez@datalab.es>
|
|
|
12a457 |
Reviewed-on: http://review.gluster.org/14112
|
|
|
12a457 |
Smoke: Gluster Build System <jenkins@build.gluster.com>
|
|
|
12a457 |
CentOS-regression: Gluster Build System <jenkins@build.gluster.com>
|
|
|
12a457 |
Reviewed-by: Chen Chen <chenchen@smartquerier.com>
|
|
|
12a457 |
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
|
|
|
12a457 |
Signed-off-by: Ashish Pandey <aspandey@redhat.com>
|
|
|
12a457 |
Reviewed-on: https://code.engineering.redhat.com/gerrit/75107
|
|
|
12a457 |
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
|
|
|
12a457 |
Tested-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
|
|
|
12a457 |
---
|
|
|
12a457 |
xlators/cluster/ec/src/ec-common.c | 242 ++++++++++++++++++++++++++----------
|
|
|
12a457 |
xlators/cluster/ec/src/ec-data.h | 25 +++-
|
|
|
12a457 |
2 files changed, 192 insertions(+), 75 deletions(-)
|
|
|
12a457 |
|
|
|
12a457 |
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
|
|
|
12a457 |
index de0e597..58cfc73 100644
|
|
|
12a457 |
--- a/xlators/cluster/ec/src/ec-common.c
|
|
|
12a457 |
+++ b/xlators/cluster/ec/src/ec-common.c
|
|
|
12a457 |
@@ -711,8 +711,7 @@ void ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, uint32_t flags,
|
|
|
12a457 |
link->update[EC_METADATA_TXN] = (flags & EC_UPDATE_META) != 0;
|
|
|
12a457 |
link->base = base;
|
|
|
12a457 |
|
|
|
12a457 |
- lock->refs++;
|
|
|
12a457 |
- lock->inserted++;
|
|
|
12a457 |
+ lock->refs_pending++;
|
|
|
12a457 |
}
|
|
|
12a457 |
|
|
|
12a457 |
void ec_lock_prepare_inode_internal(ec_fop_data_t *fop, loc_t *loc,
|
|
|
12a457 |
@@ -1347,6 +1346,7 @@ ec_lock_wake_shared(ec_lock_t *lock, struct list_head *list)
|
|
|
12a457 |
list_move_tail(&link->wait_list, list);
|
|
|
12a457 |
|
|
|
12a457 |
list_add_tail(&link->owner_list, &lock->owners);
|
|
|
12a457 |
+ lock->refs_owners++;
|
|
|
12a457 |
|
|
|
12a457 |
ec_lock_update_fd(lock, fop);
|
|
|
12a457 |
}
|
|
|
12a457 |
@@ -1478,6 +1478,8 @@ ec_lock_assign_owner(ec_lock_link_t *link)
|
|
|
12a457 |
ec_lock_link_t *timer_link = NULL;
|
|
|
12a457 |
gf_boolean_t assigned = _gf_false;
|
|
|
12a457 |
|
|
|
12a457 |
+ /* The link cannot be in any list because we have just finished preparing
|
|
|
12a457 |
+ * it. */
|
|
|
12a457 |
GF_ASSERT(list_empty(&link->wait_list));
|
|
|
12a457 |
|
|
|
12a457 |
fop = link->fop;
|
|
|
12a457 |
@@ -1485,27 +1487,85 @@ ec_lock_assign_owner(ec_lock_link_t *link)
|
|
|
12a457 |
|
|
|
12a457 |
LOCK(&lock->loc.inode->lock);
|
|
|
12a457 |
|
|
|
12a457 |
- GF_ASSERT (lock->inserted > 0);
|
|
|
12a457 |
- lock->inserted--;
|
|
|
12a457 |
+ /* Since the link has just been prepared but it's not active yet, the
|
|
|
12a457 |
+ * refs_pending must be one at least (the ref owned by this link). */
|
|
|
12a457 |
+ GF_ASSERT (lock->refs_pending > 0);
|
|
|
12a457 |
+ /* The link is not pending any more. It will be assigned to the owner,
|
|
|
12a457 |
+ * waiting or frozen list. */
|
|
|
12a457 |
+ lock->refs_pending--;
|
|
|
12a457 |
|
|
|
12a457 |
if (lock->release) {
|
|
|
12a457 |
ec_trace("LOCK_QUEUE_FREEZE", fop, "lock=%p", lock);
|
|
|
12a457 |
|
|
|
12a457 |
- list_add_tail(&link->wait_list, &lock->frozen);
|
|
|
12a457 |
+ /* When lock->release is set, we'll unlock the lock as soon as
|
|
|
12a457 |
+ * possible, meaning that we won't use a timer. */
|
|
|
12a457 |
+ GF_ASSERT(lock->timer == NULL);
|
|
|
12a457 |
|
|
|
12a457 |
- /* The lock is frozen, so we move the current reference to refs_frozen.
|
|
|
12a457 |
- * After that, there should remain at least one ref belonging to the
|
|
|
12a457 |
- * lock that is processing the release. */
|
|
|
12a457 |
- lock->refs--;
|
|
|
12a457 |
- GF_ASSERT(lock->refs > 0);
|
|
|
12a457 |
- lock->refs_frozen++;
|
|
|
12a457 |
+ /* The lock is marked to be released. We can still have owners and fops
|
|
|
12a457 |
+ * in the waiting ilist f they have been added before the lock has been
|
|
|
12a457 |
+ * marked to be released. However new fops are put into the frozen list
|
|
|
12a457 |
+ * to wait for the next unlock/lock cycle. */
|
|
|
12a457 |
+ list_add_tail(&link->wait_list, &lock->frozen);
|
|
|
12a457 |
|
|
|
12a457 |
goto unlock;
|
|
|
12a457 |
}
|
|
|
12a457 |
|
|
|
12a457 |
+ /* The lock is not marked to be released, so the frozen list should be
|
|
|
12a457 |
+ * empty. */
|
|
|
12a457 |
+ GF_ASSERT(list_empty(&lock->frozen));
|
|
|
12a457 |
+
|
|
|
12a457 |
+ if (lock->timer != NULL) {
|
|
|
12a457 |
+ /* We are trying to acquire a lock that has an unlock timer active.
|
|
|
12a457 |
+ * This means that the lock must be idle, i.e. no fop can be in the
|
|
|
12a457 |
+ * owner, waiting or frozen lists. It also means that the lock cannot
|
|
|
12a457 |
+ * have been marked as being released (this is done without timers)
|
|
|
12a457 |
+ * and it must not be exclusive. There should only be one owner
|
|
|
12a457 |
+ * reference, but it's possible that some fops are being prepared to
|
|
|
12a457 |
+ * use this lock. */
|
|
|
12a457 |
+ GF_ASSERT ((lock->exclusive == 0) && (lock->refs_owners == 1) &&
|
|
|
12a457 |
+ list_empty(&lock->owners) && list_empty(&lock->waiting));
|
|
|
12a457 |
+
|
|
|
12a457 |
+ /* We take the timer_link before cancelling the timer, since a
|
|
|
12a457 |
+ * successful cancellation will destroy it. It must not be NULL
|
|
|
12a457 |
+ * because it references the fop responsible for the delayed unlock
|
|
|
12a457 |
+ * that we are currently trying to cancel. */
|
|
|
12a457 |
+ timer_link = lock->timer->data;
|
|
|
12a457 |
+ GF_ASSERT(timer_link != NULL);
|
|
|
12a457 |
+
|
|
|
12a457 |
+ gf_timer_call_cancel(fop->xl->ctx, lock->timer);
|
|
|
12a457 |
+ ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock);
|
|
|
12a457 |
+
|
|
|
12a457 |
+ /* We have two options here:
|
|
|
12a457 |
+ *
|
|
|
12a457 |
+ * 1. The timer has been successfully cancelled.
|
|
|
12a457 |
+ *
|
|
|
12a457 |
+ * This is the easiest case and we can continue with the currently
|
|
|
12a457 |
+ * acquired lock.
|
|
|
12a457 |
+ *
|
|
|
12a457 |
+ * 2. The timer callback has already been fired.
|
|
|
12a457 |
+ *
|
|
|
12a457 |
+ * In this case we have not been able to cancel the timer before
|
|
|
12a457 |
+ * the timer callback has been fired, but we also know that
|
|
|
12a457 |
+ * lock->timer != NULL. This means that the timer callback is still
|
|
|
12a457 |
+ * trying to acquire the inode mutex that we currently own. We are
|
|
|
12a457 |
+ * safe until we release it. In this case we can safely clear
|
|
|
12a457 |
+ * lock->timer. This will cause that the timer callback does nothing
|
|
|
12a457 |
+ * once it acquires the mutex.
|
|
|
12a457 |
+ *
|
|
|
12a457 |
+ * In both cases we must release the owner reference assigned to the
|
|
|
12a457 |
+ * fop that was handling the unlock because ec_unlock_now() won't be
|
|
|
12a457 |
+ * called for that fop.
|
|
|
12a457 |
+ */
|
|
|
12a457 |
+ lock->timer = NULL;
|
|
|
12a457 |
+ lock->refs_owners--;
|
|
|
12a457 |
+ }
|
|
|
12a457 |
+
|
|
|
12a457 |
lock->exclusive |= (fop->flags & EC_FLAG_LOCK_SHARED) == 0;
|
|
|
12a457 |
|
|
|
12a457 |
if (!list_empty(&lock->owners)) {
|
|
|
12a457 |
+ /* There are other owners of this lock. We can only take ownership if
|
|
|
12a457 |
+ * the lock is already acquired and can be shared. Otherwise we need
|
|
|
12a457 |
+ * to wait. */
|
|
|
12a457 |
if (!lock->acquired || (lock->exclusive != 0)) {
|
|
|
12a457 |
ec_trace("LOCK_QUEUE_WAIT", fop, "lock=%p", lock);
|
|
|
12a457 |
|
|
|
12a457 |
@@ -1513,36 +1573,24 @@ ec_lock_assign_owner(ec_lock_link_t *link)
|
|
|
12a457 |
|
|
|
12a457 |
goto unlock;
|
|
|
12a457 |
}
|
|
|
12a457 |
- } else if (lock->timer != NULL) {
|
|
|
12a457 |
- GF_ASSERT (lock->release == _gf_false);
|
|
|
12a457 |
-
|
|
|
12a457 |
- timer_link = lock->timer->data;
|
|
|
12a457 |
- if (gf_timer_call_cancel(fop->xl->ctx, lock->timer) == 0) {
|
|
|
12a457 |
- ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock);
|
|
|
12a457 |
- lock->timer = NULL;
|
|
|
12a457 |
- lock->refs--;
|
|
|
12a457 |
- /* There should remain at least 1 ref, the current one. */
|
|
|
12a457 |
- GF_ASSERT(lock->refs > 0);
|
|
|
12a457 |
- } else {
|
|
|
12a457 |
- /* Timer expired and on the way to unlock.
|
|
|
12a457 |
- * Set lock->release to _gf_true, so that this
|
|
|
12a457 |
- * lock will be put in frozen list*/
|
|
|
12a457 |
- timer_link = NULL;
|
|
|
12a457 |
- lock->release = _gf_true;
|
|
|
12a457 |
- }
|
|
|
12a457 |
}
|
|
|
12a457 |
|
|
|
12a457 |
list_add_tail(&link->owner_list, &lock->owners);
|
|
|
12a457 |
+ lock->refs_owners++;
|
|
|
12a457 |
|
|
|
12a457 |
assigned = _gf_true;
|
|
|
12a457 |
|
|
|
12a457 |
unlock:
|
|
|
12a457 |
if (!assigned) {
|
|
|
12a457 |
+ /* We have not been able to take ownership of this lock. The fop must
|
|
|
12a457 |
+ * be put to sleep. */
|
|
|
12a457 |
ec_sleep(fop);
|
|
|
12a457 |
}
|
|
|
12a457 |
|
|
|
12a457 |
UNLOCK(&lock->loc.inode->lock);
|
|
|
12a457 |
|
|
|
12a457 |
+ /* If we have cancelled the timer, we need to resume the fop that was
|
|
|
12a457 |
+ * waiting for it. */
|
|
|
12a457 |
if (timer_link != NULL) {
|
|
|
12a457 |
ec_resume(timer_link->fop, 0);
|
|
|
12a457 |
}
|
|
|
12a457 |
@@ -1566,8 +1614,14 @@ ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk,
|
|
|
12a457 |
|
|
|
12a457 |
ec_trace("LOCK_DONE", fop, "lock=%p", lock);
|
|
|
12a457 |
|
|
|
12a457 |
- GF_ASSERT(!list_empty(&link->owner_list));
|
|
|
12a457 |
+ /* Current link must belong to the owner list of the lock. We don't
|
|
|
12a457 |
+ * decrement lock->refs_owners here because the inode mutex is released
|
|
|
12a457 |
+ * before ec_unlock() is called and we need to know when the last owner
|
|
|
12a457 |
+ * unlocks the lock to do proper cleanup. lock->refs_owners is used for
|
|
|
12a457 |
+ * this task. */
|
|
|
12a457 |
+ GF_ASSERT((lock->refs_owners > 0) && !list_empty(&link->owner_list));
|
|
|
12a457 |
list_del_init(&link->owner_list);
|
|
|
12a457 |
+
|
|
|
12a457 |
lock->release |= release;
|
|
|
12a457 |
|
|
|
12a457 |
if ((fop->error == 0) && (cbk != NULL) && (cbk->op_ret >= 0)) {
|
|
|
12a457 |
@@ -1625,6 +1679,7 @@ ec_lock_unfreeze(ec_lock_link_t *link)
|
|
|
12a457 |
{
|
|
|
12a457 |
struct list_head list;
|
|
|
12a457 |
ec_lock_t *lock;
|
|
|
12a457 |
+ gf_boolean_t destroy = _gf_false;
|
|
|
12a457 |
|
|
|
12a457 |
lock = link->lock;
|
|
|
12a457 |
|
|
|
12a457 |
@@ -1632,18 +1687,30 @@ ec_lock_unfreeze(ec_lock_link_t *link)
|
|
|
12a457 |
|
|
|
12a457 |
LOCK(&lock->loc.inode->lock);
|
|
|
12a457 |
|
|
|
12a457 |
- lock->acquired = _gf_false;
|
|
|
12a457 |
+ /* The lock must be marked to be released here, since we have just released
|
|
|
12a457 |
+ * it and any attempt to assign it to more fops must have added them to the
|
|
|
12a457 |
+ * frozen list. We can only have one active reference here: the one that
|
|
|
12a457 |
+ * is processing this unfreeze. */
|
|
|
12a457 |
+ GF_ASSERT(lock->release && (lock->refs_owners == 1));
|
|
|
12a457 |
lock->release = _gf_false;
|
|
|
12a457 |
- lock->refs--;
|
|
|
12a457 |
+ lock->refs_owners = 0;
|
|
|
12a457 |
|
|
|
12a457 |
- GF_ASSERT (lock->refs == lock->inserted);
|
|
|
12a457 |
- GF_ASSERT(lock->exclusive == 0);
|
|
|
12a457 |
- GF_ASSERT(list_empty(&lock->waiting) && list_empty(&lock->owners));
|
|
|
12a457 |
+ lock->acquired = _gf_false;
|
|
|
12a457 |
+
|
|
|
12a457 |
+ /* We are unfreezing a lock. This means that the lock has already been
|
|
|
12a457 |
+ * released. In this state it shouldn't be exclusive nor have a pending
|
|
|
12a457 |
+ * timer nor have any owner, and the waiting list should be empty. Only
|
|
|
12a457 |
+ * the frozen list can contain some fop. */
|
|
|
12a457 |
+ GF_ASSERT((lock->exclusive == 0) && (lock->timer == NULL) &&
|
|
|
12a457 |
+ list_empty(&lock->waiting) && list_empty(&lock->owners));
|
|
|
12a457 |
|
|
|
12a457 |
+ /* We move all frozen fops to the waiting list. */
|
|
|
12a457 |
list_splice_init(&lock->frozen, &lock->waiting);
|
|
|
12a457 |
- lock->refs += lock->refs_frozen;
|
|
|
12a457 |
- lock->refs_frozen = 0;
|
|
|
12a457 |
- if (lock->refs == 0) {
|
|
|
12a457 |
+
|
|
|
12a457 |
+ /* If we don't have any fop waiting nor there are any prepared fops using
|
|
|
12a457 |
+ * this lock, we can finally dispose it. */
|
|
|
12a457 |
+ destroy = list_empty(&lock->waiting) && (lock->refs_pending == 0);
|
|
|
12a457 |
+ if (destroy) {
|
|
|
12a457 |
ec_trace("LOCK_DESTROY", link->fop, "lock=%p", lock);
|
|
|
12a457 |
|
|
|
12a457 |
lock->ctx->inode_lock = NULL;
|
|
|
12a457 |
@@ -1657,7 +1724,7 @@ ec_lock_unfreeze(ec_lock_link_t *link)
|
|
|
12a457 |
|
|
|
12a457 |
ec_lock_resume_shared(&list);
|
|
|
12a457 |
|
|
|
12a457 |
- if (lock->refs == 0) {
|
|
|
12a457 |
+ if (destroy) {
|
|
|
12a457 |
ec_lock_destroy(lock);
|
|
|
12a457 |
}
|
|
|
12a457 |
}
|
|
|
12a457 |
@@ -1770,9 +1837,6 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
|
|
|
12a457 |
|
|
|
12a457 |
fop = link->fop;
|
|
|
12a457 |
|
|
|
12a457 |
- GF_ASSERT(version[0] < 0x100000000);
|
|
|
12a457 |
- GF_ASSERT(version[1] < 0x100000000);
|
|
|
12a457 |
-
|
|
|
12a457 |
ec_trace("UPDATE", fop, "version=%ld/%ld, size=%ld, dirty=%ld/%ld",
|
|
|
12a457 |
version[0], version[1], size, dirty[0], dirty[1]);
|
|
|
12a457 |
|
|
|
12a457 |
@@ -1814,7 +1878,7 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
|
|
|
12a457 |
}
|
|
|
12a457 |
}
|
|
|
12a457 |
|
|
|
12a457 |
- /* If config information is not know, we request it now. */
|
|
|
12a457 |
+ /* If config information is not known, we request it now. */
|
|
|
12a457 |
if ((lock->loc.inode->ia_type == IA_IFREG) && !ctx->have_config) {
|
|
|
12a457 |
/* A failure requesting this xattr is ignored because it's not
|
|
|
12a457 |
* absolutely required right now. */
|
|
|
12a457 |
@@ -1850,6 +1914,13 @@ out:
|
|
|
12a457 |
|
|
|
12a457 |
gf_msg (fop->xl->name, GF_LOG_ERROR, -err, EC_MSG_SIZE_VERS_UPDATE_FAIL,
|
|
|
12a457 |
"Unable to update version and size");
|
|
|
12a457 |
+
|
|
|
12a457 |
+ if ((fop->parent->id != GF_FOP_FLUSH) &&
|
|
|
12a457 |
+ (fop->parent->id != GF_FOP_FSYNC) &&
|
|
|
12a457 |
+ (fop->parent->id != GF_FOP_FSYNCDIR)) {
|
|
|
12a457 |
+ ec_unlock_lock(fop->data);
|
|
|
12a457 |
+ }
|
|
|
12a457 |
+
|
|
|
12a457 |
}
|
|
|
12a457 |
|
|
|
12a457 |
gf_boolean_t
|
|
|
12a457 |
@@ -1900,7 +1971,6 @@ ec_unlock_now(ec_lock_link_t *link)
|
|
|
12a457 |
void
|
|
|
12a457 |
ec_unlock_timer_del(ec_lock_link_t *link)
|
|
|
12a457 |
{
|
|
|
12a457 |
- int32_t before = 0;
|
|
|
12a457 |
ec_lock_t *lock;
|
|
|
12a457 |
inode_t *inode;
|
|
|
12a457 |
gf_boolean_t now = _gf_false;
|
|
|
12a457 |
@@ -1922,22 +1992,24 @@ ec_unlock_timer_del(ec_lock_link_t *link)
|
|
|
12a457 |
if (lock->timer != NULL) {
|
|
|
12a457 |
ec_trace("UNLOCK_DELAYED", link->fop, "lock=%p", lock);
|
|
|
12a457 |
|
|
|
12a457 |
+ /* The unlock timer has expired without anyone cancelling it.
|
|
|
12a457 |
+ * This means that it shouldn't have any owner, and the
|
|
|
12a457 |
+ * waiting and frozen lists should be empty. It shouldn't have
|
|
|
12a457 |
+ * been marked as release nor be exclusive either. It must have
|
|
|
12a457 |
+ * only one owner reference, but there can be fops being
|
|
|
12a457 |
+ * prepared though. */
|
|
|
12a457 |
+ GF_ASSERT(!lock->release && (lock->exclusive == 0) &&
|
|
|
12a457 |
+ (lock->refs_owners == 1) &&
|
|
|
12a457 |
+ list_empty(&lock->owners) &&
|
|
|
12a457 |
+ list_empty(&lock->waiting) &&
|
|
|
12a457 |
+ list_empty(&lock->frozen));
|
|
|
12a457 |
+
|
|
|
12a457 |
gf_timer_call_cancel(link->fop->xl->ctx, lock->timer);
|
|
|
12a457 |
lock->timer = NULL;
|
|
|
12a457 |
|
|
|
12a457 |
+ /* Any fop being processed from now on, will need to wait
|
|
|
12a457 |
+ * until the next unlock/lock cycle. */
|
|
|
12a457 |
lock->release = now = _gf_true;
|
|
|
12a457 |
-
|
|
|
12a457 |
- /* TODO: If the assertion is really true, following code is
|
|
|
12a457 |
- * not needed. */
|
|
|
12a457 |
- GF_ASSERT(list_empty(&lock->waiting));
|
|
|
12a457 |
-
|
|
|
12a457 |
- before = lock->refs + lock->refs_frozen;
|
|
|
12a457 |
- list_splice_init(&lock->waiting, &lock->frozen);
|
|
|
12a457 |
- lock->refs_frozen += lock->refs - lock->inserted - 1;
|
|
|
12a457 |
- lock->refs = 1 + lock->inserted;
|
|
|
12a457 |
- /* We moved around the locks, so total number of locks shouldn't
|
|
|
12a457 |
- * change by this operation*/
|
|
|
12a457 |
- GF_ASSERT (before == (lock->refs + lock->refs_frozen));
|
|
|
12a457 |
}
|
|
|
12a457 |
|
|
|
12a457 |
UNLOCK(&inode->lock);
|
|
|
12a457 |
@@ -1961,24 +2033,50 @@ void ec_unlock_timer_add(ec_lock_link_t *link)
|
|
|
12a457 |
|
|
|
12a457 |
LOCK(&lock->loc.inode->lock);
|
|
|
12a457 |
|
|
|
12a457 |
- GF_ASSERT(lock->timer == NULL);
|
|
|
12a457 |
+ /* We are trying to unlock the lock. We can have multiple scenarios here,
|
|
|
12a457 |
+ * but all of them need to have lock->timer == NULL:
|
|
|
12a457 |
+ *
|
|
|
12a457 |
+ * 1. There are other owners currently running that can call ec_unlock().
|
|
|
12a457 |
+ *
|
|
|
12a457 |
+ * None of them can have started the timer until the last one. But this
|
|
|
12a457 |
+ * call should be the consequence of this lastest one.
|
|
|
12a457 |
+ *
|
|
|
12a457 |
+ * 2. There are fops in the waiting or frozen lists.
|
|
|
12a457 |
+ *
|
|
|
12a457 |
+ * These fops cannot call ec_unlock(). So we should be here.
|
|
|
12a457 |
+ *
|
|
|
12a457 |
+ * We must reach here with at least one owner reference.
|
|
|
12a457 |
+ */
|
|
|
12a457 |
+ GF_ASSERT((lock->timer == NULL) && (lock->refs_owners > 0));
|
|
|
12a457 |
|
|
|
12a457 |
- if ((lock->refs - lock->inserted) > 1) {
|
|
|
12a457 |
+ /* If the fop detects that a heal is needed, we mark the lock to be
|
|
|
12a457 |
+ * released as soon as possible. */
|
|
|
12a457 |
+ lock->release |= ec_fop_needs_heal(fop);
|
|
|
12a457 |
+
|
|
|
12a457 |
+ if (lock->refs_owners > 1) {
|
|
|
12a457 |
ec_trace("UNLOCK_SKIP", fop, "lock=%p", lock);
|
|
|
12a457 |
|
|
|
12a457 |
- lock->refs--;
|
|
|
12a457 |
+ /* If there are other owners we cannot do anything else with the lock.
|
|
|
12a457 |
+ * Note that the current fop has already been removed from the owners
|
|
|
12a457 |
+ * list in ec_lock_reuse(). */
|
|
|
12a457 |
+ lock->refs_owners--;
|
|
|
12a457 |
|
|
|
12a457 |
UNLOCK(&lock->loc.inode->lock);
|
|
|
12a457 |
} else if (lock->acquired) {
|
|
|
12a457 |
- ec_t *ec = fop->xl->private;
|
|
|
12a457 |
+ /* There are no other owners and the lock is acquired. If there were
|
|
|
12a457 |
+ * fops waiting, at least one of them should have been promoted to an
|
|
|
12a457 |
+ * owner, so the waiting list should be empty. */
|
|
|
12a457 |
+ GF_ASSERT(list_empty(&lock->owners) && list_empty(&lock->waiting));
|
|
|
12a457 |
|
|
|
12a457 |
- GF_ASSERT(list_empty(&lock->owners));
|
|
|
12a457 |
+ ec_t *ec = fop->xl->private;
|
|
|
12a457 |
|
|
|
12a457 |
+ /* If everything goes as expected this fop will be put to sleep until
|
|
|
12a457 |
+ * the timer callback is executed. */
|
|
|
12a457 |
ec_sleep(fop);
|
|
|
12a457 |
|
|
|
12a457 |
- /* If healing is needed, the lock needs to be released due to
|
|
|
12a457 |
- * contention, or ec is shutting down, do not delay lock release. */
|
|
|
12a457 |
- if (!lock->release && !ec_fop_needs_heal(fop) && !ec->shutdown) {
|
|
|
12a457 |
+ /* If the lock needs to be released, or ec is shutting down, do not
|
|
|
12a457 |
+ * delay lock release. */
|
|
|
12a457 |
+ if (!lock->release && !ec->shutdown) {
|
|
|
12a457 |
ec_trace("UNLOCK_DELAY", fop, "lock=%p, release=%d", lock,
|
|
|
12a457 |
lock->release);
|
|
|
12a457 |
|
|
|
12a457 |
@@ -1989,9 +2087,10 @@ void ec_unlock_timer_add(ec_lock_link_t *link)
|
|
|
12a457 |
if (lock->timer == NULL) {
|
|
|
12a457 |
gf_msg(fop->xl->name, GF_LOG_WARNING, ENOMEM,
|
|
|
12a457 |
EC_MSG_UNLOCK_DELAY_FAILED,
|
|
|
12a457 |
- "Unable to delay an "
|
|
|
12a457 |
- "unlock");
|
|
|
12a457 |
+ "Unable to delay an unlock");
|
|
|
12a457 |
|
|
|
12a457 |
+ /* We are unable to create a new timer. We immediately release
|
|
|
12a457 |
+ * the lock. */
|
|
|
12a457 |
lock->release = now = _gf_true;
|
|
|
12a457 |
}
|
|
|
12a457 |
} else {
|
|
|
12a457 |
@@ -2006,10 +2105,17 @@ void ec_unlock_timer_add(ec_lock_link_t *link)
|
|
|
12a457 |
ec_unlock_now(link);
|
|
|
12a457 |
}
|
|
|
12a457 |
} else {
|
|
|
12a457 |
+ /* There are no owners and the lock is not acquired. This can only
|
|
|
12a457 |
+ * happen if a lock attempt has failed and we get to the unlock step
|
|
|
12a457 |
+ * of the fop. As in the previous case, the waiting list must be
|
|
|
12a457 |
+ * empty. */
|
|
|
12a457 |
+ GF_ASSERT(list_empty(&lock->owners) && list_empty(&lock->waiting));
|
|
|
12a457 |
+
|
|
|
12a457 |
+ /* We need to mark the lock to be released to correctly handle fops
|
|
|
12a457 |
+ * that may get in after we release the inode mutex but before
|
|
|
12a457 |
+ * ec_lock_unfreeze() is processed. */
|
|
|
12a457 |
lock->release = _gf_true;
|
|
|
12a457 |
|
|
|
12a457 |
- GF_ASSERT(list_empty(&lock->owners));
|
|
|
12a457 |
-
|
|
|
12a457 |
UNLOCK(&lock->loc.inode->lock);
|
|
|
12a457 |
|
|
|
12a457 |
ec_lock_unfreeze(link);
|
|
|
12a457 |
@@ -2052,7 +2158,7 @@ void ec_lock_reuse(ec_fop_data_t *fop)
|
|
|
12a457 |
}
|
|
|
12a457 |
}
|
|
|
12a457 |
} else {
|
|
|
12a457 |
- /* If eager lock is disabled or If we haven't get
|
|
|
12a457 |
+ /* If eager lock is disabled or if we haven't get
|
|
|
12a457 |
* an answer with enough quorum, we always release
|
|
|
12a457 |
* the lock. */
|
|
|
12a457 |
release = _gf_true;
|
|
|
12a457 |
diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h
|
|
|
12a457 |
index 9107b4b..f4214ec 100644
|
|
|
12a457 |
--- a/xlators/cluster/ec/src/ec-data.h
|
|
|
12a457 |
+++ b/xlators/cluster/ec/src/ec-data.h
|
|
|
12a457 |
@@ -139,17 +139,28 @@ struct _ec_lock
|
|
|
12a457 |
{
|
|
|
12a457 |
ec_inode_t *ctx;
|
|
|
12a457 |
gf_timer_t *timer;
|
|
|
12a457 |
- struct list_head owners; /* List of owners of this lock. */
|
|
|
12a457 |
- struct list_head waiting; /* Queue of requests being serviced. */
|
|
|
12a457 |
- struct list_head frozen; /* Queue of requests that will be serviced in
|
|
|
12a457 |
- the next unlock/lock cycle. */
|
|
|
12a457 |
+
|
|
|
12a457 |
+ /* List of owners of this lock. All fops added to this list are running
|
|
|
12a457 |
+ * concurrently. */
|
|
|
12a457 |
+ struct list_head owners;
|
|
|
12a457 |
+
|
|
|
12a457 |
+ /* List of fops waiting to be an owner of the lock. Fops are added to this
|
|
|
12a457 |
+ * list when the current owner has an incompatible access (shared vs
|
|
|
12a457 |
+ * exclusive) or the lock is not acquired yet. */
|
|
|
12a457 |
+ struct list_head waiting;
|
|
|
12a457 |
+
|
|
|
12a457 |
+ /* List of fops that will wait until the next unlock/lock cycle. This
|
|
|
12a457 |
+ * happens when the currently acquired lock is decided to be released as
|
|
|
12a457 |
+ * soon as possible. In this case, all frozen fops will be continued only
|
|
|
12a457 |
+ * after the lock is reacquired. */
|
|
|
12a457 |
+ struct list_head frozen;
|
|
|
12a457 |
+
|
|
|
12a457 |
int32_t exclusive;
|
|
|
12a457 |
uintptr_t mask;
|
|
|
12a457 |
uintptr_t good_mask;
|
|
|
12a457 |
uintptr_t healing;
|
|
|
12a457 |
- int32_t refs;
|
|
|
12a457 |
- int32_t refs_frozen;
|
|
|
12a457 |
- int32_t inserted;
|
|
|
12a457 |
+ uint32_t refs_owners; /* Refs for fops owning the lock */
|
|
|
12a457 |
+ uint32_t refs_pending; /* Refs assigned to fops being prepared */
|
|
|
12a457 |
gf_boolean_t acquired;
|
|
|
12a457 |
gf_boolean_t getting_size;
|
|
|
12a457 |
gf_boolean_t release;
|
|
|
12a457 |
--
|
|
|
12a457 |
1.7.1
|
|
|
12a457 |
|