From e3813685237dbdf8dc7cf28726fff2caf2288706 Mon Sep 17 00:00:00 2001 From: Xavi Hernandez Date: Mon, 19 Jul 2021 15:37:02 +0200 Subject: [PATCH 588/610] locks: Fix null gfid in lock contention notifications This patch fixes 3 problems: First problem: After commit c0bd592e, the pl_inode_t object was also created in the cbk of lookup requests. Lookup requests are a bit different than any other request because the inode received may not be completely initialized. In particular, inode->gfid may be null. This caused that the gfid stored in the pl_inode_t object was null in some cases. This gfid is used mostly for logs, but also to send lock contention notifications. This meant that some notifications could be sent with a null gfid, making impossible for the client xlator to correctly identify the contending inode, so the lock was not released immediately when eager-lock was also enabled. Second problem: The feature introduced by c0bd592e needed to track the number of hardlinks of each inode to detect when it was deleted. However it was done using the 'get-link-count' special xattr on lookup, while posix only implements it for unlink and rename. Also, the number of hardlinks was not incremented for mkdir, mknod, rename, ..., so it didn't work correctly for directories. Third problem: When the last hardlink of an open file is deleted, all locks will be denied with ESTALE error, but that's not correct. Access to the open fd must succeed. The first problem is fixed by avoiding creating pl_inode_t objects during lookup. Second and third problems are fixed by completely ignoring if the file has been deleted or not. Even if we grant a lock on a non-existing file, the next operation done by the client inside the lock will return the correct error, which should be enough. Upstream patch: > Upstream-patch-link: https://github.com/gluster/glusterfs/pull/2553 > Fixes: #2551 > Change-Id: Ic73e82f6b725b838c1600b6a128ea36a75f13253 > Signed-off-by: Xavi Hernandez BUG: 1962972 Change-Id: Ic73e82f6b725b838c1600b6a128ea36a75f13253 Signed-off-by: Xavi Hernandez Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/279192 Tested-by: RHGS Build Bot Reviewed-by: Sunil Kumar Heggodu Gopala Acharya --- tests/bugs/locks/issue-2551.t | 58 ++++++++++++++++++ xlators/features/locks/src/common.c | 31 +++------- xlators/features/locks/src/locks.h | 2 - xlators/features/locks/src/posix.c | 118 +++--------------------------------- 4 files changed, 74 insertions(+), 135 deletions(-) create mode 100644 tests/bugs/locks/issue-2551.t diff --git a/tests/bugs/locks/issue-2551.t b/tests/bugs/locks/issue-2551.t new file mode 100644 index 0000000..a32af02 --- /dev/null +++ b/tests/bugs/locks/issue-2551.t @@ -0,0 +1,58 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +function check_time() { + local max="${1}" + local start="$(date +"%s")" + + shift + + if "${@}"; then + if [[ $(($(date +"%s") - ${start})) -lt ${max} ]]; then + return 0 + fi + fi + + return 1 +} + +cleanup + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 disperse 3 redundancy 1 $H0:$B0/brick{0..2} +TEST $CLI volume set $V0 disperse.eager-lock on +TEST $CLI volume set $V0 disperse.eager-lock-timeout 30 +TEST $CLI volume set $V0 features.locks-notify-contention on +TEST $CLI volume set $V0 performance.write-behind off +TEST $CLI volume set $V0 performance.open-behind off +TEST $CLI volume set $V0 performance.quick-read off + +TEST $CLI volume start $V0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick1 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick2 + +TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 $M0 + +TEST mkdir $M0/dir +TEST dd if=/dev/zero of=$M0/dir/test bs=4k count=1 +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 + +TEST $CLI volume stop $V0 +TEST $CLI volume start $V0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick1 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick2 + +TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 $M0 + +TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M1 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 $M1 + +TEST dd if=/dev/zero of=$M0/dir/test bs=4k count=1 conv=notrunc +TEST check_time 5 dd if=/dev/zero of=$M1/dir/test bs=4k count=1 conv=notrunc diff --git a/xlators/features/locks/src/common.c b/xlators/features/locks/src/common.c index cddbfa6..5403086 100644 --- a/xlators/features/locks/src/common.c +++ b/xlators/features/locks/src/common.c @@ -468,9 +468,7 @@ pl_inode_get(xlator_t *this, inode_t *inode, pl_local_t *local) pl_inode->check_mlock_info = _gf_true; pl_inode->mlock_enforced = _gf_false; - /* -2 means never looked up. -1 means something went wrong and link - * tracking is disabled. */ - pl_inode->links = -2; + pl_inode->remove_running = 0; ret = __inode_ctx_put(inode, this, (uint64_t)(long)(pl_inode)); if (ret) { @@ -1403,11 +1401,6 @@ pl_inode_remove_prepare(xlator_t *xl, call_frame_t *frame, loc_t *loc, pthread_mutex_lock(&pl_inode->mutex); - if (pl_inode->removed) { - error = ESTALE; - goto unlock; - } - if (pl_inode_has_owners(xl, frame->root->client, pl_inode, &now, contend)) { error = -1; /* We skip the unlock here because the caller must create a stub when @@ -1420,7 +1413,6 @@ pl_inode_remove_prepare(xlator_t *xl, call_frame_t *frame, loc_t *loc, pl_inode->is_locked = _gf_true; pl_inode->remove_running++; -unlock: pthread_mutex_unlock(&pl_inode->mutex); done: @@ -1490,20 +1482,18 @@ pl_inode_remove_cbk(xlator_t *xl, pl_inode_t *pl_inode, int32_t error) pthread_mutex_lock(&pl_inode->mutex); - if (error == 0) { - if (pl_inode->links >= 0) { - pl_inode->links--; - } - if (pl_inode->links == 0) { - pl_inode->removed = _gf_true; - } - } - pl_inode->remove_running--; if ((pl_inode->remove_running == 0) && list_empty(&pl_inode->waiting)) { pl_inode->is_locked = _gf_false; + /* At this point it's possible that the inode has been deleted, but + * there could be open fd's still referencing it, so we can't prevent + * pending locks from being granted. If the file has really been + * deleted, whatever the client does once the lock is granted will + * fail with the appropriate error, so we don't need to worry about + * it here. */ + list_for_each_entry(dom, &pl_inode->dom_list, inode_list) { __grant_blocked_inode_locks(xl, pl_inode, &granted, dom, &now, @@ -1555,11 +1545,6 @@ pl_inode_remove_inodelk(pl_inode_t *pl_inode, pl_inode_lock_t *lock) pl_dom_list_t *dom; pl_inode_lock_t *ilock; - /* If the inode has been deleted, we won't allow any lock. */ - if (pl_inode->removed) { - return -ESTALE; - } - /* We only synchronize with locks made for regular operations coming from * the user. Locks done for internal purposes are hard to control and could * lead to long delays or deadlocks quite easily. */ diff --git a/xlators/features/locks/src/locks.h b/xlators/features/locks/src/locks.h index 6666feb..2406dcd 100644 --- a/xlators/features/locks/src/locks.h +++ b/xlators/features/locks/src/locks.h @@ -202,10 +202,8 @@ struct __pl_inode { int fop_wind_count; pthread_cond_t check_fop_wind_count; - int32_t links; /* Number of hard links the inode has. */ uint32_t remove_running; /* Number of remove operations running. */ gf_boolean_t is_locked; /* Regular locks will be blocked. */ - gf_boolean_t removed; /* The inode has been deleted. */ }; typedef struct __pl_inode pl_inode_t; diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c index 22ef5b8..d5effef 100644 --- a/xlators/features/locks/src/posix.c +++ b/xlators/features/locks/src/posix.c @@ -2975,104 +2975,24 @@ out: return ret; } -static int32_t -pl_request_link_count(dict_t **pxdata) -{ - dict_t *xdata; - - xdata = *pxdata; - if (xdata == NULL) { - xdata = dict_new(); - if (xdata == NULL) { - return ENOMEM; - } - } else { - dict_ref(xdata); - } - - if (dict_set_uint32(xdata, GET_LINK_COUNT, 0) != 0) { - dict_unref(xdata); - return ENOMEM; - } - - *pxdata = xdata; - - return 0; -} - -static int32_t -pl_check_link_count(dict_t *xdata) -{ - int32_t count; - - /* In case we are unable to read the link count from xdata, we take a - * conservative approach and return -2, which will prevent the inode from - * being considered deleted. In fact it will cause link tracking for this - * inode to be disabled completely to avoid races. */ - - if (xdata == NULL) { - return -2; - } - - if (dict_get_int32(xdata, GET_LINK_COUNT, &count) != 0) { - return -2; - } - - return count; -} - int32_t pl_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata, struct iatt *postparent) { - pl_inode_t *pl_inode; - - if (op_ret >= 0) { - pl_inode = pl_inode_get(this, inode, NULL); - if (pl_inode == NULL) { - PL_STACK_UNWIND(lookup, xdata, frame, -1, ENOMEM, NULL, NULL, NULL, - NULL); - return 0; - } - - pthread_mutex_lock(&pl_inode->mutex); - - /* We only update the link count if we previously didn't know it. - * Doing it always can lead to races since lookup is not executed - * atomically most of the times. */ - if (pl_inode->links == -2) { - pl_inode->links = pl_check_link_count(xdata); - if (buf->ia_type == IA_IFDIR) { - /* Directories have at least 2 links. To avoid special handling - * for directories, we simply decrement the value here to make - * them equivalent to regular files. */ - pl_inode->links--; - } - } - - pthread_mutex_unlock(&pl_inode->mutex); - } - PL_STACK_UNWIND(lookup, xdata, frame, op_ret, op_errno, inode, buf, xdata, postparent); + return 0; } int32_t pl_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - int32_t error; + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); + STACK_WIND(frame, pl_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); - error = pl_request_link_count(&xdata); - if (error == 0) { - PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); - STACK_WIND(frame, pl_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, loc, xdata); - dict_unref(xdata); - } else { - STACK_UNWIND_STRICT(lookup, frame, -1, error, NULL, NULL, NULL, NULL); - } return 0; } @@ -3881,9 +3801,7 @@ unlock: __dump_posixlks(pl_inode); } - gf_proc_dump_write("links", "%d", pl_inode->links); gf_proc_dump_write("removes_pending", "%u", pl_inode->remove_running); - gf_proc_dump_write("removed", "%u", pl_inode->removed); } pthread_mutex_unlock(&pl_inode->mutex); @@ -4508,21 +4426,9 @@ pl_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - pl_inode_t *pl_inode = (pl_inode_t *)cookie; - - if (op_ret >= 0) { - pthread_mutex_lock(&pl_inode->mutex); - - /* TODO: can happen pl_inode->links == 0 ? */ - if (pl_inode->links >= 0) { - pl_inode->links++; - } - - pthread_mutex_unlock(&pl_inode->mutex); - } - PL_STACK_UNWIND_FOR_CLIENT(link, xdata, frame, op_ret, op_errno, inode, buf, preparent, postparent, xdata); + return 0; } @@ -4530,18 +4436,10 @@ int pl_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, dict_t *xdata) { - pl_inode_t *pl_inode; - - pl_inode = pl_inode_get(this, oldloc->inode, NULL); - if (pl_inode == NULL) { - STACK_UNWIND_STRICT(link, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, - NULL); - return 0; - } - PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), oldloc, newloc); - STACK_WIND_COOKIE(frame, pl_link_cbk, pl_inode, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); + STACK_WIND(frame, pl_link_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); + return 0; } -- 1.8.3.1