Blob Blame History Raw
From e3813685237dbdf8dc7cf28726fff2caf2288706 Mon Sep 17 00:00:00 2001
From: Xavi Hernandez <xhernandez@redhat.com>
Date: Mon, 19 Jul 2021 15:37:02 +0200
Subject: [PATCH 588/610] locks: Fix null gfid in lock contention notifications

This patch fixes 3 problems:

First problem:

After commit c0bd592e, the pl_inode_t object was also created in the
cbk of lookup requests. Lookup requests are a bit different than any
other request because the inode received may not be completely
initialized. In particular, inode->gfid may be null.

This caused that the gfid stored in the pl_inode_t object was null in
some cases. This gfid is used mostly for logs, but also to send lock
contention notifications. This meant that some notifications could be
sent with a null gfid, making impossible for the client xlator to
correctly identify the contending inode, so the lock was not released
immediately when eager-lock was also enabled.

Second problem:

The feature introduced by c0bd592e needed to track the number of
hardlinks of each inode to detect when it was deleted. However it
was done using the 'get-link-count' special xattr on lookup, while
posix only implements it for unlink and rename.

Also, the number of hardlinks was not incremented for mkdir, mknod,
rename, ..., so it didn't work correctly for directories.

Third problem:

When the last hardlink of an open file is deleted, all locks will be
denied with ESTALE error, but that's not correct. Access to the open
fd must succeed.

The first problem is fixed by avoiding creating pl_inode_t objects
during lookup. Second and third problems are fixed by completely
ignoring if the file has been deleted or not. Even if we grant a
lock on a non-existing file, the next operation done by the client
inside the lock will return the correct error, which should be enough.

Upstream patch:
> Upstream-patch-link: https://github.com/gluster/glusterfs/pull/2553
> Fixes: #2551
> Change-Id: Ic73e82f6b725b838c1600b6a128ea36a75f13253
> Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>

BUG: 1962972
Change-Id: Ic73e82f6b725b838c1600b6a128ea36a75f13253
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/279192
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
 tests/bugs/locks/issue-2551.t       |  58 ++++++++++++++++++
 xlators/features/locks/src/common.c |  31 +++-------
 xlators/features/locks/src/locks.h  |   2 -
 xlators/features/locks/src/posix.c  | 118 +++---------------------------------
 4 files changed, 74 insertions(+), 135 deletions(-)
 create mode 100644 tests/bugs/locks/issue-2551.t

diff --git a/tests/bugs/locks/issue-2551.t b/tests/bugs/locks/issue-2551.t
new file mode 100644
index 0000000..a32af02
--- /dev/null
+++ b/tests/bugs/locks/issue-2551.t
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+function check_time() {
+    local max="${1}"
+    local start="$(date +"%s")"
+
+    shift
+
+    if "${@}"; then
+        if [[ $(($(date +"%s") - ${start})) -lt ${max} ]]; then
+            return 0
+        fi
+    fi
+
+    return 1
+}
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 disperse 3 redundancy 1 $H0:$B0/brick{0..2}
+TEST $CLI volume set $V0 disperse.eager-lock on
+TEST $CLI volume set $V0 disperse.eager-lock-timeout 30
+TEST $CLI volume set $V0 features.locks-notify-contention on
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume set $V0 performance.open-behind off
+TEST $CLI volume set $V0 performance.quick-read off
+
+TEST $CLI volume start $V0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick2
+
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 $M0
+
+TEST mkdir $M0/dir
+TEST dd if=/dev/zero of=$M0/dir/test bs=4k count=1
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+
+TEST $CLI volume stop $V0
+TEST $CLI volume start $V0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick2
+
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 $M0
+
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 $M1
+
+TEST dd if=/dev/zero of=$M0/dir/test bs=4k count=1 conv=notrunc
+TEST check_time 5 dd if=/dev/zero of=$M1/dir/test bs=4k count=1 conv=notrunc
diff --git a/xlators/features/locks/src/common.c b/xlators/features/locks/src/common.c
index cddbfa6..5403086 100644
--- a/xlators/features/locks/src/common.c
+++ b/xlators/features/locks/src/common.c
@@ -468,9 +468,7 @@ pl_inode_get(xlator_t *this, inode_t *inode, pl_local_t *local)
         pl_inode->check_mlock_info = _gf_true;
         pl_inode->mlock_enforced = _gf_false;
 
-        /* -2 means never looked up. -1 means something went wrong and link
-         * tracking is disabled. */
-        pl_inode->links = -2;
+        pl_inode->remove_running = 0;
 
         ret = __inode_ctx_put(inode, this, (uint64_t)(long)(pl_inode));
         if (ret) {
@@ -1403,11 +1401,6 @@ pl_inode_remove_prepare(xlator_t *xl, call_frame_t *frame, loc_t *loc,
 
     pthread_mutex_lock(&pl_inode->mutex);
 
-    if (pl_inode->removed) {
-        error = ESTALE;
-        goto unlock;
-    }
-
     if (pl_inode_has_owners(xl, frame->root->client, pl_inode, &now, contend)) {
         error = -1;
         /* We skip the unlock here because the caller must create a stub when
@@ -1420,7 +1413,6 @@ pl_inode_remove_prepare(xlator_t *xl, call_frame_t *frame, loc_t *loc,
     pl_inode->is_locked = _gf_true;
     pl_inode->remove_running++;
 
-unlock:
     pthread_mutex_unlock(&pl_inode->mutex);
 
 done:
@@ -1490,20 +1482,18 @@ pl_inode_remove_cbk(xlator_t *xl, pl_inode_t *pl_inode, int32_t error)
 
     pthread_mutex_lock(&pl_inode->mutex);
 
-    if (error == 0) {
-        if (pl_inode->links >= 0) {
-            pl_inode->links--;
-        }
-        if (pl_inode->links == 0) {
-            pl_inode->removed = _gf_true;
-        }
-    }
-
     pl_inode->remove_running--;
 
     if ((pl_inode->remove_running == 0) && list_empty(&pl_inode->waiting)) {
         pl_inode->is_locked = _gf_false;
 
+        /* At this point it's possible that the inode has been deleted, but
+         * there could be open fd's still referencing it, so we can't prevent
+         * pending locks from being granted. If the file has really been
+         * deleted, whatever the client does once the lock is granted will
+         * fail with the appropriate error, so we don't need to worry about
+         * it here. */
+
         list_for_each_entry(dom, &pl_inode->dom_list, inode_list)
         {
             __grant_blocked_inode_locks(xl, pl_inode, &granted, dom, &now,
@@ -1555,11 +1545,6 @@ pl_inode_remove_inodelk(pl_inode_t *pl_inode, pl_inode_lock_t *lock)
     pl_dom_list_t *dom;
     pl_inode_lock_t *ilock;
 
-    /* If the inode has been deleted, we won't allow any lock. */
-    if (pl_inode->removed) {
-        return -ESTALE;
-    }
-
     /* We only synchronize with locks made for regular operations coming from
      * the user. Locks done for internal purposes are hard to control and could
      * lead to long delays or deadlocks quite easily. */
diff --git a/xlators/features/locks/src/locks.h b/xlators/features/locks/src/locks.h
index 6666feb..2406dcd 100644
--- a/xlators/features/locks/src/locks.h
+++ b/xlators/features/locks/src/locks.h
@@ -202,10 +202,8 @@ struct __pl_inode {
     int fop_wind_count;
     pthread_cond_t check_fop_wind_count;
 
-    int32_t links;           /* Number of hard links the inode has. */
     uint32_t remove_running; /* Number of remove operations running. */
     gf_boolean_t is_locked;  /* Regular locks will be blocked. */
-    gf_boolean_t removed;    /* The inode has been deleted. */
 };
 typedef struct __pl_inode pl_inode_t;
 
diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c
index 22ef5b8..d5effef 100644
--- a/xlators/features/locks/src/posix.c
+++ b/xlators/features/locks/src/posix.c
@@ -2975,104 +2975,24 @@ out:
     return ret;
 }
 
-static int32_t
-pl_request_link_count(dict_t **pxdata)
-{
-    dict_t *xdata;
-
-    xdata = *pxdata;
-    if (xdata == NULL) {
-        xdata = dict_new();
-        if (xdata == NULL) {
-            return ENOMEM;
-        }
-    } else {
-        dict_ref(xdata);
-    }
-
-    if (dict_set_uint32(xdata, GET_LINK_COUNT, 0) != 0) {
-        dict_unref(xdata);
-        return ENOMEM;
-    }
-
-    *pxdata = xdata;
-
-    return 0;
-}
-
-static int32_t
-pl_check_link_count(dict_t *xdata)
-{
-    int32_t count;
-
-    /* In case we are unable to read the link count from xdata, we take a
-     * conservative approach and return -2, which will prevent the inode from
-     * being considered deleted. In fact it will cause link tracking for this
-     * inode to be disabled completely to avoid races. */
-
-    if (xdata == NULL) {
-        return -2;
-    }
-
-    if (dict_get_int32(xdata, GET_LINK_COUNT, &count) != 0) {
-        return -2;
-    }
-
-    return count;
-}
-
 int32_t
 pl_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
               int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata,
               struct iatt *postparent)
 {
-    pl_inode_t *pl_inode;
-
-    if (op_ret >= 0) {
-        pl_inode = pl_inode_get(this, inode, NULL);
-        if (pl_inode == NULL) {
-            PL_STACK_UNWIND(lookup, xdata, frame, -1, ENOMEM, NULL, NULL, NULL,
-                            NULL);
-            return 0;
-        }
-
-        pthread_mutex_lock(&pl_inode->mutex);
-
-        /* We only update the link count if we previously didn't know it.
-         * Doing it always can lead to races since lookup is not executed
-         * atomically most of the times. */
-        if (pl_inode->links == -2) {
-            pl_inode->links = pl_check_link_count(xdata);
-            if (buf->ia_type == IA_IFDIR) {
-                /* Directories have at least 2 links. To avoid special handling
-                 * for directories, we simply decrement the value here to make
-                 * them equivalent to regular files. */
-                pl_inode->links--;
-            }
-        }
-
-        pthread_mutex_unlock(&pl_inode->mutex);
-    }
-
     PL_STACK_UNWIND(lookup, xdata, frame, op_ret, op_errno, inode, buf, xdata,
                     postparent);
+
     return 0;
 }
 
 int32_t
 pl_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
 {
-    int32_t error;
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL);
+    STACK_WIND(frame, pl_lookup_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lookup, loc, xdata);
 
-    error = pl_request_link_count(&xdata);
-    if (error == 0) {
-        PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL);
-        STACK_WIND(frame, pl_lookup_cbk, FIRST_CHILD(this),
-                   FIRST_CHILD(this)->fops->lookup, loc, xdata);
-        dict_unref(xdata);
-    } else {
-        STACK_UNWIND_STRICT(lookup, frame, -1, error, NULL, NULL, NULL, NULL);
-    }
     return 0;
 }
 
@@ -3881,9 +3801,7 @@ unlock:
             __dump_posixlks(pl_inode);
         }
 
-        gf_proc_dump_write("links", "%d", pl_inode->links);
         gf_proc_dump_write("removes_pending", "%u", pl_inode->remove_running);
-        gf_proc_dump_write("removed", "%u", pl_inode->removed);
     }
     pthread_mutex_unlock(&pl_inode->mutex);
 
@@ -4508,21 +4426,9 @@ pl_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
             int32_t op_errno, inode_t *inode, struct iatt *buf,
             struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
 {
-    pl_inode_t *pl_inode = (pl_inode_t *)cookie;
-
-    if (op_ret >= 0) {
-        pthread_mutex_lock(&pl_inode->mutex);
-
-        /* TODO: can happen pl_inode->links == 0 ? */
-        if (pl_inode->links >= 0) {
-            pl_inode->links++;
-        }
-
-        pthread_mutex_unlock(&pl_inode->mutex);
-    }
-
     PL_STACK_UNWIND_FOR_CLIENT(link, xdata, frame, op_ret, op_errno, inode, buf,
                                preparent, postparent, xdata);
+
     return 0;
 }
 
@@ -4530,18 +4436,10 @@ int
 pl_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
         dict_t *xdata)
 {
-    pl_inode_t *pl_inode;
-
-    pl_inode = pl_inode_get(this, oldloc->inode, NULL);
-    if (pl_inode == NULL) {
-        STACK_UNWIND_STRICT(link, frame, -1, ENOMEM, NULL, NULL, NULL, NULL,
-                            NULL);
-        return 0;
-    }
-
     PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), oldloc, newloc);
-    STACK_WIND_COOKIE(frame, pl_link_cbk, pl_inode, FIRST_CHILD(this),
-                      FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+    STACK_WIND(frame, pl_link_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+
     return 0;
 }
 
-- 
1.8.3.1