c460ee
From e3813685237dbdf8dc7cf28726fff2caf2288706 Mon Sep 17 00:00:00 2001
c460ee
From: Xavi Hernandez <xhernandez@redhat.com>
c460ee
Date: Mon, 19 Jul 2021 15:37:02 +0200
c460ee
Subject: [PATCH 588/610] locks: Fix null gfid in lock contention notifications
c460ee
c460ee
This patch fixes 3 problems:
c460ee
c460ee
First problem:
c460ee
c460ee
After commit c0bd592e, the pl_inode_t object was also created in the
c460ee
cbk of lookup requests. Lookup requests are a bit different than any
c460ee
other request because the inode received may not be completely
c460ee
initialized. In particular, inode->gfid may be null.
c460ee
c460ee
This caused that the gfid stored in the pl_inode_t object was null in
c460ee
some cases. This gfid is used mostly for logs, but also to send lock
c460ee
contention notifications. This meant that some notifications could be
c460ee
sent with a null gfid, making impossible for the client xlator to
c460ee
correctly identify the contending inode, so the lock was not released
c460ee
immediately when eager-lock was also enabled.
c460ee
c460ee
Second problem:
c460ee
c460ee
The feature introduced by c0bd592e needed to track the number of
c460ee
hardlinks of each inode to detect when it was deleted. However it
c460ee
was done using the 'get-link-count' special xattr on lookup, while
c460ee
posix only implements it for unlink and rename.
c460ee
c460ee
Also, the number of hardlinks was not incremented for mkdir, mknod,
c460ee
rename, ..., so it didn't work correctly for directories.
c460ee
c460ee
Third problem:
c460ee
c460ee
When the last hardlink of an open file is deleted, all locks will be
c460ee
denied with ESTALE error, but that's not correct. Access to the open
c460ee
fd must succeed.
c460ee
c460ee
The first problem is fixed by avoiding creating pl_inode_t objects
c460ee
during lookup. Second and third problems are fixed by completely
c460ee
ignoring if the file has been deleted or not. Even if we grant a
c460ee
lock on a non-existing file, the next operation done by the client
c460ee
inside the lock will return the correct error, which should be enough.
c460ee
c460ee
Upstream patch:
c460ee
> Upstream-patch-link: https://github.com/gluster/glusterfs/pull/2553
c460ee
> Fixes: #2551
c460ee
> Change-Id: Ic73e82f6b725b838c1600b6a128ea36a75f13253
c460ee
> Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
c460ee
c460ee
BUG: 1962972
c460ee
Change-Id: Ic73e82f6b725b838c1600b6a128ea36a75f13253
c460ee
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
c460ee
Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/279192
c460ee
Tested-by: RHGS Build Bot <nigelb@redhat.com>
c460ee
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
c460ee
---
c460ee
 tests/bugs/locks/issue-2551.t       |  58 ++++++++++++++++++
c460ee
 xlators/features/locks/src/common.c |  31 +++-------
c460ee
 xlators/features/locks/src/locks.h  |   2 -
c460ee
 xlators/features/locks/src/posix.c  | 118 +++---------------------------------
c460ee
 4 files changed, 74 insertions(+), 135 deletions(-)
c460ee
 create mode 100644 tests/bugs/locks/issue-2551.t
c460ee
c460ee
diff --git a/tests/bugs/locks/issue-2551.t b/tests/bugs/locks/issue-2551.t
c460ee
new file mode 100644
c460ee
index 0000000..a32af02
c460ee
--- /dev/null
c460ee
+++ b/tests/bugs/locks/issue-2551.t
c460ee
@@ -0,0 +1,58 @@
c460ee
+#!/bin/bash
c460ee
+
c460ee
+. $(dirname $0)/../../include.rc
c460ee
+. $(dirname $0)/../../volume.rc
c460ee
+
c460ee
+function check_time() {
c460ee
+    local max="${1}"
c460ee
+    local start="$(date +"%s")"
c460ee
+
c460ee
+    shift
c460ee
+
c460ee
+    if "${@}"; then
c460ee
+        if [[ $(($(date +"%s") - ${start})) -lt ${max} ]]; then
c460ee
+            return 0
c460ee
+        fi
c460ee
+    fi
c460ee
+
c460ee
+    return 1
c460ee
+}
c460ee
+
c460ee
+cleanup
c460ee
+
c460ee
+TEST glusterd
c460ee
+TEST pidof glusterd
c460ee
+TEST $CLI volume create $V0 disperse 3 redundancy 1 $H0:$B0/brick{0..2}
c460ee
+TEST $CLI volume set $V0 disperse.eager-lock on
c460ee
+TEST $CLI volume set $V0 disperse.eager-lock-timeout 30
c460ee
+TEST $CLI volume set $V0 features.locks-notify-contention on
c460ee
+TEST $CLI volume set $V0 performance.write-behind off
c460ee
+TEST $CLI volume set $V0 performance.open-behind off
c460ee
+TEST $CLI volume set $V0 performance.quick-read off
c460ee
+
c460ee
+TEST $CLI volume start $V0
c460ee
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick0
c460ee
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick1
c460ee
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick2
c460ee
+
c460ee
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
c460ee
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 $M0
c460ee
+
c460ee
+TEST mkdir $M0/dir
c460ee
+TEST dd if=/dev/zero of=$M0/dir/test bs=4k count=1
c460ee
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
c460ee
+
c460ee
+TEST $CLI volume stop $V0
c460ee
+TEST $CLI volume start $V0
c460ee
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick0
c460ee
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick1
c460ee
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/brick2
c460ee
+
c460ee
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
c460ee
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 $M0
c460ee
+
c460ee
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M1
c460ee
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 $M1
c460ee
+
c460ee
+TEST dd if=/dev/zero of=$M0/dir/test bs=4k count=1 conv=notrunc
c460ee
+TEST check_time 5 dd if=/dev/zero of=$M1/dir/test bs=4k count=1 conv=notrunc
c460ee
diff --git a/xlators/features/locks/src/common.c b/xlators/features/locks/src/common.c
c460ee
index cddbfa6..5403086 100644
c460ee
--- a/xlators/features/locks/src/common.c
c460ee
+++ b/xlators/features/locks/src/common.c
c460ee
@@ -468,9 +468,7 @@ pl_inode_get(xlator_t *this, inode_t *inode, pl_local_t *local)
c460ee
         pl_inode->check_mlock_info = _gf_true;
c460ee
         pl_inode->mlock_enforced = _gf_false;
c460ee
 
c460ee
-        /* -2 means never looked up. -1 means something went wrong and link
c460ee
-         * tracking is disabled. */
c460ee
-        pl_inode->links = -2;
c460ee
+        pl_inode->remove_running = 0;
c460ee
 
c460ee
         ret = __inode_ctx_put(inode, this, (uint64_t)(long)(pl_inode));
c460ee
         if (ret) {
c460ee
@@ -1403,11 +1401,6 @@ pl_inode_remove_prepare(xlator_t *xl, call_frame_t *frame, loc_t *loc,
c460ee
 
c460ee
     pthread_mutex_lock(&pl_inode->mutex);
c460ee
 
c460ee
-    if (pl_inode->removed) {
c460ee
-        error = ESTALE;
c460ee
-        goto unlock;
c460ee
-    }
c460ee
-
c460ee
     if (pl_inode_has_owners(xl, frame->root->client, pl_inode, &now, contend)) {
c460ee
         error = -1;
c460ee
         /* We skip the unlock here because the caller must create a stub when
c460ee
@@ -1420,7 +1413,6 @@ pl_inode_remove_prepare(xlator_t *xl, call_frame_t *frame, loc_t *loc,
c460ee
     pl_inode->is_locked = _gf_true;
c460ee
     pl_inode->remove_running++;
c460ee
 
c460ee
-unlock:
c460ee
     pthread_mutex_unlock(&pl_inode->mutex);
c460ee
 
c460ee
 done:
c460ee
@@ -1490,20 +1482,18 @@ pl_inode_remove_cbk(xlator_t *xl, pl_inode_t *pl_inode, int32_t error)
c460ee
 
c460ee
     pthread_mutex_lock(&pl_inode->mutex);
c460ee
 
c460ee
-    if (error == 0) {
c460ee
-        if (pl_inode->links >= 0) {
c460ee
-            pl_inode->links--;
c460ee
-        }
c460ee
-        if (pl_inode->links == 0) {
c460ee
-            pl_inode->removed = _gf_true;
c460ee
-        }
c460ee
-    }
c460ee
-
c460ee
     pl_inode->remove_running--;
c460ee
 
c460ee
     if ((pl_inode->remove_running == 0) && list_empty(&pl_inode->waiting)) {
c460ee
         pl_inode->is_locked = _gf_false;
c460ee
 
c460ee
+        /* At this point it's possible that the inode has been deleted, but
c460ee
+         * there could be open fd's still referencing it, so we can't prevent
c460ee
+         * pending locks from being granted. If the file has really been
c460ee
+         * deleted, whatever the client does once the lock is granted will
c460ee
+         * fail with the appropriate error, so we don't need to worry about
c460ee
+         * it here. */
c460ee
+
c460ee
         list_for_each_entry(dom, &pl_inode->dom_list, inode_list)
c460ee
         {
c460ee
             __grant_blocked_inode_locks(xl, pl_inode, &granted, dom, &now,
c460ee
@@ -1555,11 +1545,6 @@ pl_inode_remove_inodelk(pl_inode_t *pl_inode, pl_inode_lock_t *lock)
c460ee
     pl_dom_list_t *dom;
c460ee
     pl_inode_lock_t *ilock;
c460ee
 
c460ee
-    /* If the inode has been deleted, we won't allow any lock. */
c460ee
-    if (pl_inode->removed) {
c460ee
-        return -ESTALE;
c460ee
-    }
c460ee
-
c460ee
     /* We only synchronize with locks made for regular operations coming from
c460ee
      * the user. Locks done for internal purposes are hard to control and could
c460ee
      * lead to long delays or deadlocks quite easily. */
c460ee
diff --git a/xlators/features/locks/src/locks.h b/xlators/features/locks/src/locks.h
c460ee
index 6666feb..2406dcd 100644
c460ee
--- a/xlators/features/locks/src/locks.h
c460ee
+++ b/xlators/features/locks/src/locks.h
c460ee
@@ -202,10 +202,8 @@ struct __pl_inode {
c460ee
     int fop_wind_count;
c460ee
     pthread_cond_t check_fop_wind_count;
c460ee
 
c460ee
-    int32_t links;           /* Number of hard links the inode has. */
c460ee
     uint32_t remove_running; /* Number of remove operations running. */
c460ee
     gf_boolean_t is_locked;  /* Regular locks will be blocked. */
c460ee
-    gf_boolean_t removed;    /* The inode has been deleted. */
c460ee
 };
c460ee
 typedef struct __pl_inode pl_inode_t;
c460ee
 
c460ee
diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c
c460ee
index 22ef5b8..d5effef 100644
c460ee
--- a/xlators/features/locks/src/posix.c
c460ee
+++ b/xlators/features/locks/src/posix.c
c460ee
@@ -2975,104 +2975,24 @@ out:
c460ee
     return ret;
c460ee
 }
c460ee
 
c460ee
-static int32_t
c460ee
-pl_request_link_count(dict_t **pxdata)
c460ee
-{
c460ee
-    dict_t *xdata;
c460ee
-
c460ee
-    xdata = *pxdata;
c460ee
-    if (xdata == NULL) {
c460ee
-        xdata = dict_new();
c460ee
-        if (xdata == NULL) {
c460ee
-            return ENOMEM;
c460ee
-        }
c460ee
-    } else {
c460ee
-        dict_ref(xdata);
c460ee
-    }
c460ee
-
c460ee
-    if (dict_set_uint32(xdata, GET_LINK_COUNT, 0) != 0) {
c460ee
-        dict_unref(xdata);
c460ee
-        return ENOMEM;
c460ee
-    }
c460ee
-
c460ee
-    *pxdata = xdata;
c460ee
-
c460ee
-    return 0;
c460ee
-}
c460ee
-
c460ee
-static int32_t
c460ee
-pl_check_link_count(dict_t *xdata)
c460ee
-{
c460ee
-    int32_t count;
c460ee
-
c460ee
-    /* In case we are unable to read the link count from xdata, we take a
c460ee
-     * conservative approach and return -2, which will prevent the inode from
c460ee
-     * being considered deleted. In fact it will cause link tracking for this
c460ee
-     * inode to be disabled completely to avoid races. */
c460ee
-
c460ee
-    if (xdata == NULL) {
c460ee
-        return -2;
c460ee
-    }
c460ee
-
c460ee
-    if (dict_get_int32(xdata, GET_LINK_COUNT, &count) != 0) {
c460ee
-        return -2;
c460ee
-    }
c460ee
-
c460ee
-    return count;
c460ee
-}
c460ee
-
c460ee
 int32_t
c460ee
 pl_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
c460ee
               int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata,
c460ee
               struct iatt *postparent)
c460ee
 {
c460ee
-    pl_inode_t *pl_inode;
c460ee
-
c460ee
-    if (op_ret >= 0) {
c460ee
-        pl_inode = pl_inode_get(this, inode, NULL);
c460ee
-        if (pl_inode == NULL) {
c460ee
-            PL_STACK_UNWIND(lookup, xdata, frame, -1, ENOMEM, NULL, NULL, NULL,
c460ee
-                            NULL);
c460ee
-            return 0;
c460ee
-        }
c460ee
-
c460ee
-        pthread_mutex_lock(&pl_inode->mutex);
c460ee
-
c460ee
-        /* We only update the link count if we previously didn't know it.
c460ee
-         * Doing it always can lead to races since lookup is not executed
c460ee
-         * atomically most of the times. */
c460ee
-        if (pl_inode->links == -2) {
c460ee
-            pl_inode->links = pl_check_link_count(xdata);
c460ee
-            if (buf->ia_type == IA_IFDIR) {
c460ee
-                /* Directories have at least 2 links. To avoid special handling
c460ee
-                 * for directories, we simply decrement the value here to make
c460ee
-                 * them equivalent to regular files. */
c460ee
-                pl_inode->links--;
c460ee
-            }
c460ee
-        }
c460ee
-
c460ee
-        pthread_mutex_unlock(&pl_inode->mutex);
c460ee
-    }
c460ee
-
c460ee
     PL_STACK_UNWIND(lookup, xdata, frame, op_ret, op_errno, inode, buf, xdata,
c460ee
                     postparent);
c460ee
+
c460ee
     return 0;
c460ee
 }
c460ee
 
c460ee
 int32_t
c460ee
 pl_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
c460ee
 {
c460ee
-    int32_t error;
c460ee
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL);
c460ee
+    STACK_WIND(frame, pl_lookup_cbk, FIRST_CHILD(this),
c460ee
+               FIRST_CHILD(this)->fops->lookup, loc, xdata);
c460ee
 
c460ee
-    error = pl_request_link_count(&xdata);
c460ee
-    if (error == 0) {
c460ee
-        PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL);
c460ee
-        STACK_WIND(frame, pl_lookup_cbk, FIRST_CHILD(this),
c460ee
-                   FIRST_CHILD(this)->fops->lookup, loc, xdata);
c460ee
-        dict_unref(xdata);
c460ee
-    } else {
c460ee
-        STACK_UNWIND_STRICT(lookup, frame, -1, error, NULL, NULL, NULL, NULL);
c460ee
-    }
c460ee
     return 0;
c460ee
 }
c460ee
 
c460ee
@@ -3881,9 +3801,7 @@ unlock:
c460ee
             __dump_posixlks(pl_inode);
c460ee
         }
c460ee
 
c460ee
-        gf_proc_dump_write("links", "%d", pl_inode->links);
c460ee
         gf_proc_dump_write("removes_pending", "%u", pl_inode->remove_running);
c460ee
-        gf_proc_dump_write("removed", "%u", pl_inode->removed);
c460ee
     }
c460ee
     pthread_mutex_unlock(&pl_inode->mutex);
c460ee
 
c460ee
@@ -4508,21 +4426,9 @@ pl_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
c460ee
             int32_t op_errno, inode_t *inode, struct iatt *buf,
c460ee
             struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
c460ee
 {
c460ee
-    pl_inode_t *pl_inode = (pl_inode_t *)cookie;
c460ee
-
c460ee
-    if (op_ret >= 0) {
c460ee
-        pthread_mutex_lock(&pl_inode->mutex);
c460ee
-
c460ee
-        /* TODO: can happen pl_inode->links == 0 ? */
c460ee
-        if (pl_inode->links >= 0) {
c460ee
-            pl_inode->links++;
c460ee
-        }
c460ee
-
c460ee
-        pthread_mutex_unlock(&pl_inode->mutex);
c460ee
-    }
c460ee
-
c460ee
     PL_STACK_UNWIND_FOR_CLIENT(link, xdata, frame, op_ret, op_errno, inode, buf,
c460ee
                                preparent, postparent, xdata);
c460ee
+
c460ee
     return 0;
c460ee
 }
c460ee
 
c460ee
@@ -4530,18 +4436,10 @@ int
c460ee
 pl_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
c460ee
         dict_t *xdata)
c460ee
 {
c460ee
-    pl_inode_t *pl_inode;
c460ee
-
c460ee
-    pl_inode = pl_inode_get(this, oldloc->inode, NULL);
c460ee
-    if (pl_inode == NULL) {
c460ee
-        STACK_UNWIND_STRICT(link, frame, -1, ENOMEM, NULL, NULL, NULL, NULL,
c460ee
-                            NULL);
c460ee
-        return 0;
c460ee
-    }
c460ee
-
c460ee
     PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), oldloc, newloc);
c460ee
-    STACK_WIND_COOKIE(frame, pl_link_cbk, pl_inode, FIRST_CHILD(this),
c460ee
-                      FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
c460ee
+    STACK_WIND(frame, pl_link_cbk, FIRST_CHILD(this),
c460ee
+               FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
c460ee
+
c460ee
     return 0;
c460ee
 }
c460ee
 
c460ee
-- 
c460ee
1.8.3.1
c460ee