From ba57b043db1e19196cf860baeeeb1acfc9985cd2 Mon Sep 17 00:00:00 2001
From: Xavi Hernandez <xhernandez@users.noreply.github.com>
Date: Wed, 24 Feb 2021 15:04:23 +0100
Subject: [PATCH 557/584] cluster/dht: Fix stack overflow in readdir(p)

When parallel-readdir is enabled, readdir(p) requests sent by DHT can be
immediately processed and answered in the same thread, before the call to
STACK_WIND_COOKIE() completes.

This means that the readdir(p) cbk is processed synchronously. In some
cases it may decide to send another readdir(p) request, which causes a
recursive call.

When some special conditions are met and the directories are big, the
number of nested calls can grow so high that the process crashes because
of a stack overflow.

This patch fixes this by not allowing nested readdir(p) calls. When a
nested call is detected, it is queued instead of being sent. The queued
request is then processed by the top-level stack function once the
current call finishes. A sketch of the mechanism follows.
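
For reference, a minimal standalone model of the queuing idea
(illustration only: single-threaded, so plain integers stand in for the
uatomic_*() calls used by the patch, and wind()/cbk() stand in for
STACK_WIND_COOKIE() and the readdir(p) cbk):

    #include <stdio.h>

    static int queue;       /* 0 = idle, one count per pending request */
    static int next_offset; /* argument stored for the queued request  */
    static int depth, max_depth;

    /* The cbk runs synchronously from wind(), like a readdirp answered
     * from the parallel-readdir cache before STACK_WIND_COOKIE()
     * returns. Instead of winding again (recursing), it only queues
     * the next request. */
    static void cbk(int offset)
    {
        if (offset < 1000) { /* "entries not useful, buffer not full" */
            next_offset = offset + 1;
            queue++;         /* queued; the top level will send it */
        }
    }

    static void wind(int offset)
    {
        if (++depth > max_depth)
            max_depth = depth;
        cbk(offset); /* synchronous completion */
        depth--;
    }

    int main(void)
    {
        next_offset = 0;
        if (++queue == 1) {        /* first request: own the wind loop */
            do {
                wind(next_offset);
            } while (--queue > 0); /* cbk queued more? send it now */
        }
        /* Without the counter, wind() would recurse 1000 frames deep;
         * with it, the maximum recursion depth stays at 1. */
        printf("max depth: %d\n", max_depth);
        return 0;
    }
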
Backport of 3 patches:
> Upstream-patch: https://github.com/gluster/glusterfs/pull/2170
> Fixes: #2169
> Change-Id: Id763a8a51fb3c3314588ec7c162f649babf33099
> Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>

> Upstream-patch: https://github.com/gluster/glusterfs/pull/2202
> Updates: #2169
> Change-Id: I97e73c0aae74fc5d80c975f56f2f7a64e3e1ae95
> Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>

> Upstream-patch: https://github.com/gluster/glusterfs/pull/2242
> Fixes: #2239
> Change-Id: I6b2e48e87c85de27fad67a12d97abd91fa27c0c1
> Signed-off-by: Pranith Kumar K <pranith.karampuri@phonepe.com>

BUG: 1798897
Change-Id: Id763a8a51fb3c3314588ec7c162f649babf33099
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244549
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
 tests/bugs/distribute/issue-2169.t   |  33 +++++++++
 xlators/cluster/dht/src/dht-common.c | 134 ++++++++++++++++++++++++++++++++---
 xlators/cluster/dht/src/dht-common.h |   5 ++
 3 files changed, 162 insertions(+), 10 deletions(-)
 create mode 100755 tests/bugs/distribute/issue-2169.t

diff --git a/tests/bugs/distribute/issue-2169.t b/tests/bugs/distribute/issue-2169.t
new file mode 100755
index 0000000..91fa72a
--- /dev/null
+++ b/tests/bugs/distribute/issue-2169.t
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+cleanup
+
+TEST glusterd
+TEST ${CLI} volume create ${V0} ${H0}:/$B0/${V0}_0
+TEST ${CLI} volume set ${V0} readdir-ahead on
+TEST ${CLI} volume set ${V0} parallel-readdir on
+TEST ${CLI} volume start ${V0}
+
+TEST ${GFS} --volfile-server ${H0} --volfile-id ${V0} ${M0}
+
+TEST mkdir -p ${M0}/d/d.{000..999}
+
+EXPECT_WITHIN ${UMOUNT_TIMEOUT} "Y" force_umount ${M0}
+
+TEST ${CLI} volume add-brick ${V0} ${H0}:${B0}/${V0}_{1..7}
+
+TEST ${GFS} --volfile-server ${H0} --volfile-id ${V0} ${M0}
+
+ls -l ${M0}/d/ | wc -l
+
+EXPECT_WITHIN ${UMOUNT_TIMEOUT} "Y" force_umount ${M0}
+TEST ${GFS} --volfile-server ${H0} --volfile-id ${V0} ${M0}
+
+ls -l ${M0}/d/ | wc -l
+
+TEST ls ${M0}/d
+
+cleanup
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index 0773092..ce0fbbf 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -24,8 +24,15 @@
 #include <libgen.h>
 #include <signal.h>
 
+#include <urcu/uatomic.h>
+
 int run_defrag = 0;
 
+static int
+dht_rmdir_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int op_ret, int op_errno, gf_dirent_t *entries,
+                       dict_t *xdata);
+
 int
 dht_link2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
 
@@ -6681,6 +6688,94 @@ out:
     return;
 }
 
+/* Execute a READDIR request if no other request is in progress. Otherwise
+ * queue it to be executed when the current one finishes.
+ *
+ * When parallel-readdir is enabled and directory contents are cached, the
+ * callback of a readdirp will be called before returning from STACK_WIND.
+ * If the returned contents are not useful for DHT, and the buffer is not
+ * yet full, a nested readdirp request will be sent. This means that there
+ * will be many recursive calls. In the worst case there might be a stack
+ * overflow.
+ *
+ * To avoid this, we only wind a request if no other request is being wound.
+ * If there's another request, we simply store the values for the next call.
+ * When the thread processing the current wind completes it, it will take
+ * the new arguments and send the request from the top level stack. */
+static void
+dht_queue_readdir(call_frame_t *frame, xlator_t *xl, off_t offset,
+                  fop_readdir_cbk_t cbk)
+{
+    dht_local_t *local;
+    int32_t queue;
+    xlator_t *this = NULL;
+
+    local = frame->local;
+    this = frame->this;
+
+    local->queue_xl = xl;
+    local->queue_offset = offset;
+
+    if (uatomic_add_return(&local->queue, 1) == 1) {
+        /* If we are here it means that we are the first one to send a
+         * readdir request. Any attempt to send more readdir requests will
+         * find local->queue > 1, so it won't do anything. The needed data
+         * to send the request has been stored into local->queue_*.
+         *
+         * Note: this works because we will only have 1 additional request
+         *       at most (the one sent by the cbk function) while we are
+         *       processing another readdir. */
+        do {
+            STACK_WIND_COOKIE(frame, cbk, local->queue_xl, local->queue_xl,
+                              local->queue_xl->fops->readdir, local->fd,
+                              local->size, local->queue_offset, local->xattr);
+
+            /* If a new readdirp request has been added before returning
+             * from winding, we process it. */
+        } while ((queue = uatomic_sub_return(&local->queue, 1)) > 0);
+
+        if (queue < 0) {
+            /* A negative value means that an unwind has been called before
+             * returning from the previous wind. This means that 'local' is
+             * not needed anymore and must be destroyed. */
+            dht_local_wipe(this, local);
+        }
+    }
+}
+
+/* Execute a READDIRP request if no other request is in progress. Otherwise
+ * queue it to be executed when the current one finishes. */
+static void
+dht_queue_readdirp(call_frame_t *frame, xlator_t *xl, off_t offset,
+                   fop_readdirp_cbk_t cbk)
+{
+    dht_local_t *local;
+    int32_t queue;
+    xlator_t *this = NULL;
+
+    local = frame->local;
+    this = frame->this;
+
+    local->queue_xl = xl;
+    local->queue_offset = offset;
+
+    /* Check dht_queue_readdir() comments for an explanation of this. */
+    if (uatomic_add_return(&local->queue, 1) == 1) {
+        do {
+            STACK_WIND_COOKIE(frame, cbk, local->queue_xl, local->queue_xl,
+                              local->queue_xl->fops->readdirp, local->fd,
+                              local->size, local->queue_offset, local->xattr);
+        } while ((queue = uatomic_sub_return(&local->queue, 1)) > 0);
+
+        if (queue < 0) {
+            /* A negative value means that an unwind has been called before
+             * returning from the previous wind. This means that 'local' is
+             * not needed anymore and must be destroyed. */
+            dht_local_wipe(this, local);
+        }
+    }
+}
+
 /* Posix returns op_errno = ENOENT to indicate that there are no more
  * entries
  */
@@ -6950,9 +7045,8 @@ done:
             }
         }
 
-        STACK_WIND_COOKIE(frame, dht_readdirp_cbk, next_subvol, next_subvol,
-                          next_subvol->fops->readdirp, local->fd, local->size,
-                          next_offset, local->xattr);
+        dht_queue_readdirp(frame, next_subvol, next_offset, dht_readdirp_cbk);
+
         return 0;
     }
 
@@ -6970,6 +7064,17 @@ unwind:
     if (prev != dht_last_up_subvol(this))
         op_errno = 0;
 
+    /* If we are inside a recursive call (or the cbk completed before the
+     * wind returned), local->queue will be 1. In that case we cannot
+     * destroy 'local' because the caller of STACK_WIND still needs it.
+     * Instead, we decrease the value to let the caller know that the
+     * operation has terminated and that it must destroy 'local'. If
+     * local->queue is 0, we can destroy it here because there are no
+     * other users. */
+    if (uatomic_sub_return(&local->queue, 1) >= 0) {
+        frame->local = NULL;
+    }
+
     DHT_STACK_UNWIND(readdirp, frame, op_ret, op_errno, &entries, NULL);
 
     gf_dirent_free(&entries);
@@ -7071,9 +7176,8 @@ done:
             goto unwind;
         }
 
-        STACK_WIND_COOKIE(frame, dht_readdir_cbk, next_subvol, next_subvol,
-                          next_subvol->fops->readdir, local->fd, local->size,
-                          next_offset, NULL);
+        dht_queue_readdir(frame, next_subvol, next_offset, dht_readdir_cbk);
+
         return 0;
     }
 
@@ -7089,6 +7193,17 @@ unwind:
     if (prev != dht_last_up_subvol(this))
         op_errno = 0;
 
+    /* If we are inside a recursive call (or the cbk completed before the
+     * wind returned), local->queue will be 1. In that case we cannot
+     * destroy 'local' because the caller of STACK_WIND still needs it.
+     * Instead, we decrease the value to let the caller know that the
+     * operation has terminated and that it must destroy 'local'. If
+     * local->queue is 0, we can destroy it here because there are no
+     * other users. */
+    if (uatomic_sub_return(&local->queue, 1) >= 0) {
+        frame->local = NULL;
+    }
+
     if (!skip_hashed_check) {
         DHT_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, NULL);
         gf_dirent_free(&entries);
@@ -7096,6 +7211,7 @@ unwind:
     } else {
         DHT_STACK_UNWIND(readdir, frame, op_ret, op_errno, orig_entries, NULL);
     }
+
     return 0;
 }
 
@@ -7172,11 +7288,9 @@ dht_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
             }
         }
 
-        STACK_WIND_COOKIE(frame, dht_readdirp_cbk, xvol, xvol,
-                          xvol->fops->readdirp, fd, size, yoff, local->xattr);
+        dht_queue_readdirp(frame, xvol, yoff, dht_readdirp_cbk);
    } else {
-        STACK_WIND_COOKIE(frame, dht_readdir_cbk, xvol, xvol,
-                          xvol->fops->readdir, fd, size, yoff, local->xattr);
+        dht_queue_readdir(frame, xvol, yoff, dht_readdir_cbk);
     }
 
     return 0;
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index 92f1b89..132b3b3 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -369,6 +369,11 @@ struct dht_local {
 
     dht_dir_transaction_t lock[2], *current;
 
+    /* for nested readdirs */
+    xlator_t *queue_xl;
+    off_t queue_offset;
+    int32_t queue;
+
     /* inodelks during filerename for backward compatibility */
     dht_lock_t **rename_inodelk_backward_compatible;
     int rename_inodelk_bc_count;
-- 
1.8.3.1