From ba57b043db1e19196cf860baeeeb1acfc9985cd2 Mon Sep 17 00:00:00 2001
From: Xavi Hernandez <xhernandez@users.noreply.github.com>
Date: Wed, 24 Feb 2021 15:04:23 +0100
Subject: [PATCH 557/584] cluster/dht: Fix stack overflow in readdir(p)

When parallel-readdir is enabled, readdir(p) requests sent by DHT can be
immediately processed and answered in the same thread, before the call to
STACK_WIND_COOKIE() completes.

This means that the readdir(p) cbk is processed synchronously. In some
cases it may decide to send another readdir(p) request, which causes a
recursive call.

Under some special conditions, and when directories are big, the number
of nested calls can grow so large that the process crashes because of a
stack overflow.
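
Reduced to its essence, the recursion looks like the following stand-alone
sketch (illustrative only, with hypothetical names; it is not GlusterFS
code): the callback runs synchronously inside the dispatch function and
immediately asks for the next batch, so every batch of entries adds one
more stack frame.

    /* Hypothetical reduction of the problem: dispatch() invokes its callback
     * synchronously (as readdirp does when parallel-readdir already has the
     * entries cached), and the callback requests the next batch by calling
     * dispatch() again, so the stack grows with the size of the directory. */
    static void dispatch(int batch);

    static void callback(int batch)
    {
        if (batch < 1000000) {   /* a big directory: many more batches follow */
            dispatch(batch + 1); /* nested call: one extra stack frame */
        }
    }

    static void dispatch(int batch)
    {
        callback(batch);         /* completes before dispatch() returns */
    }

    int main(void)
    {
        dispatch(0);             /* deep recursion; may overflow the stack */
        return 0;
    }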

This patch fixes the problem by not allowing nested readdir(p) calls. When
a nested call is detected, it is queued instead of being sent immediately.
The queued request is then processed by the top-level stack function once
the current call finishes.
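
The queuing idea can be illustrated outside GlusterFS with a small,
self-contained sketch (illustrative only and not part of the patch: it uses
C11 <stdatomic.h> in place of liburcu's uatomic helpers, a plain integer in
place of the readdir(p) arguments, hypothetical names such as
queue_dispatch(), and it omits the negative-counter handling used to
release 'local'). Only the caller that moves the counter from 0 to 1
actually winds; a nested attempt just records its argument and returns, and
the top-level loop replays it, so the stack depth stays constant:

    #include <stdatomic.h>
    #include <stdio.h>

    typedef struct {
        atomic_int queue;  /* pending dispatch requests */
        int queue_offset;  /* argument saved for the next dispatch */
    } ctx_t;

    static void queue_dispatch(ctx_t *ctx, int offset);

    /* Callback invoked synchronously from the dispatch itself; instead of
     * recursing into the dispatcher, it goes through the queue again. */
    static void callback(ctx_t *ctx, int offset)
    {
        printf("processed offset %d\n", offset);
        if (offset < 5) {
            queue_dispatch(ctx, offset + 1); /* would recurse without the queue */
        }
    }

    /* Only the 0 -> 1 transition dispatches; other callers just store their
     * argument. The do/while replays queued requests from the top level. */
    static void queue_dispatch(ctx_t *ctx, int offset)
    {
        ctx->queue_offset = offset;

        if (atomic_fetch_add(&ctx->queue, 1) == 0) {
            do {
                callback(ctx, ctx->queue_offset); /* synchronous "wind" */
            } while (atomic_fetch_sub(&ctx->queue, 1) > 1);
        }
    }

    int main(void)
    {
        ctx_t ctx = {.queue = 0, .queue_offset = 0};

        queue_dispatch(&ctx, 0); /* prints offsets 0..5 with constant stack depth */
        return 0;
    }

In the actual patch below, the same roles are played by local->queue,
local->queue_xl/local->queue_offset and STACK_WIND_COOKIE(), with
uatomic_add_return()/uatomic_sub_return() providing the counter updates.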

Backport of 3 patches:
> Upstream-patch: https://github.com/gluster/glusterfs/pull/2170
> Fixes: #2169
> Change-Id: Id763a8a51fb3c3314588ec7c162f649babf33099
> Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>

> Upstream-patch: https://github.com/gluster/glusterfs/pull/2202
> Updates: #2169
> Change-Id: I97e73c0aae74fc5d80c975f56f2f7a64e3e1ae95
> Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>

> Upstream-patch: https://github.com/gluster/glusterfs/pull/2242
> Fixes: #2239
> Change-Id: I6b2e48e87c85de27fad67a12d97abd91fa27c0c1
> Signed-off-by: Pranith Kumar K <pranith.karampuri@phonepe.com>

BUG: 1798897
Change-Id: Id763a8a51fb3c3314588ec7c162f649babf33099
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244549
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
 tests/bugs/distribute/issue-2169.t   |  33 +++++++++
 xlators/cluster/dht/src/dht-common.c | 134 ++++++++++++++++++++++++++++++++---
 xlators/cluster/dht/src/dht-common.h |   5 ++
 3 files changed, 162 insertions(+), 10 deletions(-)
 create mode 100755 tests/bugs/distribute/issue-2169.t

diff --git a/tests/bugs/distribute/issue-2169.t b/tests/bugs/distribute/issue-2169.t
new file mode 100755
index 0000000..91fa72a
--- /dev/null
+++ b/tests/bugs/distribute/issue-2169.t
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+cleanup
+
+TEST glusterd
+TEST ${CLI} volume create ${V0} ${H0}:/$B0/${V0}_0
+TEST ${CLI} volume set ${V0} readdir-ahead on
+TEST ${CLI} volume set ${V0} parallel-readdir on
+TEST ${CLI} volume start ${V0}
+
+TEST ${GFS} --volfile-server ${H0} --volfile-id ${V0} ${M0}
+
+TEST mkdir -p ${M0}/d/d.{000..999}
+
+EXPECT_WITHIN ${UMOUNT_TIMEOUT} "Y" force_umount ${M0}
+
+TEST ${CLI} volume add-brick ${V0} ${H0}:${B0}/${V0}_{1..7}
+
+TEST ${GFS} --volfile-server ${H0} --volfile-id ${V0} ${M0}
+
+ls -l ${M0}/d/ | wc -l
+
+EXPECT_WITHIN ${UMOUNT_TIMEOUT} "Y" force_umount ${M0}
+TEST ${GFS} --volfile-server ${H0} --volfile-id ${V0} ${M0}
+
+ls -l ${M0}/d/ | wc -l
+
+TEST ls ${M0}/d
+
+cleanup
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index 0773092..ce0fbbf 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -24,8 +24,15 @@
 #include <libgen.h>
 #include <signal.h>
 
+#include <urcu/uatomic.h>
+
 int run_defrag = 0;
 
+static int
+dht_rmdir_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int op_ret, int op_errno, gf_dirent_t *entries,
+                       dict_t *xdata);
+
 int
 dht_link2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
 
@@ -6681,6 +6688,94 @@ out:
     return;
 }
 
+/* Execute a READDIR request if no other request is in progress. Otherwise
+ * queue it to be executed when the current one finishes.
+ *
+ * When parallel-readdir is enabled and directory contents are cached, the
+ * callback of a readdirp will be called before returning from STACK_WIND.
+ * If the returned contents are not useful for DHT, and the buffer is not
+ * yet full, a nested readdirp request will be sent. This means that there
+ * will be many recursive calls. In the worst case there might be a stack
+ * overflow.
+ *
+ * To avoid this, we only wind a request if no other request is being wound.
+ * If there's another request, we simple store the values for the next call.
+ * When the thread processing the current wind completes it, it will take
+ * the new arguments and send the request from the top level stack. */
+static void
+dht_queue_readdir(call_frame_t *frame, xlator_t *xl, off_t offset,
+                  fop_readdir_cbk_t cbk)
+{
+    dht_local_t *local;
+    int32_t queue;
+    xlator_t *this = NULL;
+
+    local = frame->local;
+    this = frame->this;
+
+    local->queue_xl = xl;
+    local->queue_offset = offset;
+
+    if (uatomic_add_return(&local->queue, 1) == 1) {
+        /* If we are here it means that we are the first one to send a
+         * readdir request. Any attempt to send more readdir requests will
+         * find local->queue > 1, so it won't do anything. The needed data
+         * to send the request has been stored into local->queue_*.
+         *
+         * Note: this works because we will only have 1 additional request
+         * at most (the one called by the cbk function) while we are
+         * processing another readdir. */
+        do {
+            STACK_WIND_COOKIE(frame, cbk, local->queue_xl, local->queue_xl,
+                              local->queue_xl->fops->readdir, local->fd,
+                              local->size, local->queue_offset, local->xattr);
+
+            /* If a new readdirp request has been added before returning
+             * from winding, we process it. */
+        } while ((queue = uatomic_sub_return(&local->queue, 1)) > 0);
+
+        if (queue < 0) {
+            /* A negative value means that an unwind has been called before
+             * returning from the previous wind. This means that 'local' is
+             * not needed anymore and must be destroyed. */
+            dht_local_wipe(this, local);
+        }
+    }
+}
+
+/* Execute a READDIRP request if no other request is in progress. Otherwise
+ * queue it to be executed when the current one finishes. */
+static void
+dht_queue_readdirp(call_frame_t *frame, xlator_t *xl, off_t offset,
+                   fop_readdirp_cbk_t cbk)
+{
+    dht_local_t *local;
+    int32_t queue;
+    xlator_t *this = NULL;
+
+    local = frame->local;
+    this = frame->this;
+
+    local->queue_xl = xl;
+    local->queue_offset = offset;
+
+    /* Check dht_queue_readdir() comments for an explanation of this. */
+    if (uatomic_add_return(&local->queue, 1) == 1) {
+        do {
+            STACK_WIND_COOKIE(frame, cbk, local->queue_xl, local->queue_xl,
+                              local->queue_xl->fops->readdirp, local->fd,
+                              local->size, local->queue_offset, local->xattr);
+        } while ((queue = uatomic_sub_return(&local->queue, 1)) > 0);
+
+        if (queue < 0) {
+            /* A negative value means that an unwind has been called before
+             * returning from the previous wind. This means that 'local' is
+             * not needed anymore and must be destroyed. */
+            dht_local_wipe(this, local);
+        }
+    }
+}
+
 /* Posix returns op_errno = ENOENT to indicate that there are no more
  * entries
  */
@@ -6950,9 +7045,8 @@ done:
         }
     }
 
-    STACK_WIND_COOKIE(frame, dht_readdirp_cbk, next_subvol, next_subvol,
-                      next_subvol->fops->readdirp, local->fd, local->size,
-                      next_offset, local->xattr);
+    dht_queue_readdirp(frame, next_subvol, next_offset, dht_readdirp_cbk);
+
     return 0;
 }
 
@@ -6970,6 +7064,17 @@ unwind:
     if (prev != dht_last_up_subvol(this))
         op_errno = 0;
 
+    /* If we are inside a recursive call (or not inside a recursive call but
+     * the cbk is completed before the wind returns), local->queue will be 1.
+     * In this case we cannot destroy 'local' because it will be needed by
+     * the caller of STACK_WIND. In this case, we decrease the value to let
+     * the caller know that the operation has terminated and it must destroy
+     * 'local'. If local->queue 0, we can destroy it here because there are
+     * no other users. */
+    if (uatomic_sub_return(&local->queue, 1) >= 0) {
+        frame->local = NULL;
+    }
+
     DHT_STACK_UNWIND(readdirp, frame, op_ret, op_errno, &entries, NULL);
 
     gf_dirent_free(&entries);
@@ -7071,9 +7176,8 @@ done:
         goto unwind;
     }
 
-    STACK_WIND_COOKIE(frame, dht_readdir_cbk, next_subvol, next_subvol,
-                      next_subvol->fops->readdir, local->fd, local->size,
-                      next_offset, NULL);
+    dht_queue_readdir(frame, next_subvol, next_offset, dht_readdir_cbk);
+
     return 0;
 }
 
@@ -7089,6 +7193,17 @@ unwind:
     if (prev != dht_last_up_subvol(this))
         op_errno = 0;
 
+    /* If we are inside a recursive call (or not inside a recursive call but
+     * the cbk is completed before the wind returns), local->queue will be 1.
+     * In this case we cannot destroy 'local' because it will be needed by
+     * the caller of STACK_WIND. In this case, we decrease the value to let
+     * the caller know that the operation has terminated and it must destroy
+     * 'local'. If local->queue 0, we can destroy it here because there are
+     * no other users. */
+    if (uatomic_sub_return(&local->queue, 1) >= 0) {
+        frame->local = NULL;
+    }
+
     if (!skip_hashed_check) {
         DHT_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, NULL);
         gf_dirent_free(&entries);
@@ -7096,6 +7211,7 @@ unwind:
     } else {
         DHT_STACK_UNWIND(readdir, frame, op_ret, op_errno, orig_entries, NULL);
     }
+
     return 0;
 }
 
@@ -7172,11 +7288,9 @@ dht_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
             }
         }
 
-        STACK_WIND_COOKIE(frame, dht_readdirp_cbk, xvol, xvol,
-                          xvol->fops->readdirp, fd, size, yoff, local->xattr);
+        dht_queue_readdirp(frame, xvol, yoff, dht_readdirp_cbk);
     } else {
-        STACK_WIND_COOKIE(frame, dht_readdir_cbk, xvol, xvol,
-                          xvol->fops->readdir, fd, size, yoff, local->xattr);
+        dht_queue_readdir(frame, xvol, yoff, dht_readdir_cbk);
     }
 
     return 0;
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index 92f1b89..132b3b3 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -369,6 +369,11 @@ struct dht_local {
 
     dht_dir_transaction_t lock[2], *current;
 
+    /* for nested readdirs */
+    xlator_t *queue_xl;
+    off_t queue_offset;
+    int32_t queue;
+
     /* inodelks during filerename for backward compatibility */
     dht_lock_t **rename_inodelk_backward_compatible;
     int rename_inodelk_bc_count;
-- 
1.8.3.1
