From ba57b043db1e19196cf860baeeeb1acfc9985cd2 Mon Sep 17 00:00:00 2001
From: Xavi Hernandez
Date: Wed, 24 Feb 2021 15:04:23 +0100
Subject: [PATCH 557/584] cluster/dht: Fix stack overflow in readdir(p)

When parallel-readdir is enabled, readdir(p) requests sent by DHT can
be immediately processed and answered in the same thread, before the
call to STACK_WIND_COOKIE() completes. This means that the readdir(p)
cbk is processed synchronously. In some cases it may decide to send
another readdir(p) request, which causes a recursive call.

When some special conditions happen and the directories are big, the
number of nested calls can grow so high that the process crashes
because of a stack overflow.

This patch fixes the problem by not allowing nested readdir(p) calls.
When a nested call is detected, it is queued instead of being sent.
The queued request is processed by the top-level stack function once
the current call finishes.
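The pattern at the heart of the fix can be reduced to the following
minimal sketch (illustrative only; 'req_ctx', 'queue_or_send' and
'send_request' are hypothetical stand-ins for dht_local_t,
dht_queue_readdir() and STACK_WIND_COOKIE()):

    #include <stdint.h>
    #include <urcu/uatomic.h>

    struct req_ctx {
        int32_t queue;  /* pending request counter, updated atomically */
        int64_t offset; /* argument of the most recently queued request */
    };

    /* Hypothetical stand-in for STACK_WIND_COOKIE(). In the real code,
     * the callback may run inside this call and re-enter queue_or_send()
     * before it returns. */
    static void
    send_request(struct req_ctx *c, int64_t offset)
    {
        (void)c;
        (void)offset;
    }

    /* A nested caller only records its arguments and bumps the counter;
     * the loop of the first caller picks them up afterwards, so the
     * stack depth stays constant no matter how many requests are
     * chained. */
    static void
    queue_or_send(struct req_ctx *c, int64_t offset)
    {
        c->offset = offset;

        /* The first caller sees 1 and winds; a nested caller sees > 1
         * and simply returns. */
        if (uatomic_add_return(&c->queue, 1) == 1) {
            do {
                send_request(c, c->offset);
            } while (uatomic_sub_return(&c->queue, 1) > 0);
        }
    }

The key property is that the counter is decremented only after the wind
returns, so a synchronous callback can never start a second wind deeper
in the stack; it can only increment the counter and let the loop
re-issue the request from the top level.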
Backport of 3 patches:

> Upstream-patch: https://github.com/gluster/glusterfs/pull/2170
> Fixes: #2169
> Change-Id: Id763a8a51fb3c3314588ec7c162f649babf33099
> Signed-off-by: Xavi Hernandez

> Upstream-patch: https://github.com/gluster/glusterfs/pull/2202
> Updates: #2169
> Change-Id: I97e73c0aae74fc5d80c975f56f2f7a64e3e1ae95
> Signed-off-by: Xavi Hernandez

> Upstream-patch: https://github.com/gluster/glusterfs/pull/2242
> Fixes: #2239
> Change-Id: I6b2e48e87c85de27fad67a12d97abd91fa27c0c1
> Signed-off-by: Pranith Kumar K

BUG: 1798897
Change-Id: Id763a8a51fb3c3314588ec7c162f649babf33099
Signed-off-by: Xavi Hernandez
Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244549
Tested-by: RHGS Build Bot
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya
---
 tests/bugs/distribute/issue-2169.t   |  33 +++++++++
 xlators/cluster/dht/src/dht-common.c | 134 ++++++++++++++++++++++++++++++++---
 xlators/cluster/dht/src/dht-common.h |   5 ++
 3 files changed, 162 insertions(+), 10 deletions(-)
 create mode 100755 tests/bugs/distribute/issue-2169.t

diff --git a/tests/bugs/distribute/issue-2169.t b/tests/bugs/distribute/issue-2169.t
new file mode 100755
index 0000000..91fa72a
--- /dev/null
+++ b/tests/bugs/distribute/issue-2169.t
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+cleanup
+
+TEST glusterd
+TEST ${CLI} volume create ${V0} ${H0}:/$B0/${V0}_0
+TEST ${CLI} volume set ${V0} readdir-ahead on
+TEST ${CLI} volume set ${V0} parallel-readdir on
+TEST ${CLI} volume start ${V0}
+
+TEST ${GFS} --volfile-server ${H0} --volfile-id ${V0} ${M0}
+
+TEST mkdir -p ${M0}/d/d.{000..999}
+
+EXPECT_WITHIN ${UMOUNT_TIMEOUT} "Y" force_umount ${M0}
+
+TEST ${CLI} volume add-brick ${V0} ${H0}:${B0}/${V0}_{1..7}
+
+TEST ${GFS} --volfile-server ${H0} --volfile-id ${V0} ${M0}
+
+ls -l ${M0}/d/ | wc -l
+
+EXPECT_WITHIN ${UMOUNT_TIMEOUT} "Y" force_umount ${M0}
+TEST ${GFS} --volfile-server ${H0} --volfile-id ${V0} ${M0}
+
+ls -l ${M0}/d/ | wc -l
+
+TEST ls ${M0}/d
+
+cleanup

diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index 0773092..ce0fbbf 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -24,8 +24,15 @@
 #include <libgen.h>
 #include <signal.h>
 
+#include <urcu/uatomic.h>
+
 int run_defrag = 0;
 
+static int
+dht_rmdir_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int op_ret, int op_errno, gf_dirent_t *entries,
+                       dict_t *xdata);
+
 int
 dht_link2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
 
@@ -6681,6 +6688,94 @@ out:
     return;
 }
 
+/* Execute a READDIR request if no other request is in progress. Otherwise
+ * queue it to be executed when the current one finishes.
+ *
+ * When parallel-readdir is enabled and directory contents are cached, the
+ * callback of a readdirp will be called before returning from STACK_WIND.
+ * If the returned contents are not useful for DHT, and the buffer is not
+ * yet full, a nested readdirp request will be sent. This means that there
+ * will be many recursive calls. In the worst case there might be a stack
+ * overflow.
+ *
+ * To avoid this, we only wind a request if no other request is being wound.
+ * If there's another request, we simply store the values for the next call.
+ * When the thread processing the current wind completes it, it will take
+ * the new arguments and send the request from the top level stack. */
+static void
+dht_queue_readdir(call_frame_t *frame, xlator_t *xl, off_t offset,
+                  fop_readdir_cbk_t cbk)
+{
+    dht_local_t *local;
+    int32_t queue;
+    xlator_t *this = NULL;
+
+    local = frame->local;
+    this = frame->this;
+
+    local->queue_xl = xl;
+    local->queue_offset = offset;
+
+    if (uatomic_add_return(&local->queue, 1) == 1) {
+        /* If we are here it means that we are the first one to send a
+         * readdir request. Any attempt to send more readdir requests will
+         * find local->queue > 1, so it won't do anything. The needed data
+         * to send the request has been stored into local->queue_*.
+         *
+         * Note: this works because we will only have 1 additional request
+         *       at most (the one called by the cbk function) while we are
+         *       processing another readdir. */
+        do {
+            STACK_WIND_COOKIE(frame, cbk, local->queue_xl, local->queue_xl,
+                              local->queue_xl->fops->readdir, local->fd,
+                              local->size, local->queue_offset, local->xattr);
+
+            /* If a new readdirp request has been added before returning
+             * from winding, we process it. */
+        } while ((queue = uatomic_sub_return(&local->queue, 1)) > 0);
+
+        if (queue < 0) {
+            /* A negative value means that an unwind has been called before
+             * returning from the previous wind. This means that 'local' is
+             * not needed anymore and must be destroyed. */
+            dht_local_wipe(this, local);
+        }
+    }
+}
+
+/* Execute a READDIRP request if no other request is in progress. Otherwise
+ * queue it to be executed when the current one finishes. */
+static void
+dht_queue_readdirp(call_frame_t *frame, xlator_t *xl, off_t offset,
+                   fop_readdirp_cbk_t cbk)
+{
+    dht_local_t *local;
+    int32_t queue;
+    xlator_t *this = NULL;
+
+    local = frame->local;
+    this = frame->this;
+
+    local->queue_xl = xl;
+    local->queue_offset = offset;
+
+    /* Check dht_queue_readdir() comments for an explanation of this. */
+    if (uatomic_add_return(&local->queue, 1) == 1) {
+        do {
+            STACK_WIND_COOKIE(frame, cbk, local->queue_xl, local->queue_xl,
+                              local->queue_xl->fops->readdirp, local->fd,
+                              local->size, local->queue_offset, local->xattr);
+        } while ((queue = uatomic_sub_return(&local->queue, 1)) > 0);
+
+        if (queue < 0) {
+            /* A negative value means that an unwind has been called before
+             * returning from the previous wind. This means that 'local' is
+             * not needed anymore and must be destroyed. */
+            dht_local_wipe(this, local);
+        }
+    }
+}
+
 /* Posix returns op_errno = ENOENT to indicate that there are no more
  * entries
  */
@@ -6950,9 +7045,8 @@ done:
         }
     }
 
-    STACK_WIND_COOKIE(frame, dht_readdirp_cbk, next_subvol, next_subvol,
-                      next_subvol->fops->readdirp, local->fd, local->size,
-                      next_offset, local->xattr);
+    dht_queue_readdirp(frame, next_subvol, next_offset, dht_readdirp_cbk);
+
     return 0;
 }
 
@@ -6970,6 +7064,17 @@ unwind:
     if (prev != dht_last_up_subvol(this))
         op_errno = 0;
 
+    /* If we are inside a recursive call (or not inside a recursive call but
+     * the cbk is completed before the wind returns), local->queue will be 1.
+     * In this case we cannot destroy 'local' because it will be needed by
+     * the caller of STACK_WIND. In this case, we decrease the value to let
+     * the caller know that the operation has terminated and it must destroy
+     * 'local'. If local->queue is 0, we can destroy it here because there
+     * are no other users. */
+    if (uatomic_sub_return(&local->queue, 1) >= 0) {
+        frame->local = NULL;
+    }
+
     DHT_STACK_UNWIND(readdirp, frame, op_ret, op_errno, &entries, NULL);
 
     gf_dirent_free(&entries);
@@ -7071,9 +7176,8 @@ done:
         goto unwind;
     }
 
-    STACK_WIND_COOKIE(frame, dht_readdir_cbk, next_subvol, next_subvol,
-                      next_subvol->fops->readdir, local->fd, local->size,
-                      next_offset, NULL);
+    dht_queue_readdir(frame, next_subvol, next_offset, dht_readdir_cbk);
+
     return 0;
 }
 
@@ -7089,6 +7193,17 @@ unwind:
     if (prev != dht_last_up_subvol(this))
         op_errno = 0;
 
+    /* If we are inside a recursive call (or not inside a recursive call but
+     * the cbk is completed before the wind returns), local->queue will be 1.
+     * In this case we cannot destroy 'local' because it will be needed by
+     * the caller of STACK_WIND. In this case, we decrease the value to let
+     * the caller know that the operation has terminated and it must destroy
+     * 'local'. If local->queue is 0, we can destroy it here because there
+     * are no other users. */
+    if (uatomic_sub_return(&local->queue, 1) >= 0) {
+        frame->local = NULL;
+    }
+
     if (!skip_hashed_check) {
         DHT_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, NULL);
         gf_dirent_free(&entries);
@@ -7096,6 +7211,7 @@ unwind:
     } else {
         DHT_STACK_UNWIND(readdir, frame, op_ret, op_errno, orig_entries, NULL);
     }
+
     return 0;
 }
 
@@ -7172,11 +7288,9 @@ dht_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
             }
         }
 
-        STACK_WIND_COOKIE(frame, dht_readdirp_cbk, xvol, xvol,
-                          xvol->fops->readdirp, fd, size, yoff, local->xattr);
+        dht_queue_readdirp(frame, xvol, yoff, dht_readdirp_cbk);
     } else {
-        STACK_WIND_COOKIE(frame, dht_readdir_cbk, xvol, xvol,
-                          xvol->fops->readdir, fd, size, yoff, local->xattr);
+        dht_queue_readdir(frame, xvol, yoff, dht_readdir_cbk);
     }
 
     return 0;

diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index 92f1b89..132b3b3 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -369,6 +369,11 @@ struct dht_local {
 
     dht_dir_transaction_t lock[2], *current;
 
+    /* for nested readdirs */
+    xlator_t *queue_xl;
+    off_t queue_offset;
+    int32_t queue;
+
     /* inodelks during filerename for backward compatibility */
     dht_lock_t **rename_inodelk_backward_compatible;
     int rename_inodelk_bc_count;
-- 
1.8.3.1