From fad234b5a62df48b7abc726549f2abb6b0af7c04 Mon Sep 17 00:00:00 2001 From: Mohit Agrawal Date: Tue, 16 Oct 2018 07:50:47 +0530 Subject: [PATCH 402/404] core: glusterfsd keeping fd open in index xlator Problem: At the time of processing GF_EVENT_PARENT_DOWN at brick xlator, it forwards the event to next xlator only while xlator ensures no stub is in progress. At io-thread xlator it decreases stub_cnt before the process a stub and notify EVENT to next xlator Solution: Introduce a new counter to save stub_cnt and decrease the counter after process the stub completely at io-thread xlator. To avoid brick crash at the time of call xlator_mem_cleanup move only brick xlator if detach brick name has found in the graph Note: Thanks to pranith for sharing a simple reproducer to reproduce the same > fixes bz#1637934 > Change-Id: I1a694a001f7a5417e8771e3adf92c518969b6baa > (Cherry pick from commit 7bf95631b52bd05b06122180f8bd4aa62c70b1a9) > (Reviewed on upstream link https://review.gluster.org/#/c/glusterfs/+/21379/) Change-Id: I54b8ebb19819f9bbcbdd1448474ab084c0fd2eb6 BUG: 1631372 Signed-off-by: Mohit Agrawal Reviewed-on: https://code.engineering.redhat.com/gerrit/152908 Tested-by: RHGS Build Bot Reviewed-by: Sunil Kumar Heggodu Gopala Acharya --- glusterfsd/src/glusterfsd-mgmt.c | 15 +---- tests/bugs/glusterd/brick-mux-fd-cleanup.t | 78 +++++++++++++++++++++++++ xlators/performance/io-threads/src/io-threads.c | 23 ++++---- xlators/performance/io-threads/src/io-threads.h | 3 +- 4 files changed, 94 insertions(+), 25 deletions(-) create mode 100644 tests/bugs/glusterd/brick-mux-fd-cleanup.t diff --git a/glusterfsd/src/glusterfsd-mgmt.c b/glusterfsd/src/glusterfsd-mgmt.c index cbd436a..e3fceeb 100644 --- a/glusterfsd/src/glusterfsd-mgmt.c +++ b/glusterfsd/src/glusterfsd-mgmt.c @@ -270,6 +270,7 @@ xlator_mem_cleanup (xlator_t *this) { top = glusterfsd_ctx->active->first; LOCK (&ctx->volfile_lock); /* TODO here we have leak for xlator node in a graph */ + /* Need to move only top xlator from a graph */ for (trav_p = &top->children; *trav_p; trav_p = &(*trav_p)->next) { victim = (*trav_p)->xlator; if (victim->call_cleanup && !strcmp (victim->name, this->name)) { @@ -277,20 +278,6 @@ xlator_mem_cleanup (xlator_t *this) { break; } } - /* TODO Sometime brick xlator is not moved from graph so followed below - approach to move brick xlator from a graph, will move specific brick - xlator from graph only while inode table and mem_acct are cleaned up - */ - trav_p = &top->children; - while (*trav_p) { - victim = (*trav_p)->xlator; - if (victim->call_cleanup && !victim->itable && !victim->mem_acct) { - (*trav_p) = (*trav_p)->next; - } else { - trav_p = &(*trav_p)->next; - } - } - UNLOCK (&ctx->volfile_lock); } } diff --git a/tests/bugs/glusterd/brick-mux-fd-cleanup.t b/tests/bugs/glusterd/brick-mux-fd-cleanup.t new file mode 100644 index 0000000..de11c17 --- /dev/null +++ b/tests/bugs/glusterd/brick-mux-fd-cleanup.t @@ -0,0 +1,78 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +#This .t tests that the fds from client are closed on brick when gluster volume +#stop is executed in brick-mux setup. + +cleanup; +TEST glusterd +TEST pidof glusterd + +function keep_fd_open { +#This function has to be run as background job because opening the fd in +#foreground and running commands is leading to flush calls on these fds +#which is making it very difficult to create the race where fds will be left +#open even after the brick dies. + exec 5>$M1/a + exec 6>$M1/b + while [ -f $M0/a ]; do sleep 1; done +} + +function count_open_files { + local brick_pid="$1" + local pattern="$2" + ls -l /proc/$brick_pid/fd | grep -i "$pattern" | wc -l +} + +TEST $CLI volume set all cluster.brick-multiplex on +TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1} +TEST $CLI volume create $V1 replica 2 $H0:$B0/${V1}{2,3} +#Have same configuration on both bricks so that they are multiplexed +#Delay flush fop for a second +TEST $CLI volume heal $V0 disable +TEST $CLI volume heal $V1 disable +TEST $CLI volume set $V0 delay-gen posix +TEST $CLI volume set $V0 delay-gen.enable flush +TEST $CLI volume set $V0 delay-gen.delay-percentage 100 +TEST $CLI volume set $V0 delay-gen.delay-duration 1000000 +TEST $CLI volume set $V1 delay-gen posix +TEST $CLI volume set $V1 delay-gen.enable flush +TEST $CLI volume set $V1 delay-gen.delay-percentage 100 +TEST $CLI volume set $V1 delay-gen.delay-duration 1000000 + +TEST $CLI volume start $V0 +TEST $CLI volume start $V1 + +TEST $GFS -s $H0 --volfile-id=$V0 --direct-io-mode=enable $M0 +TEST $GFS -s $H0 --volfile-id=$V1 --direct-io-mode=enable $M1 + +TEST touch $M0/a +keep_fd_open & +TEST $CLI volume profile $V1 start +brick_pid=$(get_brick_pid $V1 $H0 $B0/${V1}2) +TEST count_open_files $brick_pid "$B0/${V1}2/a" +TEST count_open_files $brick_pid "$B0/${V1}2/b" +TEST count_open_files $brick_pid "$B0/${V1}3/a" +TEST count_open_files $brick_pid "$B0/${V1}3/b" + +#If any other flush fops are introduced into the system other than the one at +#cleanup it interferes with the race, so test for it +EXPECT "^0$" echo "$($CLI volume profile $V1 info incremental | grep -i flush | wc -l)" +#Stop the volume +TEST $CLI volume stop $V1 + +#Wait for cleanup resources or volume V1 +EXPECT_WITHIN $GRAPH_SWITCH_TIMEOUT "^0$" count_open_files $brick_pid "$B0/${V1}2/a" +EXPECT_WITHIN $GRAPH_SWITCH_TIMEOUT "^0$" count_open_files $brick_pid "$B0/${V1}2/b" +EXPECT_WITHIN $GRAPH_SWITCH_TIMEOUT "^0$" count_open_files $brick_pid "$B0/${V1}3/a" +EXPECT_WITHIN $GRAPH_SWITCH_TIMEOUT "^0$" count_open_files $brick_pid "$B0/${V1}3/b" + +TEST rm -f $M0/a #Exit keep_fd_open() +wait + +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M1 + +cleanup diff --git a/xlators/performance/io-threads/src/io-threads.c b/xlators/performance/io-threads/src/io-threads.c index 41d48ab..2944c7d 100644 --- a/xlators/performance/io-threads/src/io-threads.c +++ b/xlators/performance/io-threads/src/io-threads.c @@ -120,7 +120,7 @@ __iot_dequeue (iot_conf_t *conf, int *pri) if (!stub) return NULL; - GF_ATOMIC_DEC(conf->queue_size); + conf->queue_size--; conf->queue_sizes[*pri]--; return stub; @@ -153,7 +153,8 @@ __iot_enqueue (iot_conf_t *conf, call_stub_t *stub, int pri) } list_add_tail (&stub->list, &ctx->reqs); - GF_ATOMIC_INC(conf->queue_size); + conf->queue_size++; + GF_ATOMIC_INC(conf->stub_cnt); conf->queue_sizes[pri]++; } @@ -182,7 +183,7 @@ iot_worker (void *data) conf->ac_iot_count[pri]--; pri = -1; } - while (GF_ATOMIC_GET(conf->queue_size) == 0) { + while (conf->queue_size == 0) { if (conf->down) { bye = _gf_true;/*Avoid sleep*/ break; @@ -220,8 +221,10 @@ iot_worker (void *data) } pthread_mutex_unlock (&conf->mutex); - if (stub) /* guard against spurious wakeups */ + if (stub) { /* guard against spurious wakeups */ call_resume (stub); + GF_ATOMIC_DEC(conf->stub_cnt); + } stub = NULL; if (bye) @@ -816,7 +819,7 @@ __iot_workers_scale (iot_conf_t *conf) gf_msg_debug (conf->this->name, 0, "scaled threads to %d (queue_size=%d/%d)", conf->curr_count, - GF_ATOMIC_GET(conf->queue_size), scale); + conf->queue_size, scale); } else { break; } @@ -1030,7 +1033,7 @@ init (xlator_t *this) bool, out); conf->this = this; - GF_ATOMIC_INIT(conf->queue_size, 0); + GF_ATOMIC_INIT(conf->stub_cnt, 0); for (i = 0; i < IOT_PRI_MAX; i++) { INIT_LIST_HEAD (&conf->clients[i]); @@ -1075,7 +1078,7 @@ notify (xlator_t *this, int32_t event, void *data, ...) { iot_conf_t *conf = this->private; xlator_t *victim = data; - uint64_t queue_size = 0; + uint64_t stub_cnt = 0; struct timespec sleep_till = {0, }; if (GF_EVENT_PARENT_DOWN == event) { @@ -1083,14 +1086,14 @@ notify (xlator_t *this, int32_t event, void *data, ...) clock_gettime(CLOCK_REALTIME, &sleep_till); sleep_till.tv_sec += 1; /* Wait for draining stub from queue before notify PARENT_DOWN */ - queue_size = GF_ATOMIC_GET(conf->queue_size); + stub_cnt = GF_ATOMIC_GET(conf->stub_cnt); pthread_mutex_lock(&conf->mutex); { - while (queue_size) { + while (stub_cnt) { (void)pthread_cond_timedwait(&conf->cond, &conf->mutex, &sleep_till); - queue_size = GF_ATOMIC_GET(conf->queue_size); + stub_cnt = GF_ATOMIC_GET(conf->stub_cnt); } } pthread_mutex_unlock(&conf->mutex); diff --git a/xlators/performance/io-threads/src/io-threads.h b/xlators/performance/io-threads/src/io-threads.h index 7a6973c..57a136e 100644 --- a/xlators/performance/io-threads/src/io-threads.h +++ b/xlators/performance/io-threads/src/io-threads.h @@ -75,7 +75,8 @@ struct iot_conf { int32_t ac_iot_limit[IOT_PRI_MAX]; int32_t ac_iot_count[IOT_PRI_MAX]; int queue_sizes[IOT_PRI_MAX]; - gf_atomic_t queue_size; + int32_t queue_size; + gf_atomic_t stub_cnt; pthread_attr_t w_attr; gf_boolean_t least_priority; /*Enable/Disable least-priority */ -- 1.8.3.1