From a1d000c68bd02ae083ef4d0405b20f80ea019365 Mon Sep 17 00:00:00 2001 From: Ravishankar N Date: Fri, 9 Dec 2016 09:50:43 +0530 Subject: [PATCH 236/246] syncop: fix conditional wait bug in parallel dir scan Backport of: http://review.gluster.org/16073 Problem: The issue as seen by the user is detailed in the BZ but what is happening is if the no. of items in the wait queue == max-qlen, syncop_mt_dir_scan() does a pthread_cond_wait until the launched synctask workers dequeue the queue. But if for some reason the worker fails, the queue is never emptied due to which further invocations of syncop_mt_dir_scan() are blocked forever. Fix: Made some changes to _dir_scan_job_fn - If a worker encounters an error while processing an entry, notify the readdir loop in syncop_mt_dir_scan() of the error but continue to process other entries in the queue, decrementing the qlen as and when we dequeue elements, and ending only when the queue is empty. - If the readdir loop in syncop_mt_dir_scan() gets an error from the worker, stop the readdir+queueing of further entries. 
Change-Id: Id132a9795b1322b7e601f5de8dfd5528d664cef9 BUG: 1403120 Signed-off-by: Ravishankar N Reviewed-on: https://code.engineering.redhat.com/gerrit/92588 Reviewed-by: Pranith Kumar Karampuri Tested-by: Pranith Kumar Karampuri --- libglusterfs/src/syncop-utils.c | 15 +++++++----- tests/bugs/core/bug-1402841.t-mt-dir-scan-race.t | 31 ++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 6 deletions(-) create mode 100755 tests/bugs/core/bug-1402841.t-mt-dir-scan-race.t diff --git a/libglusterfs/src/syncop-utils.c b/libglusterfs/src/syncop-utils.c index 8c25dd1..696e36b 100644 --- a/libglusterfs/src/syncop-utils.c +++ b/libglusterfs/src/syncop-utils.c @@ -266,9 +266,10 @@ _dir_scan_job_fn (void *data) entry = NULL; pthread_mutex_lock (scan_data->mut); { - if (ret || list_empty (&scan_data->q->list)) { - (*scan_data->jobs_running)--; + if (ret) *scan_data->retval |= ret; + if (list_empty (&scan_data->q->list)) { + (*scan_data->jobs_running)--; pthread_cond_broadcast (scan_data->cond); } else { entry = list_first_entry (&scan_data->q->list, @@ -406,10 +407,13 @@ syncop_mt_dir_scan (call_frame_t *frame, xlator_t *subvol, loc_t *loc, int pid, ret = fn (subvol, entry, loc, data); gf_dirent_entry_free (entry); if (ret) - break; + goto out; continue; } + if (retval) /*Any jobs failed?*/ + goto out; + pthread_mutex_lock (&mut); { while (qlen == max_qlen) @@ -423,8 +427,7 @@ syncop_mt_dir_scan (call_frame_t *frame, xlator_t *subvol, loc_t *loc, int pid, } } pthread_mutex_unlock (&mut); - if (retval) /*Any jobs failed?*/ - break; + if (!entry) continue; @@ -433,7 +436,7 @@ syncop_mt_dir_scan (call_frame_t *frame, xlator_t *subvol, loc_t *loc, int pid, &retval, &mut, &cond, &jobs_running, &qlen, fn, data); if (ret) - break; + goto out; } } diff --git a/tests/bugs/core/bug-1402841.t-mt-dir-scan-race.t b/tests/bugs/core/bug-1402841.t-mt-dir-scan-race.t new file mode 100755 index 0000000..e31c810 --- /dev/null +++ b/tests/bugs/core/bug-1402841.t-mt-dir-scan-race.t @@ -0,0 
+1,31 @@ +#!/bin/bash +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1} +TEST $CLI volume set $V0 self-heal-daemon off +TEST $CLI volume set $V0 cluster.shd-wait-qlength 100 +TEST $CLI volume start $V0 + +TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0; +touch $M0/file{1..200} + +TEST kill_brick $V0 $H0 $B0/${V0}1 +for i in {1..200}; do echo hello>$M0/file$i; done +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 + +EXPECT "200" get_pending_heal_count $V0 +TEST $CLI volume set $V0 self-heal-daemon on +TEST $CLI volume heal $V0 +TEST $CLI volume set $V0 self-heal-daemon off +EXPECT_NOT "^0$" get_pending_heal_count $V0 +TEST $CLI volume set $V0 self-heal-daemon on +TEST $CLI volume heal $V0 +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 +TEST umount $M0 +cleanup; -- 2.9.3