|
|
3604df |
From a1d000c68bd02ae083ef4d0405b20f80ea019365 Mon Sep 17 00:00:00 2001
|
|
|
3604df |
From: Ravishankar N <ravishankar@redhat.com>
|
|
|
3604df |
Date: Fri, 9 Dec 2016 09:50:43 +0530
|
|
|
3604df |
Subject: [PATCH 236/246] syncop: fix conditional wait bug in parallel dir scan
|
|
|
3604df |
|
|
|
3604df |
Backport of: http://review.gluster.org/16073
|
|
|
3604df |
|
|
|
3604df |
Problem:
|
|
|
3604df |
The issue as seen by the user is detailed in the BZ but what is
|
|
|
3604df |
happening is if the no. of items in the wait queue == max-qlen,
|
|
|
3604df |
syncop_mt_dir_scan() does a pthread_cond_wait until the launched
|
|
|
3604df |
synctask workers dequeue the queue. But if for some reason the worker
|
|
|
3604df |
fails, the queue is never emptied due to which further invocations of
|
|
|
3604df |
syncop_mt_dir_scan() are blocked forever.
|
|
|
3604df |
|
|
|
3604df |
Fix: Made some changes to _dir_scan_job_fn
|
|
|
3604df |
|
|
|
3604df |
- If a worker encounters an error while processing an entry, notify the
|
|
|
3604df |
readdir loop in syncop_mt_dir_scan() of the error but continue to process
|
|
|
3604df |
other entries in the queue, decrementing the qlen as and when we dequeue
|
|
|
3604df |
elements, and ending only when the queue is empty.
|
|
|
3604df |
|
|
|
3604df |
- If the readdir loop in syncop_mt_dir_scan() gets an error from the
|
|
|
3604df |
worker, stop the readdir+queueing of further entries.
|
|
|
3604df |
|
|
|
3604df |
Change-Id: Id132a9795b1322b7e601f5de8dfd5528d664cef9
|
|
|
3604df |
BUG: 1403120
|
|
|
3604df |
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
|
|
|
3604df |
Reviewed-on: https://code.engineering.redhat.com/gerrit/92588
|
|
|
3604df |
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
|
|
|
3604df |
Tested-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
|
|
|
3604df |
---
|
|
|
3604df |
libglusterfs/src/syncop-utils.c | 15 +++++++-----
|
|
|
3604df |
tests/bugs/core/bug-1402841.t-mt-dir-scan-race.t | 31 ++++++++++++++++++++++++
|
|
|
3604df |
2 files changed, 40 insertions(+), 6 deletions(-)
|
|
|
3604df |
create mode 100755 tests/bugs/core/bug-1402841.t-mt-dir-scan-race.t
|
|
|
3604df |
|
|
|
3604df |
diff --git a/libglusterfs/src/syncop-utils.c b/libglusterfs/src/syncop-utils.c
|
|
|
3604df |
index 8c25dd1..696e36b 100644
|
|
|
3604df |
--- a/libglusterfs/src/syncop-utils.c
|
|
|
3604df |
+++ b/libglusterfs/src/syncop-utils.c
|
|
|
3604df |
@@ -266,9 +266,10 @@ _dir_scan_job_fn (void *data)
|
|
|
3604df |
entry = NULL;
|
|
|
3604df |
pthread_mutex_lock (scan_data->mut);
|
|
|
3604df |
{
|
|
|
3604df |
- if (ret || list_empty (&scan_data->q->list)) {
|
|
|
3604df |
- (*scan_data->jobs_running)--;
|
|
|
3604df |
+ if (ret)
|
|
|
3604df |
*scan_data->retval |= ret;
|
|
|
3604df |
+ if (list_empty (&scan_data->q->list)) {
|
|
|
3604df |
+ (*scan_data->jobs_running)--;
|
|
|
3604df |
pthread_cond_broadcast (scan_data->cond);
|
|
|
3604df |
} else {
|
|
|
3604df |
entry = list_first_entry (&scan_data->q->list,
|
|
|
3604df |
@@ -406,10 +407,13 @@ syncop_mt_dir_scan (call_frame_t *frame, xlator_t *subvol, loc_t *loc, int pid,
|
|
|
3604df |
ret = fn (subvol, entry, loc, data);
|
|
|
3604df |
gf_dirent_entry_free (entry);
|
|
|
3604df |
if (ret)
|
|
|
3604df |
- break;
|
|
|
3604df |
+ goto out;
|
|
|
3604df |
continue;
|
|
|
3604df |
}
|
|
|
3604df |
|
|
|
3604df |
+ if (retval) /*Any jobs failed?*/
|
|
|
3604df |
+ goto out;
|
|
|
3604df |
+
|
|
|
3604df |
pthread_mutex_lock (&mut);
|
|
|
3604df |
{
|
|
|
3604df |
while (qlen == max_qlen)
|
|
|
3604df |
@@ -423,8 +427,7 @@ syncop_mt_dir_scan (call_frame_t *frame, xlator_t *subvol, loc_t *loc, int pid,
|
|
|
3604df |
}
|
|
|
3604df |
}
|
|
|
3604df |
pthread_mutex_unlock (&mut);
|
|
|
3604df |
- if (retval) /*Any jobs failed?*/
|
|
|
3604df |
- break;
|
|
|
3604df |
+
|
|
|
3604df |
|
|
|
3604df |
if (!entry)
|
|
|
3604df |
continue;
|
|
|
3604df |
@@ -433,7 +436,7 @@ syncop_mt_dir_scan (call_frame_t *frame, xlator_t *subvol, loc_t *loc, int pid,
|
|
|
3604df |
&retval, &mut, &cond,
|
|
|
3604df |
&jobs_running, &qlen, fn, data);
|
|
|
3604df |
if (ret)
|
|
|
3604df |
- break;
|
|
|
3604df |
+ goto out;
|
|
|
3604df |
}
|
|
|
3604df |
}
|
|
|
3604df |
|
|
|
3604df |
diff --git a/tests/bugs/core/bug-1402841.t-mt-dir-scan-race.t b/tests/bugs/core/bug-1402841.t-mt-dir-scan-race.t
|
|
|
3604df |
new file mode 100755
|
|
|
3604df |
index 0000000..e31c810
|
|
|
3604df |
--- /dev/null
|
|
|
3604df |
+++ b/tests/bugs/core/bug-1402841.t-mt-dir-scan-race.t
|
|
|
3604df |
@@ -0,0 +1,31 @@
|
|
|
3604df |
+#!/bin/bash
|
|
|
3604df |
+. $(dirname $0)/../../include.rc
|
|
|
3604df |
+. $(dirname $0)/../../volume.rc
|
|
|
3604df |
+cleanup;
|
|
|
3604df |
+
|
|
|
3604df |
+TEST glusterd
|
|
|
3604df |
+TEST pidof glusterd
|
|
|
3604df |
+TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1}
|
|
|
3604df |
+TEST $CLI volume set $V0 self-heal-daemon off
|
|
|
3604df |
+TEST $CLI volume set $V0 cluster.shd-wait-qlength 100
|
|
|
3604df |
+TEST $CLI volume start $V0
|
|
|
3604df |
+
|
|
|
3604df |
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0;
|
|
|
3604df |
+touch $M0/file{1..200}
|
|
|
3604df |
+
|
|
|
3604df |
+TEST kill_brick $V0 $H0 $B0/${V0}1
|
|
|
3604df |
+for i in {1..200}; do echo hello>$M0/file$i; done
|
|
|
3604df |
+TEST $CLI volume start $V0 force
|
|
|
3604df |
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
|
|
|
3604df |
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
|
|
|
3604df |
+
|
|
|
3604df |
+EXPECT "200" get_pending_heal_count $V0
|
|
|
3604df |
+TEST $CLI volume set $V0 self-heal-daemon on
|
|
|
3604df |
+TEST $CLI volume heal $V0
|
|
|
3604df |
+TEST $CLI volume set $V0 self-heal-daemon off
|
|
|
3604df |
+EXPECT_NOT "^0$" get_pending_heal_count $V0
|
|
|
3604df |
+TEST $CLI volume set $V0 self-heal-daemon on
|
|
|
3604df |
+TEST $CLI volume heal $V0
|
|
|
3604df |
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
|
|
|
3604df |
+TEST umount $M0
|
|
|
3604df |
+cleanup;
|
|
|
3604df |
--
|
|
|
3604df |
2.9.3
|
|
|
3604df |
|