3604df
From a1d000c68bd02ae083ef4d0405b20f80ea019365 Mon Sep 17 00:00:00 2001
3604df
From: Ravishankar N <ravishankar@redhat.com>
3604df
Date: Fri, 9 Dec 2016 09:50:43 +0530
3604df
Subject: [PATCH 236/246] syncop: fix conditional wait bug in parallel dir scan
3604df
3604df
Backport of:  http://review.gluster.org/16073
3604df
3604df
Problem:
3604df
The issue as seen by the user is detailed in the BZ but what is
3604df
happening is if the no. of items in the wait queue == max-qlen,
3604df
syncop_mt_dir_scan() does a pthread_cond_wait until the launched
3604df
synctask workers dequeue the queue. But if for some reason the worker
3604df
fails, the queue is never emptied due to which further invocations of
3604df
syncop_mt_dir_scan() are blocked forever.
3604df
3604df
Fix: Made some changes to _dir_scan_job_fn
3604df
3604df
- If a worker encounters error while processing an entry, notify the
3604df
  readdir loop in syncop_mt_dir_scan() of the error but continue to process
3604df
  other entries in the queue, decrementing the qlen as and when we dequeue
3604df
  elements, and ending only when the queue is empty.
3604df
3604df
- If the readdir loop in syncop_mt_dir_scan() gets an error from the
3604df
  worker, stop the readdir+queueing of further entries.
3604df
3604df
Change-Id: Id132a9795b1322b7e601f5de8dfd5528d664cef9
3604df
BUG: 1403120
3604df
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
3604df
Reviewed-on: https://code.engineering.redhat.com/gerrit/92588
3604df
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
3604df
Tested-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
3604df
---
3604df
 libglusterfs/src/syncop-utils.c                  | 15 +++++++-----
3604df
 tests/bugs/core/bug-1402841.t-mt-dir-scan-race.t | 31 ++++++++++++++++++++++++
3604df
 2 files changed, 40 insertions(+), 6 deletions(-)
3604df
 create mode 100755 tests/bugs/core/bug-1402841.t-mt-dir-scan-race.t
3604df
3604df
diff --git a/libglusterfs/src/syncop-utils.c b/libglusterfs/src/syncop-utils.c
3604df
index 8c25dd1..696e36b 100644
3604df
--- a/libglusterfs/src/syncop-utils.c
3604df
+++ b/libglusterfs/src/syncop-utils.c
3604df
@@ -266,9 +266,10 @@ _dir_scan_job_fn (void *data)
3604df
                 entry = NULL;
3604df
                 pthread_mutex_lock (scan_data->mut);
3604df
                 {
3604df
-                        if (ret || list_empty (&scan_data->q->list)) {
3604df
-                                (*scan_data->jobs_running)--;
3604df
+                        if (ret)
3604df
                                 *scan_data->retval |= ret;
3604df
+                        if (list_empty (&scan_data->q->list)) {
3604df
+                                (*scan_data->jobs_running)--;
3604df
                                 pthread_cond_broadcast (scan_data->cond);
3604df
                         } else {
3604df
                                 entry = list_first_entry (&scan_data->q->list,
3604df
@@ -406,10 +407,13 @@ syncop_mt_dir_scan (call_frame_t *frame, xlator_t *subvol, loc_t *loc, int pid,
3604df
                                 ret = fn (subvol, entry, loc, data);
3604df
                                 gf_dirent_entry_free (entry);
3604df
                                 if (ret)
3604df
-                                        break;
3604df
+                                        goto out;
3604df
                                 continue;
3604df
                         }
3604df
 
3604df
+                        if (retval) /*Any jobs failed?*/
3604df
+                                goto out;
3604df
+
3604df
                         pthread_mutex_lock (&mut);
3604df
                         {
3604df
                                 while (qlen == max_qlen)
3604df
@@ -423,8 +427,7 @@ syncop_mt_dir_scan (call_frame_t *frame, xlator_t *subvol, loc_t *loc, int pid,
3604df
                                 }
3604df
                         }
3604df
                         pthread_mutex_unlock (&mut);
3604df
-                        if (retval) /*Any jobs failed?*/
3604df
-                                break;
3604df
+
3604df
 
3604df
                         if (!entry)
3604df
                                 continue;
3604df
@@ -433,7 +436,7 @@ syncop_mt_dir_scan (call_frame_t *frame, xlator_t *subvol, loc_t *loc, int pid,
3604df
                                                   &retval, &mut, &cond,
3604df
                                                 &jobs_running, &qlen, fn, data);
3604df
                         if (ret)
3604df
-                                break;
3604df
+                                goto out;
3604df
                 }
3604df
         }
3604df
 
3604df
diff --git a/tests/bugs/core/bug-1402841.t-mt-dir-scan-race.t b/tests/bugs/core/bug-1402841.t-mt-dir-scan-race.t
3604df
new file mode 100755
3604df
index 0000000..e31c810
3604df
--- /dev/null
3604df
+++ b/tests/bugs/core/bug-1402841.t-mt-dir-scan-race.t
3604df
@@ -0,0 +1,31 @@
3604df
+#!/bin/bash
3604df
+. $(dirname $0)/../../include.rc
3604df
+. $(dirname $0)/../../volume.rc
3604df
+cleanup;
3604df
+
3604df
+TEST glusterd
3604df
+TEST pidof glusterd
3604df
+TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1}
3604df
+TEST $CLI volume set $V0 self-heal-daemon off
3604df
+TEST $CLI volume set $V0 cluster.shd-wait-qlength 100
3604df
+TEST $CLI volume start $V0
3604df
+
3604df
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0;
3604df
+touch $M0/file{1..200}
3604df
+
3604df
+TEST kill_brick $V0 $H0 $B0/${V0}1
3604df
+for i in {1..200}; do echo hello>$M0/file$i; done
3604df
+TEST $CLI volume start $V0 force
3604df
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
3604df
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
3604df
+
3604df
+EXPECT "200" get_pending_heal_count $V0
3604df
+TEST $CLI volume set $V0 self-heal-daemon on
3604df
+TEST $CLI volume heal $V0
3604df
+TEST $CLI volume set $V0 self-heal-daemon off
3604df
+EXPECT_NOT "^0$" get_pending_heal_count $V0
3604df
+TEST $CLI volume set $V0 self-heal-daemon on
3604df
+TEST $CLI volume heal $V0
3604df
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
3604df
+TEST umount $M0
3604df
+cleanup;
3604df
-- 
3604df
2.9.3
3604df