9f5ccc
From 66600fb55522d405a68d7340a5680a2633c4237e Mon Sep 17 00:00:00 2001
9f5ccc
From: Xavi Hernandez <xhernandez@redhat.com>
9f5ccc
Date: Thu, 30 Apr 2020 11:19:01 +0200
9f5ccc
Subject: [PATCH 377/379] syncop: improve scaling and implement more tools
9f5ccc
9f5ccc
The current scaling of the syncop thread pool is not working properly
9f5ccc
and can leave some tasks in the run queue more time than necessary
9f5ccc
when the maximum number of threads is not reached.
9f5ccc
9f5ccc
This patch provides a better scaling condition to react faster to
9f5ccc
pending work.
9f5ccc
9f5ccc
Condition variables and sleep in the context of a synctask have also
9f5ccc
been implemented. Their purpose is to replace regular condition
9f5ccc
variables and sleeps that block synctask threads and prevent other
9f5ccc
tasks to be executed.
9f5ccc
9f5ccc
The new features have been applied to several places in glusterd.
9f5ccc
9f5ccc
upstream patch: https://review.gluster.org/#/c/glusterfs/+/24396/
9f5ccc
9f5ccc
> Change-Id: Ic50b7c73c104f9e41f08101a357d30b95efccfbf
9f5ccc
> Fixes: #1116
9f5ccc
> Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
9f5ccc
9f5ccc
Change-Id: Ic50b7c73c104f9e41f08101a357d30b95efccfbf
9f5ccc
BUG: 1810516
9f5ccc
Signed-off-by: Sanju Rakonde <srakonde@redhta.com>
9f5ccc
Reviewed-on: https://code.engineering.redhat.com/gerrit/200409
9f5ccc
Tested-by: Sanju Rakonde <srakonde@redhat.com>
9f5ccc
Tested-by: RHGS Build Bot <nigelb@redhat.com>
9f5ccc
Reviewed-by: Xavi Hernandez Juan <xhernandez@redhat.com>
9f5ccc
---
9f5ccc
 libglusterfs/src/glusterfs/syncop.h                |  52 +++-
9f5ccc
 libglusterfs/src/libglusterfs.sym                  |   7 +
9f5ccc
 libglusterfs/src/syncop.c                          | 306 ++++++++++++++++-----
9f5ccc
 xlators/cluster/dht/src/dht-rebalance.c            |   2 +-
9f5ccc
 xlators/mgmt/glusterd/src/glusterd-op-sm.c         |   9 +-
9f5ccc
 xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c     |   2 +-
9f5ccc
 .../mgmt/glusterd/src/glusterd-snapshot-utils.c    |   5 +-
9f5ccc
 xlators/mgmt/glusterd/src/glusterd-syncop.h        |   2 +-
9f5ccc
 xlators/mgmt/glusterd/src/glusterd-utils.c         |  29 +-
9f5ccc
 xlators/mgmt/glusterd/src/glusterd.c               |   2 +
9f5ccc
 xlators/mgmt/glusterd/src/glusterd.h               |   2 +
9f5ccc
 11 files changed, 317 insertions(+), 101 deletions(-)
9f5ccc
9f5ccc
diff --git a/libglusterfs/src/glusterfs/syncop.h b/libglusterfs/src/glusterfs/syncop.h
9f5ccc
index e0f1017..3011b4c 100644
9f5ccc
--- a/libglusterfs/src/glusterfs/syncop.h
9f5ccc
+++ b/libglusterfs/src/glusterfs/syncop.h
9f5ccc
@@ -15,6 +15,7 @@
9f5ccc
 #include <sys/time.h>
9f5ccc
 #include <pthread.h>
9f5ccc
 #include <ucontext.h>
9f5ccc
+#include "glusterfs/timer.h"
9f5ccc
 
9f5ccc
 #define SYNCENV_PROC_MAX 16
9f5ccc
 #define SYNCENV_PROC_MIN 2
9f5ccc
@@ -32,6 +33,7 @@
9f5ccc
 struct synctask;
9f5ccc
 struct syncproc;
9f5ccc
 struct syncenv;
9f5ccc
+struct synccond;
9f5ccc
 
9f5ccc
 typedef int (*synctask_cbk_t)(int ret, call_frame_t *frame, void *opaque);
9f5ccc
 
9f5ccc
@@ -55,9 +57,12 @@ struct synctask {
9f5ccc
     call_frame_t *opframe;
9f5ccc
     synctask_cbk_t synccbk;
9f5ccc
     synctask_fn_t syncfn;
9f5ccc
-    synctask_state_t state;
9f5ccc
+    struct timespec *delta;
9f5ccc
+    gf_timer_t *timer;
9f5ccc
+    struct synccond *synccond;
9f5ccc
     void *opaque;
9f5ccc
     void *stack;
9f5ccc
+    synctask_state_t state;
9f5ccc
     int woken;
9f5ccc
     int slept;
9f5ccc
     int ret;
9f5ccc
@@ -85,19 +90,21 @@ struct syncproc {
9f5ccc
 /* hosts the scheduler thread and framework for executing synctasks */
9f5ccc
 struct syncenv {
9f5ccc
     struct syncproc proc[SYNCENV_PROC_MAX];
9f5ccc
-    int procs;
9f5ccc
+
9f5ccc
+    pthread_mutex_t mutex;
9f5ccc
+    pthread_cond_t cond;
9f5ccc
 
9f5ccc
     struct list_head runq;
9f5ccc
-    int runcount;
9f5ccc
     struct list_head waitq;
9f5ccc
-    int waitcount;
9f5ccc
+
9f5ccc
+    int procs;
9f5ccc
+    int procs_idle;
9f5ccc
+
9f5ccc
+    int runcount;
9f5ccc
 
9f5ccc
     int procmin;
9f5ccc
     int procmax;
9f5ccc
 
9f5ccc
-    pthread_mutex_t mutex;
9f5ccc
-    pthread_cond_t cond;
9f5ccc
-
9f5ccc
     size_t stacksize;
9f5ccc
 
9f5ccc
     int destroy; /* FLAG to mark syncenv is in destroy mode
9f5ccc
@@ -123,6 +130,13 @@ struct synclock {
9f5ccc
 };
9f5ccc
 typedef struct synclock synclock_t;
9f5ccc
 
9f5ccc
+struct synccond {
9f5ccc
+    pthread_mutex_t pmutex;
9f5ccc
+    pthread_cond_t pcond;
9f5ccc
+    struct list_head waitq;
9f5ccc
+};
9f5ccc
+typedef struct synccond synccond_t;
9f5ccc
+
9f5ccc
 struct syncbarrier {
9f5ccc
     gf_boolean_t initialized; /*Set on successful initialization*/
9f5ccc
     pthread_mutex_t guard;    /* guard the remaining members, pair @cond */
9f5ccc
@@ -219,7 +233,7 @@ struct syncopctx {
9f5ccc
 #define __yield(args)                                                          \
9f5ccc
     do {                                                                       \
9f5ccc
         if (args->task) {                                                      \
9f5ccc
-            synctask_yield(args->task);                                        \
9f5ccc
+            synctask_yield(args->task, NULL);                                  \
9f5ccc
         } else {                                                               \
9f5ccc
             pthread_mutex_lock(&args->mutex);                                  \
9f5ccc
             {                                                                  \
9f5ccc
@@ -307,7 +321,9 @@ synctask_join(struct synctask *task);
9f5ccc
 void
9f5ccc
 synctask_wake(struct synctask *task);
9f5ccc
 void
9f5ccc
-synctask_yield(struct synctask *task);
9f5ccc
+synctask_yield(struct synctask *task, struct timespec *delta);
9f5ccc
+void
9f5ccc
+synctask_sleep(int32_t secs);
9f5ccc
 void
9f5ccc
 synctask_waitfor(struct synctask *task, int count);
9f5ccc
 
9f5ccc
@@ -405,6 +421,24 @@ synclock_trylock(synclock_t *lock);
9f5ccc
 int
9f5ccc
 synclock_unlock(synclock_t *lock);
9f5ccc
 
9f5ccc
+int32_t
9f5ccc
+synccond_init(synccond_t *cond);
9f5ccc
+
9f5ccc
+void
9f5ccc
+synccond_destroy(synccond_t *cond);
9f5ccc
+
9f5ccc
+int
9f5ccc
+synccond_wait(synccond_t *cond, synclock_t *lock);
9f5ccc
+
9f5ccc
+int
9f5ccc
+synccond_timedwait(synccond_t *cond, synclock_t *lock, struct timespec *delta);
9f5ccc
+
9f5ccc
+void
9f5ccc
+synccond_signal(synccond_t *cond);
9f5ccc
+
9f5ccc
+void
9f5ccc
+synccond_broadcast(synccond_t *cond);
9f5ccc
+
9f5ccc
 int
9f5ccc
 syncbarrier_init(syncbarrier_t *barrier);
9f5ccc
 int
9f5ccc
diff --git a/libglusterfs/src/libglusterfs.sym b/libglusterfs/src/libglusterfs.sym
9f5ccc
index 467a1b7..5a721e0 100644
9f5ccc
--- a/libglusterfs/src/libglusterfs.sym
9f5ccc
+++ b/libglusterfs/src/libglusterfs.sym
9f5ccc
@@ -938,6 +938,12 @@ syncbarrier_destroy
9f5ccc
 syncbarrier_init
9f5ccc
 syncbarrier_wait
9f5ccc
 syncbarrier_wake
9f5ccc
+synccond_init
9f5ccc
+synccond_destroy
9f5ccc
+synccond_wait
9f5ccc
+synccond_timedwait
9f5ccc
+synccond_signal
9f5ccc
+synccond_broadcast
9f5ccc
 syncenv_destroy
9f5ccc
 syncenv_new
9f5ccc
 synclock_destroy
9f5ccc
@@ -1015,6 +1021,7 @@ synctask_new
9f5ccc
 synctask_new1
9f5ccc
 synctask_set
9f5ccc
 synctask_setid
9f5ccc
+synctask_sleep
9f5ccc
 synctask_wake
9f5ccc
 synctask_yield
9f5ccc
 sys_access
9f5ccc
diff --git a/libglusterfs/src/syncop.c b/libglusterfs/src/syncop.c
9f5ccc
index 693970f..71d37b7 100644
9f5ccc
--- a/libglusterfs/src/syncop.c
9f5ccc
+++ b/libglusterfs/src/syncop.c
9f5ccc
@@ -154,10 +154,14 @@ out:
9f5ccc
     return ret;
9f5ccc
 }
9f5ccc
 
9f5ccc
+void *
9f5ccc
+syncenv_processor(void *thdata);
9f5ccc
+
9f5ccc
 static void
9f5ccc
 __run(struct synctask *task)
9f5ccc
 {
9f5ccc
     struct syncenv *env = NULL;
9f5ccc
+    int32_t total, ret, i;
9f5ccc
 
9f5ccc
     env = task->env;
9f5ccc
 
9f5ccc
@@ -173,7 +177,6 @@ __run(struct synctask *task)
9f5ccc
             env->runcount--;
9f5ccc
             break;
9f5ccc
         case SYNCTASK_WAIT:
9f5ccc
-            env->waitcount--;
9f5ccc
             break;
9f5ccc
         case SYNCTASK_DONE:
9f5ccc
             gf_msg(task->xl->name, GF_LOG_WARNING, 0, LG_MSG_COMPLETED_TASK,
9f5ccc
@@ -187,8 +190,27 @@ __run(struct synctask *task)
9f5ccc
     }
9f5ccc
 
9f5ccc
     list_add_tail(&task->all_tasks, &env->runq);
9f5ccc
-    env->runcount++;
9f5ccc
     task->state = SYNCTASK_RUN;
9f5ccc
+
9f5ccc
+    env->runcount++;
9f5ccc
+
9f5ccc
+    total = env->procs + env->runcount - env->procs_idle;
9f5ccc
+    if (total > env->procmax) {
9f5ccc
+        total = env->procmax;
9f5ccc
+    }
9f5ccc
+    if (total > env->procs) {
9f5ccc
+        for (i = 0; i < env->procmax; i++) {
9f5ccc
+            if (env->proc[i].env == NULL) {
9f5ccc
+                env->proc[i].env = env;
9f5ccc
+                ret = gf_thread_create(&env->proc[i].processor, NULL,
9f5ccc
+                                       syncenv_processor, &env->proc[i],
9f5ccc
+                                       "sproc%d", i);
9f5ccc
+                if ((ret < 0) || (++env->procs >= total)) {
9f5ccc
+                    break;
9f5ccc
+                }
9f5ccc
+            }
9f5ccc
+        }
9f5ccc
+    }
9f5ccc
 }
9f5ccc
 
9f5ccc
 static void
9f5ccc
@@ -210,7 +232,6 @@ __wait(struct synctask *task)
9f5ccc
             gf_msg(task->xl->name, GF_LOG_WARNING, 0, LG_MSG_REWAITING_TASK,
9f5ccc
                    "re-waiting already waiting "
9f5ccc
                    "task");
9f5ccc
-            env->waitcount--;
9f5ccc
             break;
9f5ccc
         case SYNCTASK_DONE:
9f5ccc
             gf_msg(task->xl->name, GF_LOG_WARNING, 0, LG_MSG_COMPLETED_TASK,
9f5ccc
@@ -223,12 +244,11 @@ __wait(struct synctask *task)
9f5ccc
     }
9f5ccc
 
9f5ccc
     list_add_tail(&task->all_tasks, &env->waitq);
9f5ccc
-    env->waitcount++;
9f5ccc
     task->state = SYNCTASK_WAIT;
9f5ccc
 }
9f5ccc
 
9f5ccc
 void
9f5ccc
-synctask_yield(struct synctask *task)
9f5ccc
+synctask_yield(struct synctask *task, struct timespec *delta)
9f5ccc
 {
9f5ccc
     xlator_t *oldTHIS = THIS;
9f5ccc
 
9f5ccc
@@ -237,6 +257,8 @@ synctask_yield(struct synctask *task)
9f5ccc
     task->proc->sched.uc_flags &= ~_UC_TLSBASE;
9f5ccc
 #endif
9f5ccc
 
9f5ccc
+    task->delta = delta;
9f5ccc
+
9f5ccc
     if (task->state != SYNCTASK_DONE) {
9f5ccc
         task->state = SYNCTASK_SUSPEND;
9f5ccc
     }
9f5ccc
@@ -249,6 +271,35 @@ synctask_yield(struct synctask *task)
9f5ccc
 }
9f5ccc
 
9f5ccc
 void
9f5ccc
+synctask_sleep(int32_t secs)
9f5ccc
+{
9f5ccc
+    struct timespec delta;
9f5ccc
+    struct synctask *task;
9f5ccc
+
9f5ccc
+    task = synctask_get();
9f5ccc
+
9f5ccc
+    if (task == NULL) {
9f5ccc
+        sleep(secs);
9f5ccc
+    } else {
9f5ccc
+        delta.tv_sec = secs;
9f5ccc
+        delta.tv_nsec = 0;
9f5ccc
+
9f5ccc
+        synctask_yield(task, &delta);
9f5ccc
+    }
9f5ccc
+}
9f5ccc
+
9f5ccc
+static void
9f5ccc
+__synctask_wake(struct synctask *task)
9f5ccc
+{
9f5ccc
+    task->woken = 1;
9f5ccc
+
9f5ccc
+    if (task->slept)
9f5ccc
+        __run(task);
9f5ccc
+
9f5ccc
+    pthread_cond_broadcast(&task->env->cond);
9f5ccc
+}
9f5ccc
+
9f5ccc
+void
9f5ccc
 synctask_wake(struct synctask *task)
9f5ccc
 {
9f5ccc
     struct syncenv *env = NULL;
9f5ccc
@@ -257,13 +308,18 @@ synctask_wake(struct synctask *task)
9f5ccc
 
9f5ccc
     pthread_mutex_lock(&env->mutex);
9f5ccc
     {
9f5ccc
-        task->woken = 1;
9f5ccc
+        if (task->timer != NULL) {
9f5ccc
+            if (gf_timer_call_cancel(task->xl->ctx, task->timer) != 0) {
9f5ccc
+                goto unlock;
9f5ccc
+            }
9f5ccc
 
9f5ccc
-        if (task->slept)
9f5ccc
-            __run(task);
9f5ccc
+            task->timer = NULL;
9f5ccc
+            task->synccond = NULL;
9f5ccc
+        }
9f5ccc
 
9f5ccc
-        pthread_cond_broadcast(&env->cond);
9f5ccc
+        __synctask_wake(task);
9f5ccc
     }
9f5ccc
+unlock:
9f5ccc
     pthread_mutex_unlock(&env->mutex);
9f5ccc
 }
9f5ccc
 
9f5ccc
@@ -282,7 +338,7 @@ synctask_wrap(void)
9f5ccc
 
9f5ccc
     task->state = SYNCTASK_DONE;
9f5ccc
 
9f5ccc
-    synctask_yield(task);
9f5ccc
+    synctask_yield(task, NULL);
9f5ccc
 }
9f5ccc
 
9f5ccc
 void
9f5ccc
@@ -422,11 +478,6 @@ synctask_create(struct syncenv *env, size_t stacksize, synctask_fn_t fn,
9f5ccc
     }
9f5ccc
 
9f5ccc
     synctask_wake(newtask);
9f5ccc
-    /*
9f5ccc
-     * Make sure someone's there to execute anything we just put on the
9f5ccc
-     * run queue.
9f5ccc
-     */
9f5ccc
-    syncenv_scale(env);
9f5ccc
 
9f5ccc
     return newtask;
9f5ccc
 err:
9f5ccc
@@ -520,8 +571,12 @@ syncenv_task(struct syncproc *proc)
9f5ccc
                 goto unlock;
9f5ccc
             }
9f5ccc
 
9f5ccc
+            env->procs_idle++;
9f5ccc
+
9f5ccc
             sleep_till.tv_sec = time(NULL) + SYNCPROC_IDLE_TIME;
9f5ccc
             ret = pthread_cond_timedwait(&env->cond, &env->mutex, &sleep_till);
9f5ccc
+
9f5ccc
+            env->procs_idle--;
9f5ccc
         }
9f5ccc
 
9f5ccc
         task = list_entry(env->runq.next, struct synctask, all_tasks);
9f5ccc
@@ -540,6 +595,34 @@ unlock:
9f5ccc
     return task;
9f5ccc
 }
9f5ccc
 
9f5ccc
+static void
9f5ccc
+synctask_timer(void *data)
9f5ccc
+{
9f5ccc
+    struct synctask *task = data;
9f5ccc
+    struct synccond *cond;
9f5ccc
+
9f5ccc
+    cond = task->synccond;
9f5ccc
+    if (cond != NULL) {
9f5ccc
+        pthread_mutex_lock(&cond->pmutex);
9f5ccc
+
9f5ccc
+        list_del_init(&task->waitq);
9f5ccc
+        task->synccond = NULL;
9f5ccc
+
9f5ccc
+        pthread_mutex_unlock(&cond->pmutex);
9f5ccc
+
9f5ccc
+        task->ret = -ETIMEDOUT;
9f5ccc
+    }
9f5ccc
+
9f5ccc
+    pthread_mutex_lock(&task->env->mutex);
9f5ccc
+
9f5ccc
+    gf_timer_call_cancel(task->xl->ctx, task->timer);
9f5ccc
+    task->timer = NULL;
9f5ccc
+
9f5ccc
+    __synctask_wake(task);
9f5ccc
+
9f5ccc
+    pthread_mutex_unlock(&task->env->mutex);
9f5ccc
+}
9f5ccc
+
9f5ccc
 void
9f5ccc
 synctask_switchto(struct synctask *task)
9f5ccc
 {
9f5ccc
@@ -572,7 +655,14 @@ synctask_switchto(struct synctask *task)
9f5ccc
         } else {
9f5ccc
             task->slept = 1;
9f5ccc
             __wait(task);
9f5ccc
+
9f5ccc
+            if (task->delta != NULL) {
9f5ccc
+                task->timer = gf_timer_call_after(task->xl->ctx, *task->delta,
9f5ccc
+                                                  synctask_timer, task);
9f5ccc
+            }
9f5ccc
         }
9f5ccc
+
9f5ccc
+        task->delta = NULL;
9f5ccc
     }
9f5ccc
     pthread_mutex_unlock(&env->mutex);
9f5ccc
 }
9f5ccc
@@ -580,65 +670,18 @@ synctask_switchto(struct synctask *task)
9f5ccc
 void *
9f5ccc
 syncenv_processor(void *thdata)
9f5ccc
 {
9f5ccc
-    struct syncenv *env = NULL;
9f5ccc
     struct syncproc *proc = NULL;
9f5ccc
     struct synctask *task = NULL;
9f5ccc
 
9f5ccc
     proc = thdata;
9f5ccc
-    env = proc->env;
9f5ccc
-
9f5ccc
-    for (;;) {
9f5ccc
-        task = syncenv_task(proc);
9f5ccc
-        if (!task)
9f5ccc
-            break;
9f5ccc
 
9f5ccc
+    while ((task = syncenv_task(proc)) != NULL) {
9f5ccc
         synctask_switchto(task);
9f5ccc
-
9f5ccc
-        syncenv_scale(env);
9f5ccc
     }
9f5ccc
 
9f5ccc
     return NULL;
9f5ccc
 }
9f5ccc
 
9f5ccc
-void
9f5ccc
-syncenv_scale(struct syncenv *env)
9f5ccc
-{
9f5ccc
-    int diff = 0;
9f5ccc
-    int scale = 0;
9f5ccc
-    int i = 0;
9f5ccc
-    int ret = 0;
9f5ccc
-
9f5ccc
-    pthread_mutex_lock(&env->mutex);
9f5ccc
-    {
9f5ccc
-        if (env->procs > env->runcount)
9f5ccc
-            goto unlock;
9f5ccc
-
9f5ccc
-        scale = env->runcount;
9f5ccc
-        if (scale > env->procmax)
9f5ccc
-            scale = env->procmax;
9f5ccc
-        if (scale > env->procs)
9f5ccc
-            diff = scale - env->procs;
9f5ccc
-        while (diff) {
9f5ccc
-            diff--;
9f5ccc
-            for (; (i < env->procmax); i++) {
9f5ccc
-                if (env->proc[i].processor == 0)
9f5ccc
-                    break;
9f5ccc
-            }
9f5ccc
-
9f5ccc
-            env->proc[i].env = env;
9f5ccc
-            ret = gf_thread_create(&env->proc[i].processor, NULL,
9f5ccc
-                                   syncenv_processor, &env->proc[i],
9f5ccc
-                                   "sproc%03hx", env->procs & 0x3ff);
9f5ccc
-            if (ret)
9f5ccc
-                break;
9f5ccc
-            env->procs++;
9f5ccc
-            i++;
9f5ccc
-        }
9f5ccc
-    }
9f5ccc
-unlock:
9f5ccc
-    pthread_mutex_unlock(&env->mutex);
9f5ccc
-}
9f5ccc
-
9f5ccc
 /* The syncenv threads are cleaned up in this routine.
9f5ccc
  */
9f5ccc
 void
9f5ccc
@@ -715,12 +758,13 @@ syncenv_new(size_t stacksize, int procmin, int procmax)
9f5ccc
         newenv->stacksize = stacksize;
9f5ccc
     newenv->procmin = procmin;
9f5ccc
     newenv->procmax = procmax;
9f5ccc
+    newenv->procs_idle = 0;
9f5ccc
 
9f5ccc
     for (i = 0; i < newenv->procmin; i++) {
9f5ccc
         newenv->proc[i].env = newenv;
9f5ccc
         ret = gf_thread_create(&newenv->proc[i].processor, NULL,
9f5ccc
                                syncenv_processor, &newenv->proc[i], "sproc%d",
9f5ccc
-                               newenv->procs);
9f5ccc
+                               i);
9f5ccc
         if (ret)
9f5ccc
             break;
9f5ccc
         newenv->procs++;
9f5ccc
@@ -810,7 +854,7 @@ __synclock_lock(struct synclock *lock)
9f5ccc
             task->woken = 0;
9f5ccc
             list_add_tail(&task->waitq, &lock->waitq);
9f5ccc
             pthread_mutex_unlock(&lock->guard);
9f5ccc
-            synctask_yield(task);
9f5ccc
+            synctask_yield(task, NULL);
9f5ccc
             /* task is removed from waitq in unlock,
9f5ccc
              * under lock->guard.*/
9f5ccc
             pthread_mutex_lock(&lock->guard);
9f5ccc
@@ -963,6 +1007,136 @@ synclock_unlock(synclock_t *lock)
9f5ccc
     return ret;
9f5ccc
 }
9f5ccc
 
9f5ccc
+/* Condition variables */
9f5ccc
+
9f5ccc
+int32_t
9f5ccc
+synccond_init(synccond_t *cond)
9f5ccc
+{
9f5ccc
+    int32_t ret;
9f5ccc
+
9f5ccc
+    INIT_LIST_HEAD(&cond->waitq);
9f5ccc
+
9f5ccc
+    ret = pthread_mutex_init(&cond->pmutex, NULL);
9f5ccc
+    if (ret != 0) {
9f5ccc
+        return -ret;
9f5ccc
+    }
9f5ccc
+
9f5ccc
+    ret = pthread_cond_init(&cond->pcond, NULL);
9f5ccc
+    if (ret != 0) {
9f5ccc
+        pthread_mutex_destroy(&cond->pmutex);
9f5ccc
+    }
9f5ccc
+
9f5ccc
+    return -ret;
9f5ccc
+}
9f5ccc
+
9f5ccc
+void
9f5ccc
+synccond_destroy(synccond_t *cond)
9f5ccc
+{
9f5ccc
+    pthread_cond_destroy(&cond->pcond);
9f5ccc
+    pthread_mutex_destroy(&cond->pmutex);
9f5ccc
+}
9f5ccc
+
9f5ccc
+int
9f5ccc
+synccond_timedwait(synccond_t *cond, synclock_t *lock, struct timespec *delta)
9f5ccc
+{
9f5ccc
+    struct timespec now;
9f5ccc
+    struct synctask *task = NULL;
9f5ccc
+    int ret;
9f5ccc
+
9f5ccc
+    task = synctask_get();
9f5ccc
+
9f5ccc
+    if (task == NULL) {
9f5ccc
+        if (delta != NULL) {
9f5ccc
+            timespec_now_realtime(&now;;
9f5ccc
+            timespec_adjust_delta(&now, *delta);
9f5ccc
+        }
9f5ccc
+
9f5ccc
+        pthread_mutex_lock(&cond->pmutex);
9f5ccc
+
9f5ccc
+        if (delta == NULL) {
9f5ccc
+            ret = -pthread_cond_wait(&cond->pcond, &cond->pmutex);
9f5ccc
+        } else {
9f5ccc
+            ret = -pthread_cond_timedwait(&cond->pcond, &cond->pmutex, &now;;
9f5ccc
+        }
9f5ccc
+    } else {
9f5ccc
+        pthread_mutex_lock(&cond->pmutex);
9f5ccc
+
9f5ccc
+        list_add_tail(&task->waitq, &cond->waitq);
9f5ccc
+        task->synccond = cond;
9f5ccc
+
9f5ccc
+        ret = synclock_unlock(lock);
9f5ccc
+        if (ret == 0) {
9f5ccc
+            pthread_mutex_unlock(&cond->pmutex);
9f5ccc
+
9f5ccc
+            synctask_yield(task, delta);
9f5ccc
+
9f5ccc
+            ret = synclock_lock(lock);
9f5ccc
+            if (ret == 0) {
9f5ccc
+                ret = task->ret;
9f5ccc
+            }
9f5ccc
+            task->ret = 0;
9f5ccc
+
9f5ccc
+            return ret;
9f5ccc
+        }
9f5ccc
+
9f5ccc
+        list_del_init(&task->waitq);
9f5ccc
+    }
9f5ccc
+
9f5ccc
+    pthread_mutex_unlock(&cond->pmutex);
9f5ccc
+
9f5ccc
+    return ret;
9f5ccc
+}
9f5ccc
+
9f5ccc
+int
9f5ccc
+synccond_wait(synccond_t *cond, synclock_t *lock)
9f5ccc
+{
9f5ccc
+    return synccond_timedwait(cond, lock, NULL);
9f5ccc
+}
9f5ccc
+
9f5ccc
+void
9f5ccc
+synccond_signal(synccond_t *cond)
9f5ccc
+{
9f5ccc
+    struct synctask *task;
9f5ccc
+
9f5ccc
+    pthread_mutex_lock(&cond->pmutex);
9f5ccc
+
9f5ccc
+    if (!list_empty(&cond->waitq)) {
9f5ccc
+        task = list_first_entry(&cond->waitq, struct synctask, waitq);
9f5ccc
+        list_del_init(&task->waitq);
9f5ccc
+
9f5ccc
+        pthread_mutex_unlock(&cond->pmutex);
9f5ccc
+
9f5ccc
+        synctask_wake(task);
9f5ccc
+    } else {
9f5ccc
+        pthread_cond_signal(&cond->pcond);
9f5ccc
+
9f5ccc
+        pthread_mutex_unlock(&cond->pmutex);
9f5ccc
+    }
9f5ccc
+}
9f5ccc
+
9f5ccc
+void
9f5ccc
+synccond_broadcast(synccond_t *cond)
9f5ccc
+{
9f5ccc
+    struct list_head list;
9f5ccc
+    struct synctask *task;
9f5ccc
+
9f5ccc
+    INIT_LIST_HEAD(&list);
9f5ccc
+
9f5ccc
+    pthread_mutex_lock(&cond->pmutex);
9f5ccc
+
9f5ccc
+    list_splice_init(&cond->waitq, &list);
9f5ccc
+    pthread_cond_broadcast(&cond->pcond);
9f5ccc
+
9f5ccc
+    pthread_mutex_unlock(&cond->pmutex);
9f5ccc
+
9f5ccc
+    while (!list_empty(&list)) {
9f5ccc
+        task = list_first_entry(&list, struct synctask, waitq);
9f5ccc
+        list_del_init(&task->waitq);
9f5ccc
+
9f5ccc
+        synctask_wake(task);
9f5ccc
+    }
9f5ccc
+}
9f5ccc
+
9f5ccc
 /* Barriers */
9f5ccc
 
9f5ccc
 int
9f5ccc
@@ -1032,7 +1206,7 @@ __syncbarrier_wait(struct syncbarrier *barrier, int waitfor)
9f5ccc
             /* called within a synctask */
9f5ccc
             list_add_tail(&task->waitq, &barrier->waitq);
9f5ccc
             pthread_mutex_unlock(&barrier->guard);
9f5ccc
-            synctask_yield(task);
9f5ccc
+            synctask_yield(task, NULL);
9f5ccc
             pthread_mutex_lock(&barrier->guard);
9f5ccc
         } else {
9f5ccc
             /* called by a non-synctask */
9f5ccc
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
9f5ccc
index c692119..957deaa 100644
9f5ccc
--- a/xlators/cluster/dht/src/dht-rebalance.c
9f5ccc
+++ b/xlators/cluster/dht/src/dht-rebalance.c
9f5ccc
@@ -5224,7 +5224,7 @@ gf_defrag_pause_tier(xlator_t *this, gf_defrag_info_t *defrag)
9f5ccc
     defrag->tier_conf.pause_timer = gf_timer_call_after(
9f5ccc
         this->ctx, delta, gf_defrag_pause_tier_timeout, this);
9f5ccc
 
9f5ccc
-    synctask_yield(defrag->tier_conf.pause_synctask);
9f5ccc
+    synctask_yield(defrag->tier_conf.pause_synctask, NULL);
9f5ccc
 
9f5ccc
     if (gf_defrag_get_pause_state(&defrag->tier_conf) == TIER_PAUSED)
9f5ccc
         goto out;
9f5ccc
diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
9f5ccc
index 0d29de2..6475611 100644
9f5ccc
--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c
9f5ccc
+++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
9f5ccc
@@ -6076,13 +6076,8 @@ glusterd_op_stage_validate(glusterd_op_t op, dict_t *dict, char **op_errstr,
9f5ccc
 static void
9f5ccc
 glusterd_wait_for_blockers(glusterd_conf_t *priv)
9f5ccc
 {
9f5ccc
-    uint64_t blockers = GF_ATOMIC_GET(priv->blockers);
9f5ccc
-
9f5ccc
-    while (blockers) {
9f5ccc
-        synclock_unlock(&priv->big_lock);
9f5ccc
-        sleep(1);
9f5ccc
-        blockers = GF_ATOMIC_GET(priv->blockers);
9f5ccc
-        synclock_lock(&priv->big_lock);
9f5ccc
+    while (GF_ATOMIC_GET(priv->blockers)) {
9f5ccc
+        synccond_wait(&priv->cond_blockers, &priv->big_lock);
9f5ccc
     }
9f5ccc
 }
9f5ccc
 
9f5ccc
diff --git a/xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c b/xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c
9f5ccc
index 36018a0..f55a5fd 100644
9f5ccc
--- a/xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c
9f5ccc
+++ b/xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c
9f5ccc
@@ -112,7 +112,7 @@ glusterd_proc_stop(glusterd_proc_t *proc, int sig, int flags)
9f5ccc
         goto out;
9f5ccc
 
9f5ccc
     synclock_unlock(&conf->big_lock);
9f5ccc
-    sleep(1);
9f5ccc
+    synctask_sleep(1);
9f5ccc
     synclock_lock(&conf->big_lock);
9f5ccc
     if (gf_is_service_running(proc->pidfile, &pid)) {
9f5ccc
         ret = kill(pid, SIGKILL);
9f5ccc
diff --git a/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c b/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c
9f5ccc
index d225854..386eed2 100644
9f5ccc
--- a/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c
9f5ccc
+++ b/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c
9f5ccc
@@ -1961,9 +1961,7 @@ glusterd_update_snaps_synctask(void *opaque)
9f5ccc
     synclock_lock(&conf->big_lock);
9f5ccc
 
9f5ccc
     while (conf->restart_bricks) {
9f5ccc
-        synclock_unlock(&conf->big_lock);
9f5ccc
-        sleep(2);
9f5ccc
-        synclock_lock(&conf->big_lock);
9f5ccc
+        synccond_wait(&conf->cond_restart_bricks, &conf->big_lock);
9f5ccc
     }
9f5ccc
     conf->restart_bricks = _gf_true;
9f5ccc
 
9f5ccc
@@ -2070,6 +2068,7 @@ out:
9f5ccc
     if (dict)
9f5ccc
         dict_unref(dict);
9f5ccc
     conf->restart_bricks = _gf_false;
9f5ccc
+    synccond_broadcast(&conf->cond_restart_bricks);
9f5ccc
 
9f5ccc
     return ret;
9f5ccc
 }
9f5ccc
diff --git a/xlators/mgmt/glusterd/src/glusterd-syncop.h b/xlators/mgmt/glusterd/src/glusterd-syncop.h
9f5ccc
index ce4a940..a265f21 100644
9f5ccc
--- a/xlators/mgmt/glusterd/src/glusterd-syncop.h
9f5ccc
+++ b/xlators/mgmt/glusterd/src/glusterd-syncop.h
9f5ccc
@@ -32,7 +32,7 @@
9f5ccc
         ret = gd_syncop_submit_request(rpc, req, stb, cookie, prog, procnum,   \
9f5ccc
                                        cbk, (xdrproc_t)xdrproc);               \
9f5ccc
         if (!ret)                                                              \
9f5ccc
-            synctask_yield(stb->task);                                         \
9f5ccc
+            synctask_yield(stb->task, NULL);                                   \
9f5ccc
         else                                                                   \
9f5ccc
             gf_asprintf(&stb->errstr,                                          \
9f5ccc
                         "%s failed. Check log file"                            \
9f5ccc
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
9f5ccc
index 812c698..ce9931c 100644
9f5ccc
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
9f5ccc
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
9f5ccc
@@ -5068,22 +5068,22 @@ glusterd_import_friend_volumes_synctask(void *opaque)
9f5ccc
      * restarted (refer glusterd_restart_bricks ())
9f5ccc
      */
9f5ccc
     while (conf->restart_bricks) {
9f5ccc
-        synclock_unlock(&conf->big_lock);
9f5ccc
-        sleep(2);
9f5ccc
-        synclock_lock(&conf->big_lock);
9f5ccc
+        synccond_wait(&conf->cond_restart_bricks, &conf->big_lock);
9f5ccc
     }
9f5ccc
     conf->restart_bricks = _gf_true;
9f5ccc
 
9f5ccc
     while (i <= count) {
9f5ccc
         ret = glusterd_import_friend_volume(peer_data, i);
9f5ccc
         if (ret) {
9f5ccc
-            conf->restart_bricks = _gf_false;
9f5ccc
-            goto out;
9f5ccc
+            break;
9f5ccc
         }
9f5ccc
         i++;
9f5ccc
     }
9f5ccc
-    glusterd_svcs_manager(NULL);
9f5ccc
+    if (i > count) {
9f5ccc
+        glusterd_svcs_manager(NULL);
9f5ccc
+    }
9f5ccc
     conf->restart_bricks = _gf_false;
9f5ccc
+    synccond_broadcast(&conf->cond_restart_bricks);
9f5ccc
 out:
9f5ccc
     if (peer_data)
9f5ccc
         dict_unref(peer_data);
9f5ccc
@@ -5769,7 +5769,9 @@ my_callback(struct rpc_req *req, struct iovec *iov, int count, void *v_frame)
9f5ccc
     call_frame_t *frame = v_frame;
9f5ccc
     glusterd_conf_t *conf = frame->this->private;
9f5ccc
 
9f5ccc
-    GF_ATOMIC_DEC(conf->blockers);
9f5ccc
+    if (GF_ATOMIC_DEC(conf->blockers) == 0) {
9f5ccc
+        synccond_broadcast(&conf->cond_blockers);
9f5ccc
+    }
9f5ccc
 
9f5ccc
     STACK_DESTROY(frame->root);
9f5ccc
     return 0;
9f5ccc
@@ -5865,7 +5867,9 @@ attach_brick_callback(struct rpc_req *req, struct iovec *iov, int count,
9f5ccc
         }
9f5ccc
     }
9f5ccc
 out:
9f5ccc
-    GF_ATOMIC_DEC(conf->blockers);
9f5ccc
+    if (GF_ATOMIC_DEC(conf->blockers) == 0) {
9f5ccc
+        synccond_broadcast(&conf->cond_blockers);
9f5ccc
+    }
9f5ccc
     STACK_DESTROY(frame->root);
9f5ccc
     return 0;
9f5ccc
 }
9f5ccc
@@ -6053,7 +6057,7 @@ attach_brick(xlator_t *this, glusterd_brickinfo_t *brickinfo,
9f5ccc
          * TBD: see if there's a better way
9f5ccc
          */
9f5ccc
         synclock_unlock(&conf->big_lock);
9f5ccc
-        sleep(1);
9f5ccc
+        synctask_sleep(1);
9f5ccc
         synclock_lock(&conf->big_lock);
9f5ccc
     }
9f5ccc
 
9f5ccc
@@ -6193,7 +6197,7 @@ find_compat_brick_in_vol(glusterd_conf_t *conf,
9f5ccc
                          "brick %s is still"
9f5ccc
                          " starting, waiting for 2 seconds ",
9f5ccc
                          other_brick->path);
9f5ccc
-            sleep(2);
9f5ccc
+            synctask_sleep(2);
9f5ccc
             synclock_lock(&conf->big_lock);
9f5ccc
             retries--;
9f5ccc
         }
9f5ccc
@@ -6680,9 +6684,7 @@ glusterd_restart_bricks(void *opaque)
9f5ccc
      * glusterd_compare_friend_data ())
9f5ccc
      */
9f5ccc
     while (conf->restart_bricks) {
9f5ccc
-        synclock_unlock(&conf->big_lock);
9f5ccc
-        sleep(2);
9f5ccc
-        synclock_lock(&conf->big_lock);
9f5ccc
+        synccond_wait(&conf->cond_restart_bricks, &conf->big_lock);
9f5ccc
     }
9f5ccc
     conf->restart_bricks = _gf_true;
9f5ccc
 
9f5ccc
@@ -6798,6 +6800,7 @@ out:
9f5ccc
     GF_ATOMIC_DEC(conf->blockers);
9f5ccc
     conf->restart_done = _gf_true;
9f5ccc
     conf->restart_bricks = _gf_false;
9f5ccc
+    synccond_broadcast(&conf->cond_restart_bricks);
9f5ccc
 
9f5ccc
 return_block:
9f5ccc
     return ret;
9f5ccc
diff --git a/xlators/mgmt/glusterd/src/glusterd.c b/xlators/mgmt/glusterd/src/glusterd.c
9f5ccc
index d360312..a01034a 100644
9f5ccc
--- a/xlators/mgmt/glusterd/src/glusterd.c
9f5ccc
+++ b/xlators/mgmt/glusterd/src/glusterd.c
9f5ccc
@@ -1845,6 +1845,8 @@ init(xlator_t *this)
9f5ccc
     (void)strncpy(conf->rundir, rundir, sizeof(conf->rundir));
9f5ccc
 
9f5ccc
     synclock_init(&conf->big_lock, SYNC_LOCK_RECURSIVE);
9f5ccc
+    synccond_init(&conf->cond_restart_bricks);
9f5ccc
+    synccond_init(&conf->cond_blockers);
9f5ccc
     pthread_mutex_init(&conf->xprt_lock, NULL);
9f5ccc
     INIT_LIST_HEAD(&conf->xprt_list);
9f5ccc
     pthread_mutex_init(&conf->import_volumes, NULL);
9f5ccc
diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h
9f5ccc
index 2be005c..1c6c3b1 100644
9f5ccc
--- a/xlators/mgmt/glusterd/src/glusterd.h
9f5ccc
+++ b/xlators/mgmt/glusterd/src/glusterd.h
9f5ccc
@@ -209,6 +209,8 @@ typedef struct {
9f5ccc
     dict_t *opts;
9f5ccc
     synclock_t big_lock;
9f5ccc
     gf_boolean_t restart_done;
9f5ccc
+    synccond_t cond_restart_bricks;
9f5ccc
+    synccond_t cond_blockers;
9f5ccc
     rpcsvc_t *uds_rpc; /* RPCSVC for the unix domain socket */
9f5ccc
     uint32_t base_port;
9f5ccc
     uint32_t max_port;
9f5ccc
-- 
9f5ccc
1.8.3.1
9f5ccc