e3c68b
From 66600fb55522d405a68d7340a5680a2633c4237e Mon Sep 17 00:00:00 2001
e3c68b
From: Xavi Hernandez <xhernandez@redhat.com>
e3c68b
Date: Thu, 30 Apr 2020 11:19:01 +0200
e3c68b
Subject: [PATCH 377/379] syncop: improve scaling and implement more tools
e3c68b
e3c68b
The current scaling of the syncop thread pool is not working properly
e3c68b
and can leave some tasks in the run queue longer than necessary
e3c68b
when the maximum number of threads is not reached.
e3c68b
e3c68b
This patch provides a better scaling condition to react faster to
e3c68b
pending work.
e3c68b
e3c68b
Condition variables and sleep in the context of a synctask have also
e3c68b
been implemented. Their purpose is to replace regular condition
e3c68b
variables and sleeps that block synctask threads and prevent other
e3c68b
tasks from being executed.
e3c68b
e3c68b
The new features have been applied to several places in glusterd.
e3c68b
e3c68b
upstream patch: https://review.gluster.org/#/c/glusterfs/+/24396/
e3c68b
e3c68b
> Change-Id: Ic50b7c73c104f9e41f08101a357d30b95efccfbf
e3c68b
> Fixes: #1116
e3c68b
> Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
e3c68b
e3c68b
Change-Id: Ic50b7c73c104f9e41f08101a357d30b95efccfbf
e3c68b
BUG: 1810516
e3c68b
Signed-off-by: Sanju Rakonde <srakonde@redhat.com>
e3c68b
Reviewed-on: https://code.engineering.redhat.com/gerrit/200409
e3c68b
Tested-by: Sanju Rakonde <srakonde@redhat.com>
e3c68b
Tested-by: RHGS Build Bot <nigelb@redhat.com>
e3c68b
Reviewed-by: Xavi Hernandez Juan <xhernandez@redhat.com>
e3c68b
---
e3c68b
 libglusterfs/src/glusterfs/syncop.h                |  52 +++-
e3c68b
 libglusterfs/src/libglusterfs.sym                  |   7 +
e3c68b
 libglusterfs/src/syncop.c                          | 306 ++++++++++++++++-----
e3c68b
 xlators/cluster/dht/src/dht-rebalance.c            |   2 +-
e3c68b
 xlators/mgmt/glusterd/src/glusterd-op-sm.c         |   9 +-
e3c68b
 xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c     |   2 +-
e3c68b
 .../mgmt/glusterd/src/glusterd-snapshot-utils.c    |   5 +-
e3c68b
 xlators/mgmt/glusterd/src/glusterd-syncop.h        |   2 +-
e3c68b
 xlators/mgmt/glusterd/src/glusterd-utils.c         |  29 +-
e3c68b
 xlators/mgmt/glusterd/src/glusterd.c               |   2 +
e3c68b
 xlators/mgmt/glusterd/src/glusterd.h               |   2 +
e3c68b
 11 files changed, 317 insertions(+), 101 deletions(-)
e3c68b
e3c68b
diff --git a/libglusterfs/src/glusterfs/syncop.h b/libglusterfs/src/glusterfs/syncop.h
e3c68b
index e0f1017..3011b4c 100644
e3c68b
--- a/libglusterfs/src/glusterfs/syncop.h
e3c68b
+++ b/libglusterfs/src/glusterfs/syncop.h
e3c68b
@@ -15,6 +15,7 @@
e3c68b
 #include <sys/time.h>
e3c68b
 #include <pthread.h>
e3c68b
 #include <ucontext.h>
e3c68b
+#include "glusterfs/timer.h"
e3c68b
 
e3c68b
 #define SYNCENV_PROC_MAX 16
e3c68b
 #define SYNCENV_PROC_MIN 2
e3c68b
@@ -32,6 +33,7 @@
e3c68b
 struct synctask;
e3c68b
 struct syncproc;
e3c68b
 struct syncenv;
e3c68b
+struct synccond;
e3c68b
 
e3c68b
 typedef int (*synctask_cbk_t)(int ret, call_frame_t *frame, void *opaque);
e3c68b
 
e3c68b
@@ -55,9 +57,12 @@ struct synctask {
e3c68b
     call_frame_t *opframe;
e3c68b
     synctask_cbk_t synccbk;
e3c68b
     synctask_fn_t syncfn;
e3c68b
-    synctask_state_t state;
e3c68b
+    struct timespec *delta;
e3c68b
+    gf_timer_t *timer;
e3c68b
+    struct synccond *synccond;
e3c68b
     void *opaque;
e3c68b
     void *stack;
e3c68b
+    synctask_state_t state;
e3c68b
     int woken;
e3c68b
     int slept;
e3c68b
     int ret;
e3c68b
@@ -85,19 +90,21 @@ struct syncproc {
e3c68b
 /* hosts the scheduler thread and framework for executing synctasks */
e3c68b
 struct syncenv {
e3c68b
     struct syncproc proc[SYNCENV_PROC_MAX];
e3c68b
-    int procs;
e3c68b
+
e3c68b
+    pthread_mutex_t mutex;
e3c68b
+    pthread_cond_t cond;
e3c68b
 
e3c68b
     struct list_head runq;
e3c68b
-    int runcount;
e3c68b
     struct list_head waitq;
e3c68b
-    int waitcount;
e3c68b
+
e3c68b
+    int procs;
e3c68b
+    int procs_idle;
e3c68b
+
e3c68b
+    int runcount;
e3c68b
 
e3c68b
     int procmin;
e3c68b
     int procmax;
e3c68b
 
e3c68b
-    pthread_mutex_t mutex;
e3c68b
-    pthread_cond_t cond;
e3c68b
-
e3c68b
     size_t stacksize;
e3c68b
 
e3c68b
     int destroy; /* FLAG to mark syncenv is in destroy mode
e3c68b
@@ -123,6 +130,13 @@ struct synclock {
e3c68b
 };
e3c68b
 typedef struct synclock synclock_t;
e3c68b
 
e3c68b
+struct synccond {
e3c68b
+    pthread_mutex_t pmutex;
e3c68b
+    pthread_cond_t pcond;
e3c68b
+    struct list_head waitq;
e3c68b
+};
e3c68b
+typedef struct synccond synccond_t;
e3c68b
+
e3c68b
 struct syncbarrier {
e3c68b
     gf_boolean_t initialized; /*Set on successful initialization*/
e3c68b
     pthread_mutex_t guard;    /* guard the remaining members, pair @cond */
e3c68b
@@ -219,7 +233,7 @@ struct syncopctx {
e3c68b
 #define __yield(args)                                                          \
e3c68b
     do {                                                                       \
e3c68b
         if (args->task) {                                                      \
e3c68b
-            synctask_yield(args->task);                                        \
e3c68b
+            synctask_yield(args->task, NULL);                                  \
e3c68b
         } else {                                                               \
e3c68b
             pthread_mutex_lock(&args->mutex);                                  \
e3c68b
             {                                                                  \
e3c68b
@@ -307,7 +321,9 @@ synctask_join(struct synctask *task);
e3c68b
 void
e3c68b
 synctask_wake(struct synctask *task);
e3c68b
 void
e3c68b
-synctask_yield(struct synctask *task);
e3c68b
+synctask_yield(struct synctask *task, struct timespec *delta);
e3c68b
+void
e3c68b
+synctask_sleep(int32_t secs);
e3c68b
 void
e3c68b
 synctask_waitfor(struct synctask *task, int count);
e3c68b
 
e3c68b
@@ -405,6 +421,24 @@ synclock_trylock(synclock_t *lock);
e3c68b
 int
e3c68b
 synclock_unlock(synclock_t *lock);
e3c68b
 
e3c68b
+int32_t
e3c68b
+synccond_init(synccond_t *cond);
e3c68b
+
e3c68b
+void
e3c68b
+synccond_destroy(synccond_t *cond);
e3c68b
+
e3c68b
+int
e3c68b
+synccond_wait(synccond_t *cond, synclock_t *lock);
e3c68b
+
e3c68b
+int
e3c68b
+synccond_timedwait(synccond_t *cond, synclock_t *lock, struct timespec *delta);
e3c68b
+
e3c68b
+void
e3c68b
+synccond_signal(synccond_t *cond);
e3c68b
+
e3c68b
+void
e3c68b
+synccond_broadcast(synccond_t *cond);
e3c68b
+
e3c68b
 int
e3c68b
 syncbarrier_init(syncbarrier_t *barrier);
e3c68b
 int
e3c68b
diff --git a/libglusterfs/src/libglusterfs.sym b/libglusterfs/src/libglusterfs.sym
e3c68b
index 467a1b7..5a721e0 100644
e3c68b
--- a/libglusterfs/src/libglusterfs.sym
e3c68b
+++ b/libglusterfs/src/libglusterfs.sym
e3c68b
@@ -938,6 +938,12 @@ syncbarrier_destroy
e3c68b
 syncbarrier_init
e3c68b
 syncbarrier_wait
e3c68b
 syncbarrier_wake
e3c68b
+synccond_init
e3c68b
+synccond_destroy
e3c68b
+synccond_wait
e3c68b
+synccond_timedwait
e3c68b
+synccond_signal
e3c68b
+synccond_broadcast
e3c68b
 syncenv_destroy
e3c68b
 syncenv_new
e3c68b
 synclock_destroy
e3c68b
@@ -1015,6 +1021,7 @@ synctask_new
e3c68b
 synctask_new1
e3c68b
 synctask_set
e3c68b
 synctask_setid
e3c68b
+synctask_sleep
e3c68b
 synctask_wake
e3c68b
 synctask_yield
e3c68b
 sys_access
e3c68b
diff --git a/libglusterfs/src/syncop.c b/libglusterfs/src/syncop.c
e3c68b
index 693970f..71d37b7 100644
e3c68b
--- a/libglusterfs/src/syncop.c
e3c68b
+++ b/libglusterfs/src/syncop.c
e3c68b
@@ -154,10 +154,14 @@ out:
e3c68b
     return ret;
e3c68b
 }
e3c68b
 
e3c68b
+void *
e3c68b
+syncenv_processor(void *thdata);
e3c68b
+
e3c68b
 static void
e3c68b
 __run(struct synctask *task)
e3c68b
 {
e3c68b
     struct syncenv *env = NULL;
e3c68b
+    int32_t total, ret, i;
e3c68b
 
e3c68b
     env = task->env;
e3c68b
 
e3c68b
@@ -173,7 +177,6 @@ __run(struct synctask *task)
e3c68b
             env->runcount--;
e3c68b
             break;
e3c68b
         case SYNCTASK_WAIT:
e3c68b
-            env->waitcount--;
e3c68b
             break;
e3c68b
         case SYNCTASK_DONE:
e3c68b
             gf_msg(task->xl->name, GF_LOG_WARNING, 0, LG_MSG_COMPLETED_TASK,
e3c68b
@@ -187,8 +190,27 @@ __run(struct synctask *task)
e3c68b
     }
e3c68b
 
e3c68b
     list_add_tail(&task->all_tasks, &env->runq);
e3c68b
-    env->runcount++;
e3c68b
     task->state = SYNCTASK_RUN;
e3c68b
+
e3c68b
+    env->runcount++;
e3c68b
+
e3c68b
+    total = env->procs + env->runcount - env->procs_idle;
e3c68b
+    if (total > env->procmax) {
e3c68b
+        total = env->procmax;
e3c68b
+    }
e3c68b
+    if (total > env->procs) {
e3c68b
+        for (i = 0; i < env->procmax; i++) {
e3c68b
+            if (env->proc[i].env == NULL) {
e3c68b
+                env->proc[i].env = env;
e3c68b
+                ret = gf_thread_create(&env->proc[i].processor, NULL,
e3c68b
+                                       syncenv_processor, &env->proc[i],
e3c68b
+                                       "sproc%d", i);
e3c68b
+                if ((ret < 0) || (++env->procs >= total)) {
e3c68b
+                    break;
e3c68b
+                }
e3c68b
+            }
e3c68b
+        }
e3c68b
+    }
e3c68b
 }
e3c68b
 
e3c68b
 static void
e3c68b
@@ -210,7 +232,6 @@ __wait(struct synctask *task)
e3c68b
             gf_msg(task->xl->name, GF_LOG_WARNING, 0, LG_MSG_REWAITING_TASK,
e3c68b
                    "re-waiting already waiting "
e3c68b
                    "task");
e3c68b
-            env->waitcount--;
e3c68b
             break;
e3c68b
         case SYNCTASK_DONE:
e3c68b
             gf_msg(task->xl->name, GF_LOG_WARNING, 0, LG_MSG_COMPLETED_TASK,
e3c68b
@@ -223,12 +244,11 @@ __wait(struct synctask *task)
e3c68b
     }
e3c68b
 
e3c68b
     list_add_tail(&task->all_tasks, &env->waitq);
e3c68b
-    env->waitcount++;
e3c68b
     task->state = SYNCTASK_WAIT;
e3c68b
 }
e3c68b
 
e3c68b
 void
e3c68b
-synctask_yield(struct synctask *task)
e3c68b
+synctask_yield(struct synctask *task, struct timespec *delta)
e3c68b
 {
e3c68b
     xlator_t *oldTHIS = THIS;
e3c68b
 
e3c68b
@@ -237,6 +257,8 @@ synctask_yield(struct synctask *task)
e3c68b
     task->proc->sched.uc_flags &= ~_UC_TLSBASE;
e3c68b
 #endif
e3c68b
 
e3c68b
+    task->delta = delta;
e3c68b
+
e3c68b
     if (task->state != SYNCTASK_DONE) {
e3c68b
         task->state = SYNCTASK_SUSPEND;
e3c68b
     }
e3c68b
@@ -249,6 +271,35 @@ synctask_yield(struct synctask *task)
e3c68b
 }
e3c68b
 
e3c68b
 void
e3c68b
+synctask_sleep(int32_t secs)
e3c68b
+{
e3c68b
+    struct timespec delta;
e3c68b
+    struct synctask *task;
e3c68b
+
e3c68b
+    task = synctask_get();
e3c68b
+
e3c68b
+    if (task == NULL) {
e3c68b
+        sleep(secs);
e3c68b
+    } else {
e3c68b
+        delta.tv_sec = secs;
e3c68b
+        delta.tv_nsec = 0;
e3c68b
+
e3c68b
+        synctask_yield(task, &delta);
e3c68b
+    }
e3c68b
+}
e3c68b
+
e3c68b
+static void
e3c68b
+__synctask_wake(struct synctask *task)
e3c68b
+{
e3c68b
+    task->woken = 1;
e3c68b
+
e3c68b
+    if (task->slept)
e3c68b
+        __run(task);
e3c68b
+
e3c68b
+    pthread_cond_broadcast(&task->env->cond);
e3c68b
+}
e3c68b
+
e3c68b
+void
e3c68b
 synctask_wake(struct synctask *task)
e3c68b
 {
e3c68b
     struct syncenv *env = NULL;
e3c68b
@@ -257,13 +308,18 @@ synctask_wake(struct synctask *task)
e3c68b
 
e3c68b
     pthread_mutex_lock(&env->mutex);
e3c68b
     {
e3c68b
-        task->woken = 1;
e3c68b
+        if (task->timer != NULL) {
e3c68b
+            if (gf_timer_call_cancel(task->xl->ctx, task->timer) != 0) {
e3c68b
+                goto unlock;
e3c68b
+            }
e3c68b
 
e3c68b
-        if (task->slept)
e3c68b
-            __run(task);
e3c68b
+            task->timer = NULL;
e3c68b
+            task->synccond = NULL;
e3c68b
+        }
e3c68b
 
e3c68b
-        pthread_cond_broadcast(&env->cond);
e3c68b
+        __synctask_wake(task);
e3c68b
     }
e3c68b
+unlock:
e3c68b
     pthread_mutex_unlock(&env->mutex);
e3c68b
 }
e3c68b
 
e3c68b
@@ -282,7 +338,7 @@ synctask_wrap(void)
e3c68b
 
e3c68b
     task->state = SYNCTASK_DONE;
e3c68b
 
e3c68b
-    synctask_yield(task);
e3c68b
+    synctask_yield(task, NULL);
e3c68b
 }
e3c68b
 
e3c68b
 void
e3c68b
@@ -422,11 +478,6 @@ synctask_create(struct syncenv *env, size_t stacksize, synctask_fn_t fn,
e3c68b
     }
e3c68b
 
e3c68b
     synctask_wake(newtask);
e3c68b
-    /*
e3c68b
-     * Make sure someone's there to execute anything we just put on the
e3c68b
-     * run queue.
e3c68b
-     */
e3c68b
-    syncenv_scale(env);
e3c68b
 
e3c68b
     return newtask;
e3c68b
 err:
e3c68b
@@ -520,8 +571,12 @@ syncenv_task(struct syncproc *proc)
e3c68b
                 goto unlock;
e3c68b
             }
e3c68b
 
e3c68b
+            env->procs_idle++;
e3c68b
+
e3c68b
             sleep_till.tv_sec = time(NULL) + SYNCPROC_IDLE_TIME;
e3c68b
             ret = pthread_cond_timedwait(&env->cond, &env->mutex, &sleep_till);
e3c68b
+
e3c68b
+            env->procs_idle--;
e3c68b
         }
e3c68b
 
e3c68b
         task = list_entry(env->runq.next, struct synctask, all_tasks);
e3c68b
@@ -540,6 +595,34 @@ unlock:
e3c68b
     return task;
e3c68b
 }
e3c68b
 
e3c68b
+static void
e3c68b
+synctask_timer(void *data)
e3c68b
+{
e3c68b
+    struct synctask *task = data;
e3c68b
+    struct synccond *cond;
e3c68b
+
e3c68b
+    cond = task->synccond;
e3c68b
+    if (cond != NULL) {
e3c68b
+        pthread_mutex_lock(&cond->pmutex);
e3c68b
+
e3c68b
+        list_del_init(&task->waitq);
e3c68b
+        task->synccond = NULL;
e3c68b
+
e3c68b
+        pthread_mutex_unlock(&cond->pmutex);
e3c68b
+
e3c68b
+        task->ret = -ETIMEDOUT;
e3c68b
+    }
e3c68b
+
e3c68b
+    pthread_mutex_lock(&task->env->mutex);
e3c68b
+
e3c68b
+    gf_timer_call_cancel(task->xl->ctx, task->timer);
e3c68b
+    task->timer = NULL;
e3c68b
+
e3c68b
+    __synctask_wake(task);
e3c68b
+
e3c68b
+    pthread_mutex_unlock(&task->env->mutex);
e3c68b
+}
e3c68b
+
e3c68b
 void
e3c68b
 synctask_switchto(struct synctask *task)
e3c68b
 {
e3c68b
@@ -572,7 +655,14 @@ synctask_switchto(struct synctask *task)
e3c68b
         } else {
e3c68b
             task->slept = 1;
e3c68b
             __wait(task);
e3c68b
+
e3c68b
+            if (task->delta != NULL) {
e3c68b
+                task->timer = gf_timer_call_after(task->xl->ctx, *task->delta,
e3c68b
+                                                  synctask_timer, task);
e3c68b
+            }
e3c68b
         }
e3c68b
+
e3c68b
+        task->delta = NULL;
e3c68b
     }
e3c68b
     pthread_mutex_unlock(&env->mutex);
e3c68b
 }
e3c68b
@@ -580,65 +670,18 @@ synctask_switchto(struct synctask *task)
e3c68b
 void *
e3c68b
 syncenv_processor(void *thdata)
e3c68b
 {
e3c68b
-    struct syncenv *env = NULL;
e3c68b
     struct syncproc *proc = NULL;
e3c68b
     struct synctask *task = NULL;
e3c68b
 
e3c68b
     proc = thdata;
e3c68b
-    env = proc->env;
e3c68b
-
e3c68b
-    for (;;) {
e3c68b
-        task = syncenv_task(proc);
e3c68b
-        if (!task)
e3c68b
-            break;
e3c68b
 
e3c68b
+    while ((task = syncenv_task(proc)) != NULL) {
e3c68b
         synctask_switchto(task);
e3c68b
-
e3c68b
-        syncenv_scale(env);
e3c68b
     }
e3c68b
 
e3c68b
     return NULL;
e3c68b
 }
e3c68b
 
e3c68b
-void
e3c68b
-syncenv_scale(struct syncenv *env)
e3c68b
-{
e3c68b
-    int diff = 0;
e3c68b
-    int scale = 0;
e3c68b
-    int i = 0;
e3c68b
-    int ret = 0;
e3c68b
-
e3c68b
-    pthread_mutex_lock(&env->mutex);
e3c68b
-    {
e3c68b
-        if (env->procs > env->runcount)
e3c68b
-            goto unlock;
e3c68b
-
e3c68b
-        scale = env->runcount;
e3c68b
-        if (scale > env->procmax)
e3c68b
-            scale = env->procmax;
e3c68b
-        if (scale > env->procs)
e3c68b
-            diff = scale - env->procs;
e3c68b
-        while (diff) {
e3c68b
-            diff--;
e3c68b
-            for (; (i < env->procmax); i++) {
e3c68b
-                if (env->proc[i].processor == 0)
e3c68b
-                    break;
e3c68b
-            }
e3c68b
-
e3c68b
-            env->proc[i].env = env;
e3c68b
-            ret = gf_thread_create(&env->proc[i].processor, NULL,
e3c68b
-                                   syncenv_processor, &env->proc[i],
e3c68b
-                                   "sproc%03hx", env->procs & 0x3ff);
e3c68b
-            if (ret)
e3c68b
-                break;
e3c68b
-            env->procs++;
e3c68b
-            i++;
e3c68b
-        }
e3c68b
-    }
e3c68b
-unlock:
e3c68b
-    pthread_mutex_unlock(&env->mutex);
e3c68b
-}
e3c68b
-
e3c68b
 /* The syncenv threads are cleaned up in this routine.
e3c68b
  */
e3c68b
 void
e3c68b
@@ -715,12 +758,13 @@ syncenv_new(size_t stacksize, int procmin, int procmax)
e3c68b
         newenv->stacksize = stacksize;
e3c68b
     newenv->procmin = procmin;
e3c68b
     newenv->procmax = procmax;
e3c68b
+    newenv->procs_idle = 0;
e3c68b
 
e3c68b
     for (i = 0; i < newenv->procmin; i++) {
e3c68b
         newenv->proc[i].env = newenv;
e3c68b
         ret = gf_thread_create(&newenv->proc[i].processor, NULL,
e3c68b
                                syncenv_processor, &newenv->proc[i], "sproc%d",
e3c68b
-                               newenv->procs);
e3c68b
+                               i);
e3c68b
         if (ret)
e3c68b
             break;
e3c68b
         newenv->procs++;
e3c68b
@@ -810,7 +854,7 @@ __synclock_lock(struct synclock *lock)
e3c68b
             task->woken = 0;
e3c68b
             list_add_tail(&task->waitq, &lock->waitq);
e3c68b
             pthread_mutex_unlock(&lock->guard);
e3c68b
-            synctask_yield(task);
e3c68b
+            synctask_yield(task, NULL);
e3c68b
             /* task is removed from waitq in unlock,
e3c68b
              * under lock->guard.*/
e3c68b
             pthread_mutex_lock(&lock->guard);
e3c68b
@@ -963,6 +1007,136 @@ synclock_unlock(synclock_t *lock)
e3c68b
     return ret;
e3c68b
 }
e3c68b
 
e3c68b
+/* Condition variables */
e3c68b
+
e3c68b
+int32_t
e3c68b
+synccond_init(synccond_t *cond)
e3c68b
+{
e3c68b
+    int32_t ret;
e3c68b
+
e3c68b
+    INIT_LIST_HEAD(&cond->waitq);
e3c68b
+
e3c68b
+    ret = pthread_mutex_init(&cond->pmutex, NULL);
e3c68b
+    if (ret != 0) {
e3c68b
+        return -ret;
e3c68b
+    }
e3c68b
+
e3c68b
+    ret = pthread_cond_init(&cond->pcond, NULL);
e3c68b
+    if (ret != 0) {
e3c68b
+        pthread_mutex_destroy(&cond->pmutex);
e3c68b
+    }
e3c68b
+
e3c68b
+    return -ret;
e3c68b
+}
e3c68b
+
e3c68b
+void
e3c68b
+synccond_destroy(synccond_t *cond)
e3c68b
+{
e3c68b
+    pthread_cond_destroy(&cond->pcond);
e3c68b
+    pthread_mutex_destroy(&cond->pmutex);
e3c68b
+}
e3c68b
+
e3c68b
+int
e3c68b
+synccond_timedwait(synccond_t *cond, synclock_t *lock, struct timespec *delta)
e3c68b
+{
e3c68b
+    struct timespec now;
e3c68b
+    struct synctask *task = NULL;
e3c68b
+    int ret;
e3c68b
+
e3c68b
+    task = synctask_get();
e3c68b
+
e3c68b
+    if (task == NULL) {
e3c68b
+        if (delta != NULL) {
e3c68b
+            timespec_now_realtime(&now);
e3c68b
+            timespec_adjust_delta(&now, *delta);
e3c68b
+        }
e3c68b
+
e3c68b
+        pthread_mutex_lock(&cond->pmutex);
e3c68b
+
e3c68b
+        if (delta == NULL) {
e3c68b
+            ret = -pthread_cond_wait(&cond->pcond, &cond->pmutex);
e3c68b
+        } else {
e3c68b
+            ret = -pthread_cond_timedwait(&cond->pcond, &cond->pmutex, &now);
e3c68b
+        }
e3c68b
+    } else {
e3c68b
+        pthread_mutex_lock(&cond->pmutex);
e3c68b
+
e3c68b
+        list_add_tail(&task->waitq, &cond->waitq);
e3c68b
+        task->synccond = cond;
e3c68b
+
e3c68b
+        ret = synclock_unlock(lock);
e3c68b
+        if (ret == 0) {
e3c68b
+            pthread_mutex_unlock(&cond->pmutex);
e3c68b
+
e3c68b
+            synctask_yield(task, delta);
e3c68b
+
e3c68b
+            ret = synclock_lock(lock);
e3c68b
+            if (ret == 0) {
e3c68b
+                ret = task->ret;
e3c68b
+            }
e3c68b
+            task->ret = 0;
e3c68b
+
e3c68b
+            return ret;
e3c68b
+        }
e3c68b
+
e3c68b
+        list_del_init(&task->waitq);
e3c68b
+    }
e3c68b
+
e3c68b
+    pthread_mutex_unlock(&cond->pmutex);
e3c68b
+
e3c68b
+    return ret;
e3c68b
+}
e3c68b
+
e3c68b
+int
e3c68b
+synccond_wait(synccond_t *cond, synclock_t *lock)
e3c68b
+{
e3c68b
+    return synccond_timedwait(cond, lock, NULL);
e3c68b
+}
e3c68b
+
e3c68b
+void
e3c68b
+synccond_signal(synccond_t *cond)
e3c68b
+{
e3c68b
+    struct synctask *task;
e3c68b
+
e3c68b
+    pthread_mutex_lock(&cond->pmutex);
e3c68b
+
e3c68b
+    if (!list_empty(&cond->waitq)) {
e3c68b
+        task = list_first_entry(&cond->waitq, struct synctask, waitq);
e3c68b
+        list_del_init(&task->waitq);
e3c68b
+
e3c68b
+        pthread_mutex_unlock(&cond->pmutex);
e3c68b
+
e3c68b
+        synctask_wake(task);
e3c68b
+    } else {
e3c68b
+        pthread_cond_signal(&cond->pcond);
e3c68b
+
e3c68b
+        pthread_mutex_unlock(&cond->pmutex);
e3c68b
+    }
e3c68b
+}
e3c68b
+
e3c68b
+void
e3c68b
+synccond_broadcast(synccond_t *cond)
e3c68b
+{
e3c68b
+    struct list_head list;
e3c68b
+    struct synctask *task;
e3c68b
+
e3c68b
+    INIT_LIST_HEAD(&list);
e3c68b
+
e3c68b
+    pthread_mutex_lock(&cond->pmutex);
e3c68b
+
e3c68b
+    list_splice_init(&cond->waitq, &list);
e3c68b
+    pthread_cond_broadcast(&cond->pcond);
e3c68b
+
e3c68b
+    pthread_mutex_unlock(&cond->pmutex);
e3c68b
+
e3c68b
+    while (!list_empty(&list)) {
e3c68b
+        task = list_first_entry(&list, struct synctask, waitq);
e3c68b
+        list_del_init(&task->waitq);
e3c68b
+
e3c68b
+        synctask_wake(task);
e3c68b
+    }
e3c68b
+}
e3c68b
+
e3c68b
 /* Barriers */
e3c68b
 
e3c68b
 int
e3c68b
@@ -1032,7 +1206,7 @@ __syncbarrier_wait(struct syncbarrier *barrier, int waitfor)
e3c68b
             /* called within a synctask */
e3c68b
             list_add_tail(&task->waitq, &barrier->waitq);
e3c68b
             pthread_mutex_unlock(&barrier->guard);
e3c68b
-            synctask_yield(task);
e3c68b
+            synctask_yield(task, NULL);
e3c68b
             pthread_mutex_lock(&barrier->guard);
e3c68b
         } else {
e3c68b
             /* called by a non-synctask */
e3c68b
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
e3c68b
index c692119..957deaa 100644
e3c68b
--- a/xlators/cluster/dht/src/dht-rebalance.c
e3c68b
+++ b/xlators/cluster/dht/src/dht-rebalance.c
e3c68b
@@ -5224,7 +5224,7 @@ gf_defrag_pause_tier(xlator_t *this, gf_defrag_info_t *defrag)
e3c68b
     defrag->tier_conf.pause_timer = gf_timer_call_after(
e3c68b
         this->ctx, delta, gf_defrag_pause_tier_timeout, this);
e3c68b
 
e3c68b
-    synctask_yield(defrag->tier_conf.pause_synctask);
e3c68b
+    synctask_yield(defrag->tier_conf.pause_synctask, NULL);
e3c68b
 
e3c68b
     if (gf_defrag_get_pause_state(&defrag->tier_conf) == TIER_PAUSED)
e3c68b
         goto out;
e3c68b
diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
e3c68b
index 0d29de2..6475611 100644
e3c68b
--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c
e3c68b
+++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
e3c68b
@@ -6076,13 +6076,8 @@ glusterd_op_stage_validate(glusterd_op_t op, dict_t *dict, char **op_errstr,
e3c68b
 static void
e3c68b
 glusterd_wait_for_blockers(glusterd_conf_t *priv)
e3c68b
 {
e3c68b
-    uint64_t blockers = GF_ATOMIC_GET(priv->blockers);
e3c68b
-
e3c68b
-    while (blockers) {
e3c68b
-        synclock_unlock(&priv->big_lock);
e3c68b
-        sleep(1);
e3c68b
-        blockers = GF_ATOMIC_GET(priv->blockers);
e3c68b
-        synclock_lock(&priv->big_lock);
e3c68b
+    while (GF_ATOMIC_GET(priv->blockers)) {
e3c68b
+        synccond_wait(&priv->cond_blockers, &priv->big_lock);
e3c68b
     }
e3c68b
 }
e3c68b
 
e3c68b
diff --git a/xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c b/xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c
e3c68b
index 36018a0..f55a5fd 100644
e3c68b
--- a/xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c
e3c68b
+++ b/xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c
e3c68b
@@ -112,7 +112,7 @@ glusterd_proc_stop(glusterd_proc_t *proc, int sig, int flags)
e3c68b
         goto out;
e3c68b
 
e3c68b
     synclock_unlock(&conf->big_lock);
e3c68b
-    sleep(1);
e3c68b
+    synctask_sleep(1);
e3c68b
     synclock_lock(&conf->big_lock);
e3c68b
     if (gf_is_service_running(proc->pidfile, &pid)) {
e3c68b
         ret = kill(pid, SIGKILL);
e3c68b
diff --git a/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c b/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c
e3c68b
index d225854..386eed2 100644
e3c68b
--- a/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c
e3c68b
+++ b/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c
e3c68b
@@ -1961,9 +1961,7 @@ glusterd_update_snaps_synctask(void *opaque)
e3c68b
     synclock_lock(&conf->big_lock);
e3c68b
 
e3c68b
     while (conf->restart_bricks) {
e3c68b
-        synclock_unlock(&conf->big_lock);
e3c68b
-        sleep(2);
e3c68b
-        synclock_lock(&conf->big_lock);
e3c68b
+        synccond_wait(&conf->cond_restart_bricks, &conf->big_lock);
e3c68b
     }
e3c68b
     conf->restart_bricks = _gf_true;
e3c68b
 
e3c68b
@@ -2070,6 +2068,7 @@ out:
e3c68b
     if (dict)
e3c68b
         dict_unref(dict);
e3c68b
     conf->restart_bricks = _gf_false;
e3c68b
+    synccond_broadcast(&conf->cond_restart_bricks);
e3c68b
 
e3c68b
     return ret;
e3c68b
 }
e3c68b
diff --git a/xlators/mgmt/glusterd/src/glusterd-syncop.h b/xlators/mgmt/glusterd/src/glusterd-syncop.h
e3c68b
index ce4a940..a265f21 100644
e3c68b
--- a/xlators/mgmt/glusterd/src/glusterd-syncop.h
e3c68b
+++ b/xlators/mgmt/glusterd/src/glusterd-syncop.h
e3c68b
@@ -32,7 +32,7 @@
e3c68b
         ret = gd_syncop_submit_request(rpc, req, stb, cookie, prog, procnum,   \
e3c68b
                                        cbk, (xdrproc_t)xdrproc);               \
e3c68b
         if (!ret)                                                              \
e3c68b
-            synctask_yield(stb->task);                                         \
e3c68b
+            synctask_yield(stb->task, NULL);                                   \
e3c68b
         else                                                                   \
e3c68b
             gf_asprintf(&stb->errstr,                                          \
e3c68b
                         "%s failed. Check log file"                            \
e3c68b
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
e3c68b
index 812c698..ce9931c 100644
e3c68b
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
e3c68b
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
e3c68b
@@ -5068,22 +5068,22 @@ glusterd_import_friend_volumes_synctask(void *opaque)
e3c68b
      * restarted (refer glusterd_restart_bricks ())
e3c68b
      */
e3c68b
     while (conf->restart_bricks) {
e3c68b
-        synclock_unlock(&conf->big_lock);
e3c68b
-        sleep(2);
e3c68b
-        synclock_lock(&conf->big_lock);
e3c68b
+        synccond_wait(&conf->cond_restart_bricks, &conf->big_lock);
e3c68b
     }
e3c68b
     conf->restart_bricks = _gf_true;
e3c68b
 
e3c68b
     while (i <= count) {
e3c68b
         ret = glusterd_import_friend_volume(peer_data, i);
e3c68b
         if (ret) {
e3c68b
-            conf->restart_bricks = _gf_false;
e3c68b
-            goto out;
e3c68b
+            break;
e3c68b
         }
e3c68b
         i++;
e3c68b
     }
e3c68b
-    glusterd_svcs_manager(NULL);
e3c68b
+    if (i > count) {
e3c68b
+        glusterd_svcs_manager(NULL);
e3c68b
+    }
e3c68b
     conf->restart_bricks = _gf_false;
e3c68b
+    synccond_broadcast(&conf->cond_restart_bricks);
e3c68b
 out:
e3c68b
     if (peer_data)
e3c68b
         dict_unref(peer_data);
e3c68b
@@ -5769,7 +5769,9 @@ my_callback(struct rpc_req *req, struct iovec *iov, int count, void *v_frame)
e3c68b
     call_frame_t *frame = v_frame;
e3c68b
     glusterd_conf_t *conf = frame->this->private;
e3c68b
 
e3c68b
-    GF_ATOMIC_DEC(conf->blockers);
e3c68b
+    if (GF_ATOMIC_DEC(conf->blockers) == 0) {
e3c68b
+        synccond_broadcast(&conf->cond_blockers);
e3c68b
+    }
e3c68b
 
e3c68b
     STACK_DESTROY(frame->root);
e3c68b
     return 0;
e3c68b
@@ -5865,7 +5867,9 @@ attach_brick_callback(struct rpc_req *req, struct iovec *iov, int count,
e3c68b
         }
e3c68b
     }
e3c68b
 out:
e3c68b
-    GF_ATOMIC_DEC(conf->blockers);
e3c68b
+    if (GF_ATOMIC_DEC(conf->blockers) == 0) {
e3c68b
+        synccond_broadcast(&conf->cond_blockers);
e3c68b
+    }
e3c68b
     STACK_DESTROY(frame->root);
e3c68b
     return 0;
e3c68b
 }
e3c68b
@@ -6053,7 +6057,7 @@ attach_brick(xlator_t *this, glusterd_brickinfo_t *brickinfo,
e3c68b
          * TBD: see if there's a better way
e3c68b
          */
e3c68b
         synclock_unlock(&conf->big_lock);
e3c68b
-        sleep(1);
e3c68b
+        synctask_sleep(1);
e3c68b
         synclock_lock(&conf->big_lock);
e3c68b
     }
e3c68b
 
e3c68b
@@ -6193,7 +6197,7 @@ find_compat_brick_in_vol(glusterd_conf_t *conf,
e3c68b
                          "brick %s is still"
e3c68b
                          " starting, waiting for 2 seconds ",
e3c68b
                          other_brick->path);
e3c68b
-            sleep(2);
e3c68b
+            synctask_sleep(2);
e3c68b
             synclock_lock(&conf->big_lock);
e3c68b
             retries--;
e3c68b
         }
e3c68b
@@ -6680,9 +6684,7 @@ glusterd_restart_bricks(void *opaque)
e3c68b
      * glusterd_compare_friend_data ())
e3c68b
      */
e3c68b
     while (conf->restart_bricks) {
e3c68b
-        synclock_unlock(&conf->big_lock);
e3c68b
-        sleep(2);
e3c68b
-        synclock_lock(&conf->big_lock);
e3c68b
+        synccond_wait(&conf->cond_restart_bricks, &conf->big_lock);
e3c68b
     }
e3c68b
     conf->restart_bricks = _gf_true;
e3c68b
 
e3c68b
@@ -6798,6 +6800,7 @@ out:
e3c68b
     GF_ATOMIC_DEC(conf->blockers);
e3c68b
     conf->restart_done = _gf_true;
e3c68b
     conf->restart_bricks = _gf_false;
e3c68b
+    synccond_broadcast(&conf->cond_restart_bricks);
e3c68b
 
e3c68b
 return_block:
e3c68b
     return ret;
e3c68b
diff --git a/xlators/mgmt/glusterd/src/glusterd.c b/xlators/mgmt/glusterd/src/glusterd.c
e3c68b
index d360312..a01034a 100644
e3c68b
--- a/xlators/mgmt/glusterd/src/glusterd.c
e3c68b
+++ b/xlators/mgmt/glusterd/src/glusterd.c
e3c68b
@@ -1845,6 +1845,8 @@ init(xlator_t *this)
e3c68b
     (void)strncpy(conf->rundir, rundir, sizeof(conf->rundir));
e3c68b
 
e3c68b
     synclock_init(&conf->big_lock, SYNC_LOCK_RECURSIVE);
e3c68b
+    synccond_init(&conf->cond_restart_bricks);
e3c68b
+    synccond_init(&conf->cond_blockers);
e3c68b
     pthread_mutex_init(&conf->xprt_lock, NULL);
e3c68b
     INIT_LIST_HEAD(&conf->xprt_list);
e3c68b
     pthread_mutex_init(&conf->import_volumes, NULL);
e3c68b
diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h
e3c68b
index 2be005c..1c6c3b1 100644
e3c68b
--- a/xlators/mgmt/glusterd/src/glusterd.h
e3c68b
+++ b/xlators/mgmt/glusterd/src/glusterd.h
e3c68b
@@ -209,6 +209,8 @@ typedef struct {
e3c68b
     dict_t *opts;
e3c68b
     synclock_t big_lock;
e3c68b
     gf_boolean_t restart_done;
e3c68b
+    synccond_t cond_restart_bricks;
e3c68b
+    synccond_t cond_blockers;
e3c68b
     rpcsvc_t *uds_rpc; /* RPCSVC for the unix domain socket */
e3c68b
     uint32_t base_port;
e3c68b
     uint32_t max_port;
e3c68b
-- 
e3c68b
1.8.3.1
e3c68b