1df6c8
From 66600fb55522d405a68d7340a5680a2633c4237e Mon Sep 17 00:00:00 2001
1df6c8
From: Xavi Hernandez <xhernandez@redhat.com>
1df6c8
Date: Thu, 30 Apr 2020 11:19:01 +0200
1df6c8
Subject: [PATCH 377/379] syncop: improve scaling and implement more tools
1df6c8
1df6c8
The current scaling of the syncop thread pool is not working properly
1df6c8
and can leave some tasks in the run queue more time than necessary
1df6c8
when the maximum number of threads is not reached.
1df6c8
1df6c8
This patch provides a better scaling condition to react faster to
1df6c8
pending work.
1df6c8
1df6c8
Condition variables and sleep in the context of a synctask have also
1df6c8
been implemented. Their purpose is to replace regular condition
1df6c8
variables and sleeps that block synctask threads and prevent other
1df6c8
tasks to be executed.
1df6c8
1df6c8
The new features have been applied to several places in glusterd.
1df6c8
1df6c8
upstream patch: https://review.gluster.org/#/c/glusterfs/+/24396/
1df6c8
1df6c8
> Change-Id: Ic50b7c73c104f9e41f08101a357d30b95efccfbf
1df6c8
> Fixes: #1116
1df6c8
> Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
1df6c8
1df6c8
Change-Id: Ic50b7c73c104f9e41f08101a357d30b95efccfbf
1df6c8
BUG: 1810516
1df6c8
Signed-off-by: Sanju Rakonde <srakonde@redhta.com>
1df6c8
Reviewed-on: https://code.engineering.redhat.com/gerrit/200409
1df6c8
Tested-by: Sanju Rakonde <srakonde@redhat.com>
1df6c8
Tested-by: RHGS Build Bot <nigelb@redhat.com>
1df6c8
Reviewed-by: Xavi Hernandez Juan <xhernandez@redhat.com>
1df6c8
---
1df6c8
 libglusterfs/src/glusterfs/syncop.h                |  52 +++-
1df6c8
 libglusterfs/src/libglusterfs.sym                  |   7 +
1df6c8
 libglusterfs/src/syncop.c                          | 306 ++++++++++++++++-----
1df6c8
 xlators/cluster/dht/src/dht-rebalance.c            |   2 +-
1df6c8
 xlators/mgmt/glusterd/src/glusterd-op-sm.c         |   9 +-
1df6c8
 xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c     |   2 +-
1df6c8
 .../mgmt/glusterd/src/glusterd-snapshot-utils.c    |   5 +-
1df6c8
 xlators/mgmt/glusterd/src/glusterd-syncop.h        |   2 +-
1df6c8
 xlators/mgmt/glusterd/src/glusterd-utils.c         |  29 +-
1df6c8
 xlators/mgmt/glusterd/src/glusterd.c               |   2 +
1df6c8
 xlators/mgmt/glusterd/src/glusterd.h               |   2 +
1df6c8
 11 files changed, 317 insertions(+), 101 deletions(-)
1df6c8
1df6c8
diff --git a/libglusterfs/src/glusterfs/syncop.h b/libglusterfs/src/glusterfs/syncop.h
1df6c8
index e0f1017..3011b4c 100644
1df6c8
--- a/libglusterfs/src/glusterfs/syncop.h
1df6c8
+++ b/libglusterfs/src/glusterfs/syncop.h
1df6c8
@@ -15,6 +15,7 @@
1df6c8
 #include <sys/time.h>
1df6c8
 #include <pthread.h>
1df6c8
 #include <ucontext.h>
1df6c8
+#include "glusterfs/timer.h"
1df6c8
 
1df6c8
 #define SYNCENV_PROC_MAX 16
1df6c8
 #define SYNCENV_PROC_MIN 2
1df6c8
@@ -32,6 +33,7 @@
1df6c8
 struct synctask;
1df6c8
 struct syncproc;
1df6c8
 struct syncenv;
1df6c8
+struct synccond;
1df6c8
 
1df6c8
 typedef int (*synctask_cbk_t)(int ret, call_frame_t *frame, void *opaque);
1df6c8
 
1df6c8
@@ -55,9 +57,12 @@ struct synctask {
1df6c8
     call_frame_t *opframe;
1df6c8
     synctask_cbk_t synccbk;
1df6c8
     synctask_fn_t syncfn;
1df6c8
-    synctask_state_t state;
1df6c8
+    struct timespec *delta;
1df6c8
+    gf_timer_t *timer;
1df6c8
+    struct synccond *synccond;
1df6c8
     void *opaque;
1df6c8
     void *stack;
1df6c8
+    synctask_state_t state;
1df6c8
     int woken;
1df6c8
     int slept;
1df6c8
     int ret;
1df6c8
@@ -85,19 +90,21 @@ struct syncproc {
1df6c8
 /* hosts the scheduler thread and framework for executing synctasks */
1df6c8
 struct syncenv {
1df6c8
     struct syncproc proc[SYNCENV_PROC_MAX];
1df6c8
-    int procs;
1df6c8
+
1df6c8
+    pthread_mutex_t mutex;
1df6c8
+    pthread_cond_t cond;
1df6c8
 
1df6c8
     struct list_head runq;
1df6c8
-    int runcount;
1df6c8
     struct list_head waitq;
1df6c8
-    int waitcount;
1df6c8
+
1df6c8
+    int procs;
1df6c8
+    int procs_idle;
1df6c8
+
1df6c8
+    int runcount;
1df6c8
 
1df6c8
     int procmin;
1df6c8
     int procmax;
1df6c8
 
1df6c8
-    pthread_mutex_t mutex;
1df6c8
-    pthread_cond_t cond;
1df6c8
-
1df6c8
     size_t stacksize;
1df6c8
 
1df6c8
     int destroy; /* FLAG to mark syncenv is in destroy mode
1df6c8
@@ -123,6 +130,13 @@ struct synclock {
1df6c8
 };
1df6c8
 typedef struct synclock synclock_t;
1df6c8
 
1df6c8
+struct synccond {
1df6c8
+    pthread_mutex_t pmutex;
1df6c8
+    pthread_cond_t pcond;
1df6c8
+    struct list_head waitq;
1df6c8
+};
1df6c8
+typedef struct synccond synccond_t;
1df6c8
+
1df6c8
 struct syncbarrier {
1df6c8
     gf_boolean_t initialized; /*Set on successful initialization*/
1df6c8
     pthread_mutex_t guard;    /* guard the remaining members, pair @cond */
1df6c8
@@ -219,7 +233,7 @@ struct syncopctx {
1df6c8
 #define __yield(args)                                                          \
1df6c8
     do {                                                                       \
1df6c8
         if (args->task) {                                                      \
1df6c8
-            synctask_yield(args->task);                                        \
1df6c8
+            synctask_yield(args->task, NULL);                                  \
1df6c8
         } else {                                                               \
1df6c8
             pthread_mutex_lock(&args->mutex);                                  \
1df6c8
             {                                                                  \
1df6c8
@@ -307,7 +321,9 @@ synctask_join(struct synctask *task);
1df6c8
 void
1df6c8
 synctask_wake(struct synctask *task);
1df6c8
 void
1df6c8
-synctask_yield(struct synctask *task);
1df6c8
+synctask_yield(struct synctask *task, struct timespec *delta);
1df6c8
+void
1df6c8
+synctask_sleep(int32_t secs);
1df6c8
 void
1df6c8
 synctask_waitfor(struct synctask *task, int count);
1df6c8
 
1df6c8
@@ -405,6 +421,24 @@ synclock_trylock(synclock_t *lock);
1df6c8
 int
1df6c8
 synclock_unlock(synclock_t *lock);
1df6c8
 
1df6c8
+int32_t
1df6c8
+synccond_init(synccond_t *cond);
1df6c8
+
1df6c8
+void
1df6c8
+synccond_destroy(synccond_t *cond);
1df6c8
+
1df6c8
+int
1df6c8
+synccond_wait(synccond_t *cond, synclock_t *lock);
1df6c8
+
1df6c8
+int
1df6c8
+synccond_timedwait(synccond_t *cond, synclock_t *lock, struct timespec *delta);
1df6c8
+
1df6c8
+void
1df6c8
+synccond_signal(synccond_t *cond);
1df6c8
+
1df6c8
+void
1df6c8
+synccond_broadcast(synccond_t *cond);
1df6c8
+
1df6c8
 int
1df6c8
 syncbarrier_init(syncbarrier_t *barrier);
1df6c8
 int
1df6c8
diff --git a/libglusterfs/src/libglusterfs.sym b/libglusterfs/src/libglusterfs.sym
1df6c8
index 467a1b7..5a721e0 100644
1df6c8
--- a/libglusterfs/src/libglusterfs.sym
1df6c8
+++ b/libglusterfs/src/libglusterfs.sym
1df6c8
@@ -938,6 +938,12 @@ syncbarrier_destroy
1df6c8
 syncbarrier_init
1df6c8
 syncbarrier_wait
1df6c8
 syncbarrier_wake
1df6c8
+synccond_init
1df6c8
+synccond_destroy
1df6c8
+synccond_wait
1df6c8
+synccond_timedwait
1df6c8
+synccond_signal
1df6c8
+synccond_broadcast
1df6c8
 syncenv_destroy
1df6c8
 syncenv_new
1df6c8
 synclock_destroy
1df6c8
@@ -1015,6 +1021,7 @@ synctask_new
1df6c8
 synctask_new1
1df6c8
 synctask_set
1df6c8
 synctask_setid
1df6c8
+synctask_sleep
1df6c8
 synctask_wake
1df6c8
 synctask_yield
1df6c8
 sys_access
1df6c8
diff --git a/libglusterfs/src/syncop.c b/libglusterfs/src/syncop.c
1df6c8
index 693970f..71d37b7 100644
1df6c8
--- a/libglusterfs/src/syncop.c
1df6c8
+++ b/libglusterfs/src/syncop.c
1df6c8
@@ -154,10 +154,14 @@ out:
1df6c8
     return ret;
1df6c8
 }
1df6c8
 
1df6c8
+void *
1df6c8
+syncenv_processor(void *thdata);
1df6c8
+
1df6c8
 static void
1df6c8
 __run(struct synctask *task)
1df6c8
 {
1df6c8
     struct syncenv *env = NULL;
1df6c8
+    int32_t total, ret, i;
1df6c8
 
1df6c8
     env = task->env;
1df6c8
 
1df6c8
@@ -173,7 +177,6 @@ __run(struct synctask *task)
1df6c8
             env->runcount--;
1df6c8
             break;
1df6c8
         case SYNCTASK_WAIT:
1df6c8
-            env->waitcount--;
1df6c8
             break;
1df6c8
         case SYNCTASK_DONE:
1df6c8
             gf_msg(task->xl->name, GF_LOG_WARNING, 0, LG_MSG_COMPLETED_TASK,
1df6c8
@@ -187,8 +190,27 @@ __run(struct synctask *task)
1df6c8
     }
1df6c8
 
1df6c8
     list_add_tail(&task->all_tasks, &env->runq);
1df6c8
-    env->runcount++;
1df6c8
     task->state = SYNCTASK_RUN;
1df6c8
+
1df6c8
+    env->runcount++;
1df6c8
+
1df6c8
+    total = env->procs + env->runcount - env->procs_idle;
1df6c8
+    if (total > env->procmax) {
1df6c8
+        total = env->procmax;
1df6c8
+    }
1df6c8
+    if (total > env->procs) {
1df6c8
+        for (i = 0; i < env->procmax; i++) {
1df6c8
+            if (env->proc[i].env == NULL) {
1df6c8
+                env->proc[i].env = env;
1df6c8
+                ret = gf_thread_create(&env->proc[i].processor, NULL,
1df6c8
+                                       syncenv_processor, &env->proc[i],
1df6c8
+                                       "sproc%d", i);
1df6c8
+                if ((ret < 0) || (++env->procs >= total)) {
1df6c8
+                    break;
1df6c8
+                }
1df6c8
+            }
1df6c8
+        }
1df6c8
+    }
1df6c8
 }
1df6c8
 
1df6c8
 static void
1df6c8
@@ -210,7 +232,6 @@ __wait(struct synctask *task)
1df6c8
             gf_msg(task->xl->name, GF_LOG_WARNING, 0, LG_MSG_REWAITING_TASK,
1df6c8
                    "re-waiting already waiting "
1df6c8
                    "task");
1df6c8
-            env->waitcount--;
1df6c8
             break;
1df6c8
         case SYNCTASK_DONE:
1df6c8
             gf_msg(task->xl->name, GF_LOG_WARNING, 0, LG_MSG_COMPLETED_TASK,
1df6c8
@@ -223,12 +244,11 @@ __wait(struct synctask *task)
1df6c8
     }
1df6c8
 
1df6c8
     list_add_tail(&task->all_tasks, &env->waitq);
1df6c8
-    env->waitcount++;
1df6c8
     task->state = SYNCTASK_WAIT;
1df6c8
 }
1df6c8
 
1df6c8
 void
1df6c8
-synctask_yield(struct synctask *task)
1df6c8
+synctask_yield(struct synctask *task, struct timespec *delta)
1df6c8
 {
1df6c8
     xlator_t *oldTHIS = THIS;
1df6c8
 
1df6c8
@@ -237,6 +257,8 @@ synctask_yield(struct synctask *task)
1df6c8
     task->proc->sched.uc_flags &= ~_UC_TLSBASE;
1df6c8
 #endif
1df6c8
 
1df6c8
+    task->delta = delta;
1df6c8
+
1df6c8
     if (task->state != SYNCTASK_DONE) {
1df6c8
         task->state = SYNCTASK_SUSPEND;
1df6c8
     }
1df6c8
@@ -249,6 +271,35 @@ synctask_yield(struct synctask *task)
1df6c8
 }
1df6c8
 
1df6c8
 void
1df6c8
+synctask_sleep(int32_t secs)
1df6c8
+{
1df6c8
+    struct timespec delta;
1df6c8
+    struct synctask *task;
1df6c8
+
1df6c8
+    task = synctask_get();
1df6c8
+
1df6c8
+    if (task == NULL) {
1df6c8
+        sleep(secs);
1df6c8
+    } else {
1df6c8
+        delta.tv_sec = secs;
1df6c8
+        delta.tv_nsec = 0;
1df6c8
+
1df6c8
+        synctask_yield(task, &delta);
1df6c8
+    }
1df6c8
+}
1df6c8
+
1df6c8
+static void
1df6c8
+__synctask_wake(struct synctask *task)
1df6c8
+{
1df6c8
+    task->woken = 1;
1df6c8
+
1df6c8
+    if (task->slept)
1df6c8
+        __run(task);
1df6c8
+
1df6c8
+    pthread_cond_broadcast(&task->env->cond);
1df6c8
+}
1df6c8
+
1df6c8
+void
1df6c8
 synctask_wake(struct synctask *task)
1df6c8
 {
1df6c8
     struct syncenv *env = NULL;
1df6c8
@@ -257,13 +308,18 @@ synctask_wake(struct synctask *task)
1df6c8
 
1df6c8
     pthread_mutex_lock(&env->mutex);
1df6c8
     {
1df6c8
-        task->woken = 1;
1df6c8
+        if (task->timer != NULL) {
1df6c8
+            if (gf_timer_call_cancel(task->xl->ctx, task->timer) != 0) {
1df6c8
+                goto unlock;
1df6c8
+            }
1df6c8
 
1df6c8
-        if (task->slept)
1df6c8
-            __run(task);
1df6c8
+            task->timer = NULL;
1df6c8
+            task->synccond = NULL;
1df6c8
+        }
1df6c8
 
1df6c8
-        pthread_cond_broadcast(&env->cond);
1df6c8
+        __synctask_wake(task);
1df6c8
     }
1df6c8
+unlock:
1df6c8
     pthread_mutex_unlock(&env->mutex);
1df6c8
 }
1df6c8
 
1df6c8
@@ -282,7 +338,7 @@ synctask_wrap(void)
1df6c8
 
1df6c8
     task->state = SYNCTASK_DONE;
1df6c8
 
1df6c8
-    synctask_yield(task);
1df6c8
+    synctask_yield(task, NULL);
1df6c8
 }
1df6c8
 
1df6c8
 void
1df6c8
@@ -422,11 +478,6 @@ synctask_create(struct syncenv *env, size_t stacksize, synctask_fn_t fn,
1df6c8
     }
1df6c8
 
1df6c8
     synctask_wake(newtask);
1df6c8
-    /*
1df6c8
-     * Make sure someone's there to execute anything we just put on the
1df6c8
-     * run queue.
1df6c8
-     */
1df6c8
-    syncenv_scale(env);
1df6c8
 
1df6c8
     return newtask;
1df6c8
 err:
1df6c8
@@ -520,8 +571,12 @@ syncenv_task(struct syncproc *proc)
1df6c8
                 goto unlock;
1df6c8
             }
1df6c8
 
1df6c8
+            env->procs_idle++;
1df6c8
+
1df6c8
             sleep_till.tv_sec = time(NULL) + SYNCPROC_IDLE_TIME;
1df6c8
             ret = pthread_cond_timedwait(&env->cond, &env->mutex, &sleep_till);
1df6c8
+
1df6c8
+            env->procs_idle--;
1df6c8
         }
1df6c8
 
1df6c8
         task = list_entry(env->runq.next, struct synctask, all_tasks);
1df6c8
@@ -540,6 +595,34 @@ unlock:
1df6c8
     return task;
1df6c8
 }
1df6c8
 
1df6c8
+static void
1df6c8
+synctask_timer(void *data)
1df6c8
+{
1df6c8
+    struct synctask *task = data;
1df6c8
+    struct synccond *cond;
1df6c8
+
1df6c8
+    cond = task->synccond;
1df6c8
+    if (cond != NULL) {
1df6c8
+        pthread_mutex_lock(&cond->pmutex);
1df6c8
+
1df6c8
+        list_del_init(&task->waitq);
1df6c8
+        task->synccond = NULL;
1df6c8
+
1df6c8
+        pthread_mutex_unlock(&cond->pmutex);
1df6c8
+
1df6c8
+        task->ret = -ETIMEDOUT;
1df6c8
+    }
1df6c8
+
1df6c8
+    pthread_mutex_lock(&task->env->mutex);
1df6c8
+
1df6c8
+    gf_timer_call_cancel(task->xl->ctx, task->timer);
1df6c8
+    task->timer = NULL;
1df6c8
+
1df6c8
+    __synctask_wake(task);
1df6c8
+
1df6c8
+    pthread_mutex_unlock(&task->env->mutex);
1df6c8
+}
1df6c8
+
1df6c8
 void
1df6c8
 synctask_switchto(struct synctask *task)
1df6c8
 {
1df6c8
@@ -572,7 +655,14 @@ synctask_switchto(struct synctask *task)
1df6c8
         } else {
1df6c8
             task->slept = 1;
1df6c8
             __wait(task);
1df6c8
+
1df6c8
+            if (task->delta != NULL) {
1df6c8
+                task->timer = gf_timer_call_after(task->xl->ctx, *task->delta,
1df6c8
+                                                  synctask_timer, task);
1df6c8
+            }
1df6c8
         }
1df6c8
+
1df6c8
+        task->delta = NULL;
1df6c8
     }
1df6c8
     pthread_mutex_unlock(&env->mutex);
1df6c8
 }
1df6c8
@@ -580,65 +670,18 @@ synctask_switchto(struct synctask *task)
1df6c8
 void *
1df6c8
 syncenv_processor(void *thdata)
1df6c8
 {
1df6c8
-    struct syncenv *env = NULL;
1df6c8
     struct syncproc *proc = NULL;
1df6c8
     struct synctask *task = NULL;
1df6c8
 
1df6c8
     proc = thdata;
1df6c8
-    env = proc->env;
1df6c8
-
1df6c8
-    for (;;) {
1df6c8
-        task = syncenv_task(proc);
1df6c8
-        if (!task)
1df6c8
-            break;
1df6c8
 
1df6c8
+    while ((task = syncenv_task(proc)) != NULL) {
1df6c8
         synctask_switchto(task);
1df6c8
-
1df6c8
-        syncenv_scale(env);
1df6c8
     }
1df6c8
 
1df6c8
     return NULL;
1df6c8
 }
1df6c8
 
1df6c8
-void
1df6c8
-syncenv_scale(struct syncenv *env)
1df6c8
-{
1df6c8
-    int diff = 0;
1df6c8
-    int scale = 0;
1df6c8
-    int i = 0;
1df6c8
-    int ret = 0;
1df6c8
-
1df6c8
-    pthread_mutex_lock(&env->mutex);
1df6c8
-    {
1df6c8
-        if (env->procs > env->runcount)
1df6c8
-            goto unlock;
1df6c8
-
1df6c8
-        scale = env->runcount;
1df6c8
-        if (scale > env->procmax)
1df6c8
-            scale = env->procmax;
1df6c8
-        if (scale > env->procs)
1df6c8
-            diff = scale - env->procs;
1df6c8
-        while (diff) {
1df6c8
-            diff--;
1df6c8
-            for (; (i < env->procmax); i++) {
1df6c8
-                if (env->proc[i].processor == 0)
1df6c8
-                    break;
1df6c8
-            }
1df6c8
-
1df6c8
-            env->proc[i].env = env;
1df6c8
-            ret = gf_thread_create(&env->proc[i].processor, NULL,
1df6c8
-                                   syncenv_processor, &env->proc[i],
1df6c8
-                                   "sproc%03hx", env->procs & 0x3ff);
1df6c8
-            if (ret)
1df6c8
-                break;
1df6c8
-            env->procs++;
1df6c8
-            i++;
1df6c8
-        }
1df6c8
-    }
1df6c8
-unlock:
1df6c8
-    pthread_mutex_unlock(&env->mutex);
1df6c8
-}
1df6c8
-
1df6c8
 /* The syncenv threads are cleaned up in this routine.
1df6c8
  */
1df6c8
 void
1df6c8
@@ -715,12 +758,13 @@ syncenv_new(size_t stacksize, int procmin, int procmax)
1df6c8
         newenv->stacksize = stacksize;
1df6c8
     newenv->procmin = procmin;
1df6c8
     newenv->procmax = procmax;
1df6c8
+    newenv->procs_idle = 0;
1df6c8
 
1df6c8
     for (i = 0; i < newenv->procmin; i++) {
1df6c8
         newenv->proc[i].env = newenv;
1df6c8
         ret = gf_thread_create(&newenv->proc[i].processor, NULL,
1df6c8
                                syncenv_processor, &newenv->proc[i], "sproc%d",
1df6c8
-                               newenv->procs);
1df6c8
+                               i);
1df6c8
         if (ret)
1df6c8
             break;
1df6c8
         newenv->procs++;
1df6c8
@@ -810,7 +854,7 @@ __synclock_lock(struct synclock *lock)
1df6c8
             task->woken = 0;
1df6c8
             list_add_tail(&task->waitq, &lock->waitq);
1df6c8
             pthread_mutex_unlock(&lock->guard);
1df6c8
-            synctask_yield(task);
1df6c8
+            synctask_yield(task, NULL);
1df6c8
             /* task is removed from waitq in unlock,
1df6c8
              * under lock->guard.*/
1df6c8
             pthread_mutex_lock(&lock->guard);
1df6c8
@@ -963,6 +1007,136 @@ synclock_unlock(synclock_t *lock)
1df6c8
     return ret;
1df6c8
 }
1df6c8
 
1df6c8
+/* Condition variables */
1df6c8
+
1df6c8
+int32_t
1df6c8
+synccond_init(synccond_t *cond)
1df6c8
+{
1df6c8
+    int32_t ret;
1df6c8
+
1df6c8
+    INIT_LIST_HEAD(&cond->waitq);
1df6c8
+
1df6c8
+    ret = pthread_mutex_init(&cond->pmutex, NULL);
1df6c8
+    if (ret != 0) {
1df6c8
+        return -ret;
1df6c8
+    }
1df6c8
+
1df6c8
+    ret = pthread_cond_init(&cond->pcond, NULL);
1df6c8
+    if (ret != 0) {
1df6c8
+        pthread_mutex_destroy(&cond->pmutex);
1df6c8
+    }
1df6c8
+
1df6c8
+    return -ret;
1df6c8
+}
1df6c8
+
1df6c8
+void
1df6c8
+synccond_destroy(synccond_t *cond)
1df6c8
+{
1df6c8
+    pthread_cond_destroy(&cond->pcond);
1df6c8
+    pthread_mutex_destroy(&cond->pmutex);
1df6c8
+}
1df6c8
+
1df6c8
+int
1df6c8
+synccond_timedwait(synccond_t *cond, synclock_t *lock, struct timespec *delta)
1df6c8
+{
1df6c8
+    struct timespec now;
1df6c8
+    struct synctask *task = NULL;
1df6c8
+    int ret;
1df6c8
+
1df6c8
+    task = synctask_get();
1df6c8
+
1df6c8
+    if (task == NULL) {
1df6c8
+        if (delta != NULL) {
1df6c8
+            timespec_now_realtime(&now;;
1df6c8
+            timespec_adjust_delta(&now, *delta);
1df6c8
+        }
1df6c8
+
1df6c8
+        pthread_mutex_lock(&cond->pmutex);
1df6c8
+
1df6c8
+        if (delta == NULL) {
1df6c8
+            ret = -pthread_cond_wait(&cond->pcond, &cond->pmutex);
1df6c8
+        } else {
1df6c8
+            ret = -pthread_cond_timedwait(&cond->pcond, &cond->pmutex, &now;;
1df6c8
+        }
1df6c8
+    } else {
1df6c8
+        pthread_mutex_lock(&cond->pmutex);
1df6c8
+
1df6c8
+        list_add_tail(&task->waitq, &cond->waitq);
1df6c8
+        task->synccond = cond;
1df6c8
+
1df6c8
+        ret = synclock_unlock(lock);
1df6c8
+        if (ret == 0) {
1df6c8
+            pthread_mutex_unlock(&cond->pmutex);
1df6c8
+
1df6c8
+            synctask_yield(task, delta);
1df6c8
+
1df6c8
+            ret = synclock_lock(lock);
1df6c8
+            if (ret == 0) {
1df6c8
+                ret = task->ret;
1df6c8
+            }
1df6c8
+            task->ret = 0;
1df6c8
+
1df6c8
+            return ret;
1df6c8
+        }
1df6c8
+
1df6c8
+        list_del_init(&task->waitq);
1df6c8
+    }
1df6c8
+
1df6c8
+    pthread_mutex_unlock(&cond->pmutex);
1df6c8
+
1df6c8
+    return ret;
1df6c8
+}
1df6c8
+
1df6c8
+int
1df6c8
+synccond_wait(synccond_t *cond, synclock_t *lock)
1df6c8
+{
1df6c8
+    return synccond_timedwait(cond, lock, NULL);
1df6c8
+}
1df6c8
+
1df6c8
+void
1df6c8
+synccond_signal(synccond_t *cond)
1df6c8
+{
1df6c8
+    struct synctask *task;
1df6c8
+
1df6c8
+    pthread_mutex_lock(&cond->pmutex);
1df6c8
+
1df6c8
+    if (!list_empty(&cond->waitq)) {
1df6c8
+        task = list_first_entry(&cond->waitq, struct synctask, waitq);
1df6c8
+        list_del_init(&task->waitq);
1df6c8
+
1df6c8
+        pthread_mutex_unlock(&cond->pmutex);
1df6c8
+
1df6c8
+        synctask_wake(task);
1df6c8
+    } else {
1df6c8
+        pthread_cond_signal(&cond->pcond);
1df6c8
+
1df6c8
+        pthread_mutex_unlock(&cond->pmutex);
1df6c8
+    }
1df6c8
+}
1df6c8
+
1df6c8
+void
1df6c8
+synccond_broadcast(synccond_t *cond)
1df6c8
+{
1df6c8
+    struct list_head list;
1df6c8
+    struct synctask *task;
1df6c8
+
1df6c8
+    INIT_LIST_HEAD(&list);
1df6c8
+
1df6c8
+    pthread_mutex_lock(&cond->pmutex);
1df6c8
+
1df6c8
+    list_splice_init(&cond->waitq, &list);
1df6c8
+    pthread_cond_broadcast(&cond->pcond);
1df6c8
+
1df6c8
+    pthread_mutex_unlock(&cond->pmutex);
1df6c8
+
1df6c8
+    while (!list_empty(&list)) {
1df6c8
+        task = list_first_entry(&list, struct synctask, waitq);
1df6c8
+        list_del_init(&task->waitq);
1df6c8
+
1df6c8
+        synctask_wake(task);
1df6c8
+    }
1df6c8
+}
1df6c8
+
1df6c8
 /* Barriers */
1df6c8
 
1df6c8
 int
1df6c8
@@ -1032,7 +1206,7 @@ __syncbarrier_wait(struct syncbarrier *barrier, int waitfor)
1df6c8
             /* called within a synctask */
1df6c8
             list_add_tail(&task->waitq, &barrier->waitq);
1df6c8
             pthread_mutex_unlock(&barrier->guard);
1df6c8
-            synctask_yield(task);
1df6c8
+            synctask_yield(task, NULL);
1df6c8
             pthread_mutex_lock(&barrier->guard);
1df6c8
         } else {
1df6c8
             /* called by a non-synctask */
1df6c8
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
1df6c8
index c692119..957deaa 100644
1df6c8
--- a/xlators/cluster/dht/src/dht-rebalance.c
1df6c8
+++ b/xlators/cluster/dht/src/dht-rebalance.c
1df6c8
@@ -5224,7 +5224,7 @@ gf_defrag_pause_tier(xlator_t *this, gf_defrag_info_t *defrag)
1df6c8
     defrag->tier_conf.pause_timer = gf_timer_call_after(
1df6c8
         this->ctx, delta, gf_defrag_pause_tier_timeout, this);
1df6c8
 
1df6c8
-    synctask_yield(defrag->tier_conf.pause_synctask);
1df6c8
+    synctask_yield(defrag->tier_conf.pause_synctask, NULL);
1df6c8
 
1df6c8
     if (gf_defrag_get_pause_state(&defrag->tier_conf) == TIER_PAUSED)
1df6c8
         goto out;
1df6c8
diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
1df6c8
index 0d29de2..6475611 100644
1df6c8
--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c
1df6c8
+++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
1df6c8
@@ -6076,13 +6076,8 @@ glusterd_op_stage_validate(glusterd_op_t op, dict_t *dict, char **op_errstr,
1df6c8
 static void
1df6c8
 glusterd_wait_for_blockers(glusterd_conf_t *priv)
1df6c8
 {
1df6c8
-    uint64_t blockers = GF_ATOMIC_GET(priv->blockers);
1df6c8
-
1df6c8
-    while (blockers) {
1df6c8
-        synclock_unlock(&priv->big_lock);
1df6c8
-        sleep(1);
1df6c8
-        blockers = GF_ATOMIC_GET(priv->blockers);
1df6c8
-        synclock_lock(&priv->big_lock);
1df6c8
+    while (GF_ATOMIC_GET(priv->blockers)) {
1df6c8
+        synccond_wait(&priv->cond_blockers, &priv->big_lock);
1df6c8
     }
1df6c8
 }
1df6c8
 
1df6c8
diff --git a/xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c b/xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c
1df6c8
index 36018a0..f55a5fd 100644
1df6c8
--- a/xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c
1df6c8
+++ b/xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c
1df6c8
@@ -112,7 +112,7 @@ glusterd_proc_stop(glusterd_proc_t *proc, int sig, int flags)
1df6c8
         goto out;
1df6c8
 
1df6c8
     synclock_unlock(&conf->big_lock);
1df6c8
-    sleep(1);
1df6c8
+    synctask_sleep(1);
1df6c8
     synclock_lock(&conf->big_lock);
1df6c8
     if (gf_is_service_running(proc->pidfile, &pid)) {
1df6c8
         ret = kill(pid, SIGKILL);
1df6c8
diff --git a/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c b/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c
1df6c8
index d225854..386eed2 100644
1df6c8
--- a/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c
1df6c8
+++ b/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c
1df6c8
@@ -1961,9 +1961,7 @@ glusterd_update_snaps_synctask(void *opaque)
1df6c8
     synclock_lock(&conf->big_lock);
1df6c8
 
1df6c8
     while (conf->restart_bricks) {
1df6c8
-        synclock_unlock(&conf->big_lock);
1df6c8
-        sleep(2);
1df6c8
-        synclock_lock(&conf->big_lock);
1df6c8
+        synccond_wait(&conf->cond_restart_bricks, &conf->big_lock);
1df6c8
     }
1df6c8
     conf->restart_bricks = _gf_true;
1df6c8
 
1df6c8
@@ -2070,6 +2068,7 @@ out:
1df6c8
     if (dict)
1df6c8
         dict_unref(dict);
1df6c8
     conf->restart_bricks = _gf_false;
1df6c8
+    synccond_broadcast(&conf->cond_restart_bricks);
1df6c8
 
1df6c8
     return ret;
1df6c8
 }
1df6c8
diff --git a/xlators/mgmt/glusterd/src/glusterd-syncop.h b/xlators/mgmt/glusterd/src/glusterd-syncop.h
1df6c8
index ce4a940..a265f21 100644
1df6c8
--- a/xlators/mgmt/glusterd/src/glusterd-syncop.h
1df6c8
+++ b/xlators/mgmt/glusterd/src/glusterd-syncop.h
1df6c8
@@ -32,7 +32,7 @@
1df6c8
         ret = gd_syncop_submit_request(rpc, req, stb, cookie, prog, procnum,   \
1df6c8
                                        cbk, (xdrproc_t)xdrproc);               \
1df6c8
         if (!ret)                                                              \
1df6c8
-            synctask_yield(stb->task);                                         \
1df6c8
+            synctask_yield(stb->task, NULL);                                   \
1df6c8
         else                                                                   \
1df6c8
             gf_asprintf(&stb->errstr,                                          \
1df6c8
                         "%s failed. Check log file"                            \
1df6c8
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
1df6c8
index 812c698..ce9931c 100644
1df6c8
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
1df6c8
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
1df6c8
@@ -5068,22 +5068,22 @@ glusterd_import_friend_volumes_synctask(void *opaque)
1df6c8
      * restarted (refer glusterd_restart_bricks ())
1df6c8
      */
1df6c8
     while (conf->restart_bricks) {
1df6c8
-        synclock_unlock(&conf->big_lock);
1df6c8
-        sleep(2);
1df6c8
-        synclock_lock(&conf->big_lock);
1df6c8
+        synccond_wait(&conf->cond_restart_bricks, &conf->big_lock);
1df6c8
     }
1df6c8
     conf->restart_bricks = _gf_true;
1df6c8
 
1df6c8
     while (i <= count) {
1df6c8
         ret = glusterd_import_friend_volume(peer_data, i);
1df6c8
         if (ret) {
1df6c8
-            conf->restart_bricks = _gf_false;
1df6c8
-            goto out;
1df6c8
+            break;
1df6c8
         }
1df6c8
         i++;
1df6c8
     }
1df6c8
-    glusterd_svcs_manager(NULL);
1df6c8
+    if (i > count) {
1df6c8
+        glusterd_svcs_manager(NULL);
1df6c8
+    }
1df6c8
     conf->restart_bricks = _gf_false;
1df6c8
+    synccond_broadcast(&conf->cond_restart_bricks);
1df6c8
 out:
1df6c8
     if (peer_data)
1df6c8
         dict_unref(peer_data);
1df6c8
@@ -5769,7 +5769,9 @@ my_callback(struct rpc_req *req, struct iovec *iov, int count, void *v_frame)
1df6c8
     call_frame_t *frame = v_frame;
1df6c8
     glusterd_conf_t *conf = frame->this->private;
1df6c8
 
1df6c8
-    GF_ATOMIC_DEC(conf->blockers);
1df6c8
+    if (GF_ATOMIC_DEC(conf->blockers) == 0) {
1df6c8
+        synccond_broadcast(&conf->cond_blockers);
1df6c8
+    }
1df6c8
 
1df6c8
     STACK_DESTROY(frame->root);
1df6c8
     return 0;
1df6c8
@@ -5865,7 +5867,9 @@ attach_brick_callback(struct rpc_req *req, struct iovec *iov, int count,
1df6c8
         }
1df6c8
     }
1df6c8
 out:
1df6c8
-    GF_ATOMIC_DEC(conf->blockers);
1df6c8
+    if (GF_ATOMIC_DEC(conf->blockers) == 0) {
1df6c8
+        synccond_broadcast(&conf->cond_blockers);
1df6c8
+    }
1df6c8
     STACK_DESTROY(frame->root);
1df6c8
     return 0;
1df6c8
 }
1df6c8
@@ -6053,7 +6057,7 @@ attach_brick(xlator_t *this, glusterd_brickinfo_t *brickinfo,
1df6c8
          * TBD: see if there's a better way
1df6c8
          */
1df6c8
         synclock_unlock(&conf->big_lock);
1df6c8
-        sleep(1);
1df6c8
+        synctask_sleep(1);
1df6c8
         synclock_lock(&conf->big_lock);
1df6c8
     }
1df6c8
 
1df6c8
@@ -6193,7 +6197,7 @@ find_compat_brick_in_vol(glusterd_conf_t *conf,
1df6c8
                          "brick %s is still"
1df6c8
                          " starting, waiting for 2 seconds ",
1df6c8
                          other_brick->path);
1df6c8
-            sleep(2);
1df6c8
+            synctask_sleep(2);
1df6c8
             synclock_lock(&conf->big_lock);
1df6c8
             retries--;
1df6c8
         }
1df6c8
@@ -6680,9 +6684,7 @@ glusterd_restart_bricks(void *opaque)
1df6c8
      * glusterd_compare_friend_data ())
1df6c8
      */
1df6c8
     while (conf->restart_bricks) {
1df6c8
-        synclock_unlock(&conf->big_lock);
1df6c8
-        sleep(2);
1df6c8
-        synclock_lock(&conf->big_lock);
1df6c8
+        synccond_wait(&conf->cond_restart_bricks, &conf->big_lock);
1df6c8
     }
1df6c8
     conf->restart_bricks = _gf_true;
1df6c8
 
1df6c8
@@ -6798,6 +6800,7 @@ out:
1df6c8
     GF_ATOMIC_DEC(conf->blockers);
1df6c8
     conf->restart_done = _gf_true;
1df6c8
     conf->restart_bricks = _gf_false;
1df6c8
+    synccond_broadcast(&conf->cond_restart_bricks);
1df6c8
 
1df6c8
 return_block:
1df6c8
     return ret;
1df6c8
diff --git a/xlators/mgmt/glusterd/src/glusterd.c b/xlators/mgmt/glusterd/src/glusterd.c
1df6c8
index d360312..a01034a 100644
1df6c8
--- a/xlators/mgmt/glusterd/src/glusterd.c
1df6c8
+++ b/xlators/mgmt/glusterd/src/glusterd.c
1df6c8
@@ -1845,6 +1845,8 @@ init(xlator_t *this)
1df6c8
     (void)strncpy(conf->rundir, rundir, sizeof(conf->rundir));
1df6c8
 
1df6c8
     synclock_init(&conf->big_lock, SYNC_LOCK_RECURSIVE);
1df6c8
+    synccond_init(&conf->cond_restart_bricks);
1df6c8
+    synccond_init(&conf->cond_blockers);
1df6c8
     pthread_mutex_init(&conf->xprt_lock, NULL);
1df6c8
     INIT_LIST_HEAD(&conf->xprt_list);
1df6c8
     pthread_mutex_init(&conf->import_volumes, NULL);
1df6c8
diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h
1df6c8
index 2be005c..1c6c3b1 100644
1df6c8
--- a/xlators/mgmt/glusterd/src/glusterd.h
1df6c8
+++ b/xlators/mgmt/glusterd/src/glusterd.h
1df6c8
@@ -209,6 +209,8 @@ typedef struct {
1df6c8
     dict_t *opts;
1df6c8
     synclock_t big_lock;
1df6c8
     gf_boolean_t restart_done;
1df6c8
+    synccond_t cond_restart_bricks;
1df6c8
+    synccond_t cond_blockers;
1df6c8
     rpcsvc_t *uds_rpc; /* RPCSVC for the unix domain socket */
1df6c8
     uint32_t base_port;
1df6c8
     uint32_t max_port;
1df6c8
-- 
1df6c8
1.8.3.1
1df6c8