17b94a
From 66600fb55522d405a68d7340a5680a2633c4237e Mon Sep 17 00:00:00 2001
17b94a
From: Xavi Hernandez <xhernandez@redhat.com>
17b94a
Date: Thu, 30 Apr 2020 11:19:01 +0200
17b94a
Subject: [PATCH 377/379] syncop: improve scaling and implement more tools
17b94a
17b94a
The current scaling of the syncop thread pool is not working properly
17b94a
and can leave some tasks in the run queue more time than necessary
17b94a
when the maximum number of threads is not reached.
17b94a
17b94a
This patch provides a better scaling condition to react faster to
17b94a
pending work.
17b94a
17b94a
Condition variables and sleep in the context of a synctask have also
17b94a
been implemented. Their purpose is to replace regular condition
17b94a
variables and sleeps that block synctask threads and prevent other
17b94a
tasks to be executed.
17b94a
17b94a
The new features have been applied to several places in glusterd.
17b94a
17b94a
upstream patch: https://review.gluster.org/#/c/glusterfs/+/24396/
17b94a
17b94a
> Change-Id: Ic50b7c73c104f9e41f08101a357d30b95efccfbf
17b94a
> Fixes: #1116
17b94a
> Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
17b94a
17b94a
Change-Id: Ic50b7c73c104f9e41f08101a357d30b95efccfbf
17b94a
BUG: 1810516
17b94a
Signed-off-by: Sanju Rakonde <srakonde@redhta.com>
17b94a
Reviewed-on: https://code.engineering.redhat.com/gerrit/200409
17b94a
Tested-by: Sanju Rakonde <srakonde@redhat.com>
17b94a
Tested-by: RHGS Build Bot <nigelb@redhat.com>
17b94a
Reviewed-by: Xavi Hernandez Juan <xhernandez@redhat.com>
17b94a
---
17b94a
 libglusterfs/src/glusterfs/syncop.h                |  52 +++-
17b94a
 libglusterfs/src/libglusterfs.sym                  |   7 +
17b94a
 libglusterfs/src/syncop.c                          | 306 ++++++++++++++++-----
17b94a
 xlators/cluster/dht/src/dht-rebalance.c            |   2 +-
17b94a
 xlators/mgmt/glusterd/src/glusterd-op-sm.c         |   9 +-
17b94a
 xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c     |   2 +-
17b94a
 .../mgmt/glusterd/src/glusterd-snapshot-utils.c    |   5 +-
17b94a
 xlators/mgmt/glusterd/src/glusterd-syncop.h        |   2 +-
17b94a
 xlators/mgmt/glusterd/src/glusterd-utils.c         |  29 +-
17b94a
 xlators/mgmt/glusterd/src/glusterd.c               |   2 +
17b94a
 xlators/mgmt/glusterd/src/glusterd.h               |   2 +
17b94a
 11 files changed, 317 insertions(+), 101 deletions(-)
17b94a
17b94a
diff --git a/libglusterfs/src/glusterfs/syncop.h b/libglusterfs/src/glusterfs/syncop.h
17b94a
index e0f1017..3011b4c 100644
17b94a
--- a/libglusterfs/src/glusterfs/syncop.h
17b94a
+++ b/libglusterfs/src/glusterfs/syncop.h
17b94a
@@ -15,6 +15,7 @@
17b94a
 #include <sys/time.h>
17b94a
 #include <pthread.h>
17b94a
 #include <ucontext.h>
17b94a
+#include "glusterfs/timer.h"
17b94a
 
17b94a
 #define SYNCENV_PROC_MAX 16
17b94a
 #define SYNCENV_PROC_MIN 2
17b94a
@@ -32,6 +33,7 @@
17b94a
 struct synctask;
17b94a
 struct syncproc;
17b94a
 struct syncenv;
17b94a
+struct synccond;
17b94a
 
17b94a
 typedef int (*synctask_cbk_t)(int ret, call_frame_t *frame, void *opaque);
17b94a
 
17b94a
@@ -55,9 +57,12 @@ struct synctask {
17b94a
     call_frame_t *opframe;
17b94a
     synctask_cbk_t synccbk;
17b94a
     synctask_fn_t syncfn;
17b94a
-    synctask_state_t state;
17b94a
+    struct timespec *delta;
17b94a
+    gf_timer_t *timer;
17b94a
+    struct synccond *synccond;
17b94a
     void *opaque;
17b94a
     void *stack;
17b94a
+    synctask_state_t state;
17b94a
     int woken;
17b94a
     int slept;
17b94a
     int ret;
17b94a
@@ -85,19 +90,21 @@ struct syncproc {
17b94a
 /* hosts the scheduler thread and framework for executing synctasks */
17b94a
 struct syncenv {
17b94a
     struct syncproc proc[SYNCENV_PROC_MAX];
17b94a
-    int procs;
17b94a
+
17b94a
+    pthread_mutex_t mutex;
17b94a
+    pthread_cond_t cond;
17b94a
 
17b94a
     struct list_head runq;
17b94a
-    int runcount;
17b94a
     struct list_head waitq;
17b94a
-    int waitcount;
17b94a
+
17b94a
+    int procs;
17b94a
+    int procs_idle;
17b94a
+
17b94a
+    int runcount;
17b94a
 
17b94a
     int procmin;
17b94a
     int procmax;
17b94a
 
17b94a
-    pthread_mutex_t mutex;
17b94a
-    pthread_cond_t cond;
17b94a
-
17b94a
     size_t stacksize;
17b94a
 
17b94a
     int destroy; /* FLAG to mark syncenv is in destroy mode
17b94a
@@ -123,6 +130,13 @@ struct synclock {
17b94a
 };
17b94a
 typedef struct synclock synclock_t;
17b94a
 
17b94a
+struct synccond {
17b94a
+    pthread_mutex_t pmutex;
17b94a
+    pthread_cond_t pcond;
17b94a
+    struct list_head waitq;
17b94a
+};
17b94a
+typedef struct synccond synccond_t;
17b94a
+
17b94a
 struct syncbarrier {
17b94a
     gf_boolean_t initialized; /*Set on successful initialization*/
17b94a
     pthread_mutex_t guard;    /* guard the remaining members, pair @cond */
17b94a
@@ -219,7 +233,7 @@ struct syncopctx {
17b94a
 #define __yield(args)                                                          \
17b94a
     do {                                                                       \
17b94a
         if (args->task) {                                                      \
17b94a
-            synctask_yield(args->task);                                        \
17b94a
+            synctask_yield(args->task, NULL);                                  \
17b94a
         } else {                                                               \
17b94a
             pthread_mutex_lock(&args->mutex);                                  \
17b94a
             {                                                                  \
17b94a
@@ -307,7 +321,9 @@ synctask_join(struct synctask *task);
17b94a
 void
17b94a
 synctask_wake(struct synctask *task);
17b94a
 void
17b94a
-synctask_yield(struct synctask *task);
17b94a
+synctask_yield(struct synctask *task, struct timespec *delta);
17b94a
+void
17b94a
+synctask_sleep(int32_t secs);
17b94a
 void
17b94a
 synctask_waitfor(struct synctask *task, int count);
17b94a
 
17b94a
@@ -405,6 +421,24 @@ synclock_trylock(synclock_t *lock);
17b94a
 int
17b94a
 synclock_unlock(synclock_t *lock);
17b94a
 
17b94a
+int32_t
17b94a
+synccond_init(synccond_t *cond);
17b94a
+
17b94a
+void
17b94a
+synccond_destroy(synccond_t *cond);
17b94a
+
17b94a
+int
17b94a
+synccond_wait(synccond_t *cond, synclock_t *lock);
17b94a
+
17b94a
+int
17b94a
+synccond_timedwait(synccond_t *cond, synclock_t *lock, struct timespec *delta);
17b94a
+
17b94a
+void
17b94a
+synccond_signal(synccond_t *cond);
17b94a
+
17b94a
+void
17b94a
+synccond_broadcast(synccond_t *cond);
17b94a
+
17b94a
 int
17b94a
 syncbarrier_init(syncbarrier_t *barrier);
17b94a
 int
17b94a
diff --git a/libglusterfs/src/libglusterfs.sym b/libglusterfs/src/libglusterfs.sym
17b94a
index 467a1b7..5a721e0 100644
17b94a
--- a/libglusterfs/src/libglusterfs.sym
17b94a
+++ b/libglusterfs/src/libglusterfs.sym
17b94a
@@ -938,6 +938,12 @@ syncbarrier_destroy
17b94a
 syncbarrier_init
17b94a
 syncbarrier_wait
17b94a
 syncbarrier_wake
17b94a
+synccond_init
17b94a
+synccond_destroy
17b94a
+synccond_wait
17b94a
+synccond_timedwait
17b94a
+synccond_signal
17b94a
+synccond_broadcast
17b94a
 syncenv_destroy
17b94a
 syncenv_new
17b94a
 synclock_destroy
17b94a
@@ -1015,6 +1021,7 @@ synctask_new
17b94a
 synctask_new1
17b94a
 synctask_set
17b94a
 synctask_setid
17b94a
+synctask_sleep
17b94a
 synctask_wake
17b94a
 synctask_yield
17b94a
 sys_access
17b94a
diff --git a/libglusterfs/src/syncop.c b/libglusterfs/src/syncop.c
17b94a
index 693970f..71d37b7 100644
17b94a
--- a/libglusterfs/src/syncop.c
17b94a
+++ b/libglusterfs/src/syncop.c
17b94a
@@ -154,10 +154,14 @@ out:
17b94a
     return ret;
17b94a
 }
17b94a
 
17b94a
+void *
17b94a
+syncenv_processor(void *thdata);
17b94a
+
17b94a
 static void
17b94a
 __run(struct synctask *task)
17b94a
 {
17b94a
     struct syncenv *env = NULL;
17b94a
+    int32_t total, ret, i;
17b94a
 
17b94a
     env = task->env;
17b94a
 
17b94a
@@ -173,7 +177,6 @@ __run(struct synctask *task)
17b94a
             env->runcount--;
17b94a
             break;
17b94a
         case SYNCTASK_WAIT:
17b94a
-            env->waitcount--;
17b94a
             break;
17b94a
         case SYNCTASK_DONE:
17b94a
             gf_msg(task->xl->name, GF_LOG_WARNING, 0, LG_MSG_COMPLETED_TASK,
17b94a
@@ -187,8 +190,27 @@ __run(struct synctask *task)
17b94a
     }
17b94a
 
17b94a
     list_add_tail(&task->all_tasks, &env->runq);
17b94a
-    env->runcount++;
17b94a
     task->state = SYNCTASK_RUN;
17b94a
+
17b94a
+    env->runcount++;
17b94a
+
17b94a
+    total = env->procs + env->runcount - env->procs_idle;
17b94a
+    if (total > env->procmax) {
17b94a
+        total = env->procmax;
17b94a
+    }
17b94a
+    if (total > env->procs) {
17b94a
+        for (i = 0; i < env->procmax; i++) {
17b94a
+            if (env->proc[i].env == NULL) {
17b94a
+                env->proc[i].env = env;
17b94a
+                ret = gf_thread_create(&env->proc[i].processor, NULL,
17b94a
+                                       syncenv_processor, &env->proc[i],
17b94a
+                                       "sproc%d", i);
17b94a
+                if ((ret < 0) || (++env->procs >= total)) {
17b94a
+                    break;
17b94a
+                }
17b94a
+            }
17b94a
+        }
17b94a
+    }
17b94a
 }
17b94a
 
17b94a
 static void
17b94a
@@ -210,7 +232,6 @@ __wait(struct synctask *task)
17b94a
             gf_msg(task->xl->name, GF_LOG_WARNING, 0, LG_MSG_REWAITING_TASK,
17b94a
                    "re-waiting already waiting "
17b94a
                    "task");
17b94a
-            env->waitcount--;
17b94a
             break;
17b94a
         case SYNCTASK_DONE:
17b94a
             gf_msg(task->xl->name, GF_LOG_WARNING, 0, LG_MSG_COMPLETED_TASK,
17b94a
@@ -223,12 +244,11 @@ __wait(struct synctask *task)
17b94a
     }
17b94a
 
17b94a
     list_add_tail(&task->all_tasks, &env->waitq);
17b94a
-    env->waitcount++;
17b94a
     task->state = SYNCTASK_WAIT;
17b94a
 }
17b94a
 
17b94a
 void
17b94a
-synctask_yield(struct synctask *task)
17b94a
+synctask_yield(struct synctask *task, struct timespec *delta)
17b94a
 {
17b94a
     xlator_t *oldTHIS = THIS;
17b94a
 
17b94a
@@ -237,6 +257,8 @@ synctask_yield(struct synctask *task)
17b94a
     task->proc->sched.uc_flags &= ~_UC_TLSBASE;
17b94a
 #endif
17b94a
 
17b94a
+    task->delta = delta;
17b94a
+
17b94a
     if (task->state != SYNCTASK_DONE) {
17b94a
         task->state = SYNCTASK_SUSPEND;
17b94a
     }
17b94a
@@ -249,6 +271,35 @@ synctask_yield(struct synctask *task)
17b94a
 }
17b94a
 
17b94a
 void
17b94a
+synctask_sleep(int32_t secs)
17b94a
+{
17b94a
+    struct timespec delta;
17b94a
+    struct synctask *task;
17b94a
+
17b94a
+    task = synctask_get();
17b94a
+
17b94a
+    if (task == NULL) {
17b94a
+        sleep(secs);
17b94a
+    } else {
17b94a
+        delta.tv_sec = secs;
17b94a
+        delta.tv_nsec = 0;
17b94a
+
17b94a
+        synctask_yield(task, &delta);
17b94a
+    }
17b94a
+}
17b94a
+
17b94a
+static void
17b94a
+__synctask_wake(struct synctask *task)
17b94a
+{
17b94a
+    task->woken = 1;
17b94a
+
17b94a
+    if (task->slept)
17b94a
+        __run(task);
17b94a
+
17b94a
+    pthread_cond_broadcast(&task->env->cond);
17b94a
+}
17b94a
+
17b94a
+void
17b94a
 synctask_wake(struct synctask *task)
17b94a
 {
17b94a
     struct syncenv *env = NULL;
17b94a
@@ -257,13 +308,18 @@ synctask_wake(struct synctask *task)
17b94a
 
17b94a
     pthread_mutex_lock(&env->mutex);
17b94a
     {
17b94a
-        task->woken = 1;
17b94a
+        if (task->timer != NULL) {
17b94a
+            if (gf_timer_call_cancel(task->xl->ctx, task->timer) != 0) {
17b94a
+                goto unlock;
17b94a
+            }
17b94a
 
17b94a
-        if (task->slept)
17b94a
-            __run(task);
17b94a
+            task->timer = NULL;
17b94a
+            task->synccond = NULL;
17b94a
+        }
17b94a
 
17b94a
-        pthread_cond_broadcast(&env->cond);
17b94a
+        __synctask_wake(task);
17b94a
     }
17b94a
+unlock:
17b94a
     pthread_mutex_unlock(&env->mutex);
17b94a
 }
17b94a
 
17b94a
@@ -282,7 +338,7 @@ synctask_wrap(void)
17b94a
 
17b94a
     task->state = SYNCTASK_DONE;
17b94a
 
17b94a
-    synctask_yield(task);
17b94a
+    synctask_yield(task, NULL);
17b94a
 }
17b94a
 
17b94a
 void
17b94a
@@ -422,11 +478,6 @@ synctask_create(struct syncenv *env, size_t stacksize, synctask_fn_t fn,
17b94a
     }
17b94a
 
17b94a
     synctask_wake(newtask);
17b94a
-    /*
17b94a
-     * Make sure someone's there to execute anything we just put on the
17b94a
-     * run queue.
17b94a
-     */
17b94a
-    syncenv_scale(env);
17b94a
 
17b94a
     return newtask;
17b94a
 err:
17b94a
@@ -520,8 +571,12 @@ syncenv_task(struct syncproc *proc)
17b94a
                 goto unlock;
17b94a
             }
17b94a
 
17b94a
+            env->procs_idle++;
17b94a
+
17b94a
             sleep_till.tv_sec = time(NULL) + SYNCPROC_IDLE_TIME;
17b94a
             ret = pthread_cond_timedwait(&env->cond, &env->mutex, &sleep_till);
17b94a
+
17b94a
+            env->procs_idle--;
17b94a
         }
17b94a
 
17b94a
         task = list_entry(env->runq.next, struct synctask, all_tasks);
17b94a
@@ -540,6 +595,34 @@ unlock:
17b94a
     return task;
17b94a
 }
17b94a
 
17b94a
+static void
17b94a
+synctask_timer(void *data)
17b94a
+{
17b94a
+    struct synctask *task = data;
17b94a
+    struct synccond *cond;
17b94a
+
17b94a
+    cond = task->synccond;
17b94a
+    if (cond != NULL) {
17b94a
+        pthread_mutex_lock(&cond->pmutex);
17b94a
+
17b94a
+        list_del_init(&task->waitq);
17b94a
+        task->synccond = NULL;
17b94a
+
17b94a
+        pthread_mutex_unlock(&cond->pmutex);
17b94a
+
17b94a
+        task->ret = -ETIMEDOUT;
17b94a
+    }
17b94a
+
17b94a
+    pthread_mutex_lock(&task->env->mutex);
17b94a
+
17b94a
+    gf_timer_call_cancel(task->xl->ctx, task->timer);
17b94a
+    task->timer = NULL;
17b94a
+
17b94a
+    __synctask_wake(task);
17b94a
+
17b94a
+    pthread_mutex_unlock(&task->env->mutex);
17b94a
+}
17b94a
+
17b94a
 void
17b94a
 synctask_switchto(struct synctask *task)
17b94a
 {
17b94a
@@ -572,7 +655,14 @@ synctask_switchto(struct synctask *task)
17b94a
         } else {
17b94a
             task->slept = 1;
17b94a
             __wait(task);
17b94a
+
17b94a
+            if (task->delta != NULL) {
17b94a
+                task->timer = gf_timer_call_after(task->xl->ctx, *task->delta,
17b94a
+                                                  synctask_timer, task);
17b94a
+            }
17b94a
         }
17b94a
+
17b94a
+        task->delta = NULL;
17b94a
     }
17b94a
     pthread_mutex_unlock(&env->mutex);
17b94a
 }
17b94a
@@ -580,65 +670,18 @@ synctask_switchto(struct synctask *task)
17b94a
 void *
17b94a
 syncenv_processor(void *thdata)
17b94a
 {
17b94a
-    struct syncenv *env = NULL;
17b94a
     struct syncproc *proc = NULL;
17b94a
     struct synctask *task = NULL;
17b94a
 
17b94a
     proc = thdata;
17b94a
-    env = proc->env;
17b94a
-
17b94a
-    for (;;) {
17b94a
-        task = syncenv_task(proc);
17b94a
-        if (!task)
17b94a
-            break;
17b94a
 
17b94a
+    while ((task = syncenv_task(proc)) != NULL) {
17b94a
         synctask_switchto(task);
17b94a
-
17b94a
-        syncenv_scale(env);
17b94a
     }
17b94a
 
17b94a
     return NULL;
17b94a
 }
17b94a
 
17b94a
-void
17b94a
-syncenv_scale(struct syncenv *env)
17b94a
-{
17b94a
-    int diff = 0;
17b94a
-    int scale = 0;
17b94a
-    int i = 0;
17b94a
-    int ret = 0;
17b94a
-
17b94a
-    pthread_mutex_lock(&env->mutex);
17b94a
-    {
17b94a
-        if (env->procs > env->runcount)
17b94a
-            goto unlock;
17b94a
-
17b94a
-        scale = env->runcount;
17b94a
-        if (scale > env->procmax)
17b94a
-            scale = env->procmax;
17b94a
-        if (scale > env->procs)
17b94a
-            diff = scale - env->procs;
17b94a
-        while (diff) {
17b94a
-            diff--;
17b94a
-            for (; (i < env->procmax); i++) {
17b94a
-                if (env->proc[i].processor == 0)
17b94a
-                    break;
17b94a
-            }
17b94a
-
17b94a
-            env->proc[i].env = env;
17b94a
-            ret = gf_thread_create(&env->proc[i].processor, NULL,
17b94a
-                                   syncenv_processor, &env->proc[i],
17b94a
-                                   "sproc%03hx", env->procs & 0x3ff);
17b94a
-            if (ret)
17b94a
-                break;
17b94a
-            env->procs++;
17b94a
-            i++;
17b94a
-        }
17b94a
-    }
17b94a
-unlock:
17b94a
-    pthread_mutex_unlock(&env->mutex);
17b94a
-}
17b94a
-
17b94a
 /* The syncenv threads are cleaned up in this routine.
17b94a
  */
17b94a
 void
17b94a
@@ -715,12 +758,13 @@ syncenv_new(size_t stacksize, int procmin, int procmax)
17b94a
         newenv->stacksize = stacksize;
17b94a
     newenv->procmin = procmin;
17b94a
     newenv->procmax = procmax;
17b94a
+    newenv->procs_idle = 0;
17b94a
 
17b94a
     for (i = 0; i < newenv->procmin; i++) {
17b94a
         newenv->proc[i].env = newenv;
17b94a
         ret = gf_thread_create(&newenv->proc[i].processor, NULL,
17b94a
                                syncenv_processor, &newenv->proc[i], "sproc%d",
17b94a
-                               newenv->procs);
17b94a
+                               i);
17b94a
         if (ret)
17b94a
             break;
17b94a
         newenv->procs++;
17b94a
@@ -810,7 +854,7 @@ __synclock_lock(struct synclock *lock)
17b94a
             task->woken = 0;
17b94a
             list_add_tail(&task->waitq, &lock->waitq);
17b94a
             pthread_mutex_unlock(&lock->guard);
17b94a
-            synctask_yield(task);
17b94a
+            synctask_yield(task, NULL);
17b94a
             /* task is removed from waitq in unlock,
17b94a
              * under lock->guard.*/
17b94a
             pthread_mutex_lock(&lock->guard);
17b94a
@@ -963,6 +1007,136 @@ synclock_unlock(synclock_t *lock)
17b94a
     return ret;
17b94a
 }
17b94a
 
17b94a
+/* Condition variables */
17b94a
+
17b94a
+int32_t
17b94a
+synccond_init(synccond_t *cond)
17b94a
+{
17b94a
+    int32_t ret;
17b94a
+
17b94a
+    INIT_LIST_HEAD(&cond->waitq);
17b94a
+
17b94a
+    ret = pthread_mutex_init(&cond->pmutex, NULL);
17b94a
+    if (ret != 0) {
17b94a
+        return -ret;
17b94a
+    }
17b94a
+
17b94a
+    ret = pthread_cond_init(&cond->pcond, NULL);
17b94a
+    if (ret != 0) {
17b94a
+        pthread_mutex_destroy(&cond->pmutex);
17b94a
+    }
17b94a
+
17b94a
+    return -ret;
17b94a
+}
17b94a
+
17b94a
+void
17b94a
+synccond_destroy(synccond_t *cond)
17b94a
+{
17b94a
+    pthread_cond_destroy(&cond->pcond);
17b94a
+    pthread_mutex_destroy(&cond->pmutex);
17b94a
+}
17b94a
+
17b94a
+int
17b94a
+synccond_timedwait(synccond_t *cond, synclock_t *lock, struct timespec *delta)
17b94a
+{
17b94a
+    struct timespec now;
17b94a
+    struct synctask *task = NULL;
17b94a
+    int ret;
17b94a
+
17b94a
+    task = synctask_get();
17b94a
+
17b94a
+    if (task == NULL) {
17b94a
+        if (delta != NULL) {
17b94a
+            timespec_now_realtime(&now;;
17b94a
+            timespec_adjust_delta(&now, *delta);
17b94a
+        }
17b94a
+
17b94a
+        pthread_mutex_lock(&cond->pmutex);
17b94a
+
17b94a
+        if (delta == NULL) {
17b94a
+            ret = -pthread_cond_wait(&cond->pcond, &cond->pmutex);
17b94a
+        } else {
17b94a
+            ret = -pthread_cond_timedwait(&cond->pcond, &cond->pmutex, &now;;
17b94a
+        }
17b94a
+    } else {
17b94a
+        pthread_mutex_lock(&cond->pmutex);
17b94a
+
17b94a
+        list_add_tail(&task->waitq, &cond->waitq);
17b94a
+        task->synccond = cond;
17b94a
+
17b94a
+        ret = synclock_unlock(lock);
17b94a
+        if (ret == 0) {
17b94a
+            pthread_mutex_unlock(&cond->pmutex);
17b94a
+
17b94a
+            synctask_yield(task, delta);
17b94a
+
17b94a
+            ret = synclock_lock(lock);
17b94a
+            if (ret == 0) {
17b94a
+                ret = task->ret;
17b94a
+            }
17b94a
+            task->ret = 0;
17b94a
+
17b94a
+            return ret;
17b94a
+        }
17b94a
+
17b94a
+        list_del_init(&task->waitq);
17b94a
+    }
17b94a
+
17b94a
+    pthread_mutex_unlock(&cond->pmutex);
17b94a
+
17b94a
+    return ret;
17b94a
+}
17b94a
+
17b94a
+int
17b94a
+synccond_wait(synccond_t *cond, synclock_t *lock)
17b94a
+{
17b94a
+    return synccond_timedwait(cond, lock, NULL);
17b94a
+}
17b94a
+
17b94a
+void
17b94a
+synccond_signal(synccond_t *cond)
17b94a
+{
17b94a
+    struct synctask *task;
17b94a
+
17b94a
+    pthread_mutex_lock(&cond->pmutex);
17b94a
+
17b94a
+    if (!list_empty(&cond->waitq)) {
17b94a
+        task = list_first_entry(&cond->waitq, struct synctask, waitq);
17b94a
+        list_del_init(&task->waitq);
17b94a
+
17b94a
+        pthread_mutex_unlock(&cond->pmutex);
17b94a
+
17b94a
+        synctask_wake(task);
17b94a
+    } else {
17b94a
+        pthread_cond_signal(&cond->pcond);
17b94a
+
17b94a
+        pthread_mutex_unlock(&cond->pmutex);
17b94a
+    }
17b94a
+}
17b94a
+
17b94a
+void
17b94a
+synccond_broadcast(synccond_t *cond)
17b94a
+{
17b94a
+    struct list_head list;
17b94a
+    struct synctask *task;
17b94a
+
17b94a
+    INIT_LIST_HEAD(&list);
17b94a
+
17b94a
+    pthread_mutex_lock(&cond->pmutex);
17b94a
+
17b94a
+    list_splice_init(&cond->waitq, &list);
17b94a
+    pthread_cond_broadcast(&cond->pcond);
17b94a
+
17b94a
+    pthread_mutex_unlock(&cond->pmutex);
17b94a
+
17b94a
+    while (!list_empty(&list)) {
17b94a
+        task = list_first_entry(&list, struct synctask, waitq);
17b94a
+        list_del_init(&task->waitq);
17b94a
+
17b94a
+        synctask_wake(task);
17b94a
+    }
17b94a
+}
17b94a
+
17b94a
 /* Barriers */
17b94a
 
17b94a
 int
17b94a
@@ -1032,7 +1206,7 @@ __syncbarrier_wait(struct syncbarrier *barrier, int waitfor)
17b94a
             /* called within a synctask */
17b94a
             list_add_tail(&task->waitq, &barrier->waitq);
17b94a
             pthread_mutex_unlock(&barrier->guard);
17b94a
-            synctask_yield(task);
17b94a
+            synctask_yield(task, NULL);
17b94a
             pthread_mutex_lock(&barrier->guard);
17b94a
         } else {
17b94a
             /* called by a non-synctask */
17b94a
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
17b94a
index c692119..957deaa 100644
17b94a
--- a/xlators/cluster/dht/src/dht-rebalance.c
17b94a
+++ b/xlators/cluster/dht/src/dht-rebalance.c
17b94a
@@ -5224,7 +5224,7 @@ gf_defrag_pause_tier(xlator_t *this, gf_defrag_info_t *defrag)
17b94a
     defrag->tier_conf.pause_timer = gf_timer_call_after(
17b94a
         this->ctx, delta, gf_defrag_pause_tier_timeout, this);
17b94a
 
17b94a
-    synctask_yield(defrag->tier_conf.pause_synctask);
17b94a
+    synctask_yield(defrag->tier_conf.pause_synctask, NULL);
17b94a
 
17b94a
     if (gf_defrag_get_pause_state(&defrag->tier_conf) == TIER_PAUSED)
17b94a
         goto out;
17b94a
diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
17b94a
index 0d29de2..6475611 100644
17b94a
--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c
17b94a
+++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
17b94a
@@ -6076,13 +6076,8 @@ glusterd_op_stage_validate(glusterd_op_t op, dict_t *dict, char **op_errstr,
17b94a
 static void
17b94a
 glusterd_wait_for_blockers(glusterd_conf_t *priv)
17b94a
 {
17b94a
-    uint64_t blockers = GF_ATOMIC_GET(priv->blockers);
17b94a
-
17b94a
-    while (blockers) {
17b94a
-        synclock_unlock(&priv->big_lock);
17b94a
-        sleep(1);
17b94a
-        blockers = GF_ATOMIC_GET(priv->blockers);
17b94a
-        synclock_lock(&priv->big_lock);
17b94a
+    while (GF_ATOMIC_GET(priv->blockers)) {
17b94a
+        synccond_wait(&priv->cond_blockers, &priv->big_lock);
17b94a
     }
17b94a
 }
17b94a
 
17b94a
diff --git a/xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c b/xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c
17b94a
index 36018a0..f55a5fd 100644
17b94a
--- a/xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c
17b94a
+++ b/xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c
17b94a
@@ -112,7 +112,7 @@ glusterd_proc_stop(glusterd_proc_t *proc, int sig, int flags)
17b94a
         goto out;
17b94a
 
17b94a
     synclock_unlock(&conf->big_lock);
17b94a
-    sleep(1);
17b94a
+    synctask_sleep(1);
17b94a
     synclock_lock(&conf->big_lock);
17b94a
     if (gf_is_service_running(proc->pidfile, &pid)) {
17b94a
         ret = kill(pid, SIGKILL);
17b94a
diff --git a/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c b/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c
17b94a
index d225854..386eed2 100644
17b94a
--- a/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c
17b94a
+++ b/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c
17b94a
@@ -1961,9 +1961,7 @@ glusterd_update_snaps_synctask(void *opaque)
17b94a
     synclock_lock(&conf->big_lock);
17b94a
 
17b94a
     while (conf->restart_bricks) {
17b94a
-        synclock_unlock(&conf->big_lock);
17b94a
-        sleep(2);
17b94a
-        synclock_lock(&conf->big_lock);
17b94a
+        synccond_wait(&conf->cond_restart_bricks, &conf->big_lock);
17b94a
     }
17b94a
     conf->restart_bricks = _gf_true;
17b94a
 
17b94a
@@ -2070,6 +2068,7 @@ out:
17b94a
     if (dict)
17b94a
         dict_unref(dict);
17b94a
     conf->restart_bricks = _gf_false;
17b94a
+    synccond_broadcast(&conf->cond_restart_bricks);
17b94a
 
17b94a
     return ret;
17b94a
 }
17b94a
diff --git a/xlators/mgmt/glusterd/src/glusterd-syncop.h b/xlators/mgmt/glusterd/src/glusterd-syncop.h
17b94a
index ce4a940..a265f21 100644
17b94a
--- a/xlators/mgmt/glusterd/src/glusterd-syncop.h
17b94a
+++ b/xlators/mgmt/glusterd/src/glusterd-syncop.h
17b94a
@@ -32,7 +32,7 @@
17b94a
         ret = gd_syncop_submit_request(rpc, req, stb, cookie, prog, procnum,   \
17b94a
                                        cbk, (xdrproc_t)xdrproc);               \
17b94a
         if (!ret)                                                              \
17b94a
-            synctask_yield(stb->task);                                         \
17b94a
+            synctask_yield(stb->task, NULL);                                   \
17b94a
         else                                                                   \
17b94a
             gf_asprintf(&stb->errstr,                                          \
17b94a
                         "%s failed. Check log file"                            \
17b94a
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
17b94a
index 812c698..ce9931c 100644
17b94a
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
17b94a
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
17b94a
@@ -5068,22 +5068,22 @@ glusterd_import_friend_volumes_synctask(void *opaque)
17b94a
      * restarted (refer glusterd_restart_bricks ())
17b94a
      */
17b94a
     while (conf->restart_bricks) {
17b94a
-        synclock_unlock(&conf->big_lock);
17b94a
-        sleep(2);
17b94a
-        synclock_lock(&conf->big_lock);
17b94a
+        synccond_wait(&conf->cond_restart_bricks, &conf->big_lock);
17b94a
     }
17b94a
     conf->restart_bricks = _gf_true;
17b94a
 
17b94a
     while (i <= count) {
17b94a
         ret = glusterd_import_friend_volume(peer_data, i);
17b94a
         if (ret) {
17b94a
-            conf->restart_bricks = _gf_false;
17b94a
-            goto out;
17b94a
+            break;
17b94a
         }
17b94a
         i++;
17b94a
     }
17b94a
-    glusterd_svcs_manager(NULL);
17b94a
+    if (i > count) {
17b94a
+        glusterd_svcs_manager(NULL);
17b94a
+    }
17b94a
     conf->restart_bricks = _gf_false;
17b94a
+    synccond_broadcast(&conf->cond_restart_bricks);
17b94a
 out:
17b94a
     if (peer_data)
17b94a
         dict_unref(peer_data);
17b94a
@@ -5769,7 +5769,9 @@ my_callback(struct rpc_req *req, struct iovec *iov, int count, void *v_frame)
17b94a
     call_frame_t *frame = v_frame;
17b94a
     glusterd_conf_t *conf = frame->this->private;
17b94a
 
17b94a
-    GF_ATOMIC_DEC(conf->blockers);
17b94a
+    if (GF_ATOMIC_DEC(conf->blockers) == 0) {
17b94a
+        synccond_broadcast(&conf->cond_blockers);
17b94a
+    }
17b94a
 
17b94a
     STACK_DESTROY(frame->root);
17b94a
     return 0;
17b94a
@@ -5865,7 +5867,9 @@ attach_brick_callback(struct rpc_req *req, struct iovec *iov, int count,
17b94a
         }
17b94a
     }
17b94a
 out:
17b94a
-    GF_ATOMIC_DEC(conf->blockers);
17b94a
+    if (GF_ATOMIC_DEC(conf->blockers) == 0) {
17b94a
+        synccond_broadcast(&conf->cond_blockers);
17b94a
+    }
17b94a
     STACK_DESTROY(frame->root);
17b94a
     return 0;
17b94a
 }
17b94a
@@ -6053,7 +6057,7 @@ attach_brick(xlator_t *this, glusterd_brickinfo_t *brickinfo,
17b94a
          * TBD: see if there's a better way
17b94a
          */
17b94a
         synclock_unlock(&conf->big_lock);
17b94a
-        sleep(1);
17b94a
+        synctask_sleep(1);
17b94a
         synclock_lock(&conf->big_lock);
17b94a
     }
17b94a
 
17b94a
@@ -6193,7 +6197,7 @@ find_compat_brick_in_vol(glusterd_conf_t *conf,
17b94a
                          "brick %s is still"
17b94a
                          " starting, waiting for 2 seconds ",
17b94a
                          other_brick->path);
17b94a
-            sleep(2);
17b94a
+            synctask_sleep(2);
17b94a
             synclock_lock(&conf->big_lock);
17b94a
             retries--;
17b94a
         }
17b94a
@@ -6680,9 +6684,7 @@ glusterd_restart_bricks(void *opaque)
17b94a
      * glusterd_compare_friend_data ())
17b94a
      */
17b94a
     while (conf->restart_bricks) {
17b94a
-        synclock_unlock(&conf->big_lock);
17b94a
-        sleep(2);
17b94a
-        synclock_lock(&conf->big_lock);
17b94a
+        synccond_wait(&conf->cond_restart_bricks, &conf->big_lock);
17b94a
     }
17b94a
     conf->restart_bricks = _gf_true;
17b94a
 
17b94a
@@ -6798,6 +6800,7 @@ out:
17b94a
     GF_ATOMIC_DEC(conf->blockers);
17b94a
     conf->restart_done = _gf_true;
17b94a
     conf->restart_bricks = _gf_false;
17b94a
+    synccond_broadcast(&conf->cond_restart_bricks);
17b94a
 
17b94a
 return_block:
17b94a
     return ret;
17b94a
diff --git a/xlators/mgmt/glusterd/src/glusterd.c b/xlators/mgmt/glusterd/src/glusterd.c
17b94a
index d360312..a01034a 100644
17b94a
--- a/xlators/mgmt/glusterd/src/glusterd.c
17b94a
+++ b/xlators/mgmt/glusterd/src/glusterd.c
17b94a
@@ -1845,6 +1845,8 @@ init(xlator_t *this)
17b94a
     (void)strncpy(conf->rundir, rundir, sizeof(conf->rundir));
17b94a
 
17b94a
     synclock_init(&conf->big_lock, SYNC_LOCK_RECURSIVE);
17b94a
+    synccond_init(&conf->cond_restart_bricks);
17b94a
+    synccond_init(&conf->cond_blockers);
17b94a
     pthread_mutex_init(&conf->xprt_lock, NULL);
17b94a
     INIT_LIST_HEAD(&conf->xprt_list);
17b94a
     pthread_mutex_init(&conf->import_volumes, NULL);
17b94a
diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h
17b94a
index 2be005c..1c6c3b1 100644
17b94a
--- a/xlators/mgmt/glusterd/src/glusterd.h
17b94a
+++ b/xlators/mgmt/glusterd/src/glusterd.h
17b94a
@@ -209,6 +209,8 @@ typedef struct {
17b94a
     dict_t *opts;
17b94a
     synclock_t big_lock;
17b94a
     gf_boolean_t restart_done;
17b94a
+    synccond_t cond_restart_bricks;
17b94a
+    synccond_t cond_blockers;
17b94a
     rpcsvc_t *uds_rpc; /* RPCSVC for the unix domain socket */
17b94a
     uint32_t base_port;
17b94a
     uint32_t max_port;
17b94a
-- 
17b94a
1.8.3.1
17b94a