cead9d
From 66600fb55522d405a68d7340a5680a2633c4237e Mon Sep 17 00:00:00 2001
cead9d
From: Xavi Hernandez <xhernandez@redhat.com>
cead9d
Date: Thu, 30 Apr 2020 11:19:01 +0200
cead9d
Subject: [PATCH 377/379] syncop: improve scaling and implement more tools
cead9d
cead9d
The current scaling of the syncop thread pool is not working properly
cead9d
and can leave some tasks in the run queue more time than necessary
cead9d
when the maximum number of threads is not reached.
cead9d
cead9d
This patch provides a better scaling condition to react faster to
cead9d
pending work.
cead9d
cead9d
Condition variables and sleep in the context of a synctask have also
cead9d
been implemented. Their purpose is to replace regular condition
cead9d
variables and sleeps that block synctask threads and prevent other
cead9d
tasks to be executed.
cead9d
cead9d
The new features have been applied to several places in glusterd.
cead9d
cead9d
upstream patch: https://review.gluster.org/#/c/glusterfs/+/24396/
cead9d
cead9d
> Change-Id: Ic50b7c73c104f9e41f08101a357d30b95efccfbf
cead9d
> Fixes: #1116
cead9d
> Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
cead9d
cead9d
Change-Id: Ic50b7c73c104f9e41f08101a357d30b95efccfbf
cead9d
BUG: 1810516
cead9d
Signed-off-by: Sanju Rakonde <srakonde@redhta.com>
cead9d
Reviewed-on: https://code.engineering.redhat.com/gerrit/200409
cead9d
Tested-by: Sanju Rakonde <srakonde@redhat.com>
cead9d
Tested-by: RHGS Build Bot <nigelb@redhat.com>
cead9d
Reviewed-by: Xavi Hernandez Juan <xhernandez@redhat.com>
cead9d
---
cead9d
 libglusterfs/src/glusterfs/syncop.h                |  52 +++-
cead9d
 libglusterfs/src/libglusterfs.sym                  |   7 +
cead9d
 libglusterfs/src/syncop.c                          | 306 ++++++++++++++++-----
cead9d
 xlators/cluster/dht/src/dht-rebalance.c            |   2 +-
cead9d
 xlators/mgmt/glusterd/src/glusterd-op-sm.c         |   9 +-
cead9d
 xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c     |   2 +-
cead9d
 .../mgmt/glusterd/src/glusterd-snapshot-utils.c    |   5 +-
cead9d
 xlators/mgmt/glusterd/src/glusterd-syncop.h        |   2 +-
cead9d
 xlators/mgmt/glusterd/src/glusterd-utils.c         |  29 +-
cead9d
 xlators/mgmt/glusterd/src/glusterd.c               |   2 +
cead9d
 xlators/mgmt/glusterd/src/glusterd.h               |   2 +
cead9d
 11 files changed, 317 insertions(+), 101 deletions(-)
cead9d
cead9d
diff --git a/libglusterfs/src/glusterfs/syncop.h b/libglusterfs/src/glusterfs/syncop.h
cead9d
index e0f1017..3011b4c 100644
cead9d
--- a/libglusterfs/src/glusterfs/syncop.h
cead9d
+++ b/libglusterfs/src/glusterfs/syncop.h
cead9d
@@ -15,6 +15,7 @@
cead9d
 #include <sys/time.h>
cead9d
 #include <pthread.h>
cead9d
 #include <ucontext.h>
cead9d
+#include "glusterfs/timer.h"
cead9d
 
cead9d
 #define SYNCENV_PROC_MAX 16
cead9d
 #define SYNCENV_PROC_MIN 2
cead9d
@@ -32,6 +33,7 @@
cead9d
 struct synctask;
cead9d
 struct syncproc;
cead9d
 struct syncenv;
cead9d
+struct synccond;
cead9d
 
cead9d
 typedef int (*synctask_cbk_t)(int ret, call_frame_t *frame, void *opaque);
cead9d
 
cead9d
@@ -55,9 +57,12 @@ struct synctask {
cead9d
     call_frame_t *opframe;
cead9d
     synctask_cbk_t synccbk;
cead9d
     synctask_fn_t syncfn;
cead9d
-    synctask_state_t state;
cead9d
+    struct timespec *delta;
cead9d
+    gf_timer_t *timer;
cead9d
+    struct synccond *synccond;
cead9d
     void *opaque;
cead9d
     void *stack;
cead9d
+    synctask_state_t state;
cead9d
     int woken;
cead9d
     int slept;
cead9d
     int ret;
cead9d
@@ -85,19 +90,21 @@ struct syncproc {
cead9d
 /* hosts the scheduler thread and framework for executing synctasks */
cead9d
 struct syncenv {
cead9d
     struct syncproc proc[SYNCENV_PROC_MAX];
cead9d
-    int procs;
cead9d
+
cead9d
+    pthread_mutex_t mutex;
cead9d
+    pthread_cond_t cond;
cead9d
 
cead9d
     struct list_head runq;
cead9d
-    int runcount;
cead9d
     struct list_head waitq;
cead9d
-    int waitcount;
cead9d
+
cead9d
+    int procs;
cead9d
+    int procs_idle;
cead9d
+
cead9d
+    int runcount;
cead9d
 
cead9d
     int procmin;
cead9d
     int procmax;
cead9d
 
cead9d
-    pthread_mutex_t mutex;
cead9d
-    pthread_cond_t cond;
cead9d
-
cead9d
     size_t stacksize;
cead9d
 
cead9d
     int destroy; /* FLAG to mark syncenv is in destroy mode
cead9d
@@ -123,6 +130,13 @@ struct synclock {
cead9d
 };
cead9d
 typedef struct synclock synclock_t;
cead9d
 
cead9d
+struct synccond {
cead9d
+    pthread_mutex_t pmutex;
cead9d
+    pthread_cond_t pcond;
cead9d
+    struct list_head waitq;
cead9d
+};
cead9d
+typedef struct synccond synccond_t;
cead9d
+
cead9d
 struct syncbarrier {
cead9d
     gf_boolean_t initialized; /*Set on successful initialization*/
cead9d
     pthread_mutex_t guard;    /* guard the remaining members, pair @cond */
cead9d
@@ -219,7 +233,7 @@ struct syncopctx {
cead9d
 #define __yield(args)                                                          \
cead9d
     do {                                                                       \
cead9d
         if (args->task) {                                                      \
cead9d
-            synctask_yield(args->task);                                        \
cead9d
+            synctask_yield(args->task, NULL);                                  \
cead9d
         } else {                                                               \
cead9d
             pthread_mutex_lock(&args->mutex);                                  \
cead9d
             {                                                                  \
cead9d
@@ -307,7 +321,9 @@ synctask_join(struct synctask *task);
cead9d
 void
cead9d
 synctask_wake(struct synctask *task);
cead9d
 void
cead9d
-synctask_yield(struct synctask *task);
cead9d
+synctask_yield(struct synctask *task, struct timespec *delta);
cead9d
+void
cead9d
+synctask_sleep(int32_t secs);
cead9d
 void
cead9d
 synctask_waitfor(struct synctask *task, int count);
cead9d
 
cead9d
@@ -405,6 +421,24 @@ synclock_trylock(synclock_t *lock);
cead9d
 int
cead9d
 synclock_unlock(synclock_t *lock);
cead9d
 
cead9d
+int32_t
cead9d
+synccond_init(synccond_t *cond);
cead9d
+
cead9d
+void
cead9d
+synccond_destroy(synccond_t *cond);
cead9d
+
cead9d
+int
cead9d
+synccond_wait(synccond_t *cond, synclock_t *lock);
cead9d
+
cead9d
+int
cead9d
+synccond_timedwait(synccond_t *cond, synclock_t *lock, struct timespec *delta);
cead9d
+
cead9d
+void
cead9d
+synccond_signal(synccond_t *cond);
cead9d
+
cead9d
+void
cead9d
+synccond_broadcast(synccond_t *cond);
cead9d
+
cead9d
 int
cead9d
 syncbarrier_init(syncbarrier_t *barrier);
cead9d
 int
cead9d
diff --git a/libglusterfs/src/libglusterfs.sym b/libglusterfs/src/libglusterfs.sym
cead9d
index 467a1b7..5a721e0 100644
cead9d
--- a/libglusterfs/src/libglusterfs.sym
cead9d
+++ b/libglusterfs/src/libglusterfs.sym
cead9d
@@ -938,6 +938,12 @@ syncbarrier_destroy
cead9d
 syncbarrier_init
cead9d
 syncbarrier_wait
cead9d
 syncbarrier_wake
cead9d
+synccond_init
cead9d
+synccond_destroy
cead9d
+synccond_wait
cead9d
+synccond_timedwait
cead9d
+synccond_signal
cead9d
+synccond_broadcast
cead9d
 syncenv_destroy
cead9d
 syncenv_new
cead9d
 synclock_destroy
cead9d
@@ -1015,6 +1021,7 @@ synctask_new
cead9d
 synctask_new1
cead9d
 synctask_set
cead9d
 synctask_setid
cead9d
+synctask_sleep
cead9d
 synctask_wake
cead9d
 synctask_yield
cead9d
 sys_access
cead9d
diff --git a/libglusterfs/src/syncop.c b/libglusterfs/src/syncop.c
cead9d
index 693970f..71d37b7 100644
cead9d
--- a/libglusterfs/src/syncop.c
cead9d
+++ b/libglusterfs/src/syncop.c
cead9d
@@ -154,10 +154,14 @@ out:
cead9d
     return ret;
cead9d
 }
cead9d
 
cead9d
+void *
cead9d
+syncenv_processor(void *thdata);
cead9d
+
cead9d
 static void
cead9d
 __run(struct synctask *task)
cead9d
 {
cead9d
     struct syncenv *env = NULL;
cead9d
+    int32_t total, ret, i;
cead9d
 
cead9d
     env = task->env;
cead9d
 
cead9d
@@ -173,7 +177,6 @@ __run(struct synctask *task)
cead9d
             env->runcount--;
cead9d
             break;
cead9d
         case SYNCTASK_WAIT:
cead9d
-            env->waitcount--;
cead9d
             break;
cead9d
         case SYNCTASK_DONE:
cead9d
             gf_msg(task->xl->name, GF_LOG_WARNING, 0, LG_MSG_COMPLETED_TASK,
cead9d
@@ -187,8 +190,27 @@ __run(struct synctask *task)
cead9d
     }
cead9d
 
cead9d
     list_add_tail(&task->all_tasks, &env->runq);
cead9d
-    env->runcount++;
cead9d
     task->state = SYNCTASK_RUN;
cead9d
+
cead9d
+    env->runcount++;
cead9d
+
cead9d
+    total = env->procs + env->runcount - env->procs_idle;
cead9d
+    if (total > env->procmax) {
cead9d
+        total = env->procmax;
cead9d
+    }
cead9d
+    if (total > env->procs) {
cead9d
+        for (i = 0; i < env->procmax; i++) {
cead9d
+            if (env->proc[i].env == NULL) {
cead9d
+                env->proc[i].env = env;
cead9d
+                ret = gf_thread_create(&env->proc[i].processor, NULL,
cead9d
+                                       syncenv_processor, &env->proc[i],
cead9d
+                                       "sproc%d", i);
cead9d
+                if ((ret < 0) || (++env->procs >= total)) {
cead9d
+                    break;
cead9d
+                }
cead9d
+            }
cead9d
+        }
cead9d
+    }
cead9d
 }
cead9d
 
cead9d
 static void
cead9d
@@ -210,7 +232,6 @@ __wait(struct synctask *task)
cead9d
             gf_msg(task->xl->name, GF_LOG_WARNING, 0, LG_MSG_REWAITING_TASK,
cead9d
                    "re-waiting already waiting "
cead9d
                    "task");
cead9d
-            env->waitcount--;
cead9d
             break;
cead9d
         case SYNCTASK_DONE:
cead9d
             gf_msg(task->xl->name, GF_LOG_WARNING, 0, LG_MSG_COMPLETED_TASK,
cead9d
@@ -223,12 +244,11 @@ __wait(struct synctask *task)
cead9d
     }
cead9d
 
cead9d
     list_add_tail(&task->all_tasks, &env->waitq);
cead9d
-    env->waitcount++;
cead9d
     task->state = SYNCTASK_WAIT;
cead9d
 }
cead9d
 
cead9d
 void
cead9d
-synctask_yield(struct synctask *task)
cead9d
+synctask_yield(struct synctask *task, struct timespec *delta)
cead9d
 {
cead9d
     xlator_t *oldTHIS = THIS;
cead9d
 
cead9d
@@ -237,6 +257,8 @@ synctask_yield(struct synctask *task)
cead9d
     task->proc->sched.uc_flags &= ~_UC_TLSBASE;
cead9d
 #endif
cead9d
 
cead9d
+    task->delta = delta;
cead9d
+
cead9d
     if (task->state != SYNCTASK_DONE) {
cead9d
         task->state = SYNCTASK_SUSPEND;
cead9d
     }
cead9d
@@ -249,6 +271,35 @@ synctask_yield(struct synctask *task)
cead9d
 }
cead9d
 
cead9d
 void
cead9d
+synctask_sleep(int32_t secs)
cead9d
+{
cead9d
+    struct timespec delta;
cead9d
+    struct synctask *task;
cead9d
+
cead9d
+    task = synctask_get();
cead9d
+
cead9d
+    if (task == NULL) {
cead9d
+        sleep(secs);
cead9d
+    } else {
cead9d
+        delta.tv_sec = secs;
cead9d
+        delta.tv_nsec = 0;
cead9d
+
cead9d
+        synctask_yield(task, &delta);
cead9d
+    }
cead9d
+}
cead9d
+
cead9d
+static void
cead9d
+__synctask_wake(struct synctask *task)
cead9d
+{
cead9d
+    task->woken = 1;
cead9d
+
cead9d
+    if (task->slept)
cead9d
+        __run(task);
cead9d
+
cead9d
+    pthread_cond_broadcast(&task->env->cond);
cead9d
+}
cead9d
+
cead9d
+void
cead9d
 synctask_wake(struct synctask *task)
cead9d
 {
cead9d
     struct syncenv *env = NULL;
cead9d
@@ -257,13 +308,18 @@ synctask_wake(struct synctask *task)
cead9d
 
cead9d
     pthread_mutex_lock(&env->mutex);
cead9d
     {
cead9d
-        task->woken = 1;
cead9d
+        if (task->timer != NULL) {
cead9d
+            if (gf_timer_call_cancel(task->xl->ctx, task->timer) != 0) {
cead9d
+                goto unlock;
cead9d
+            }
cead9d
 
cead9d
-        if (task->slept)
cead9d
-            __run(task);
cead9d
+            task->timer = NULL;
cead9d
+            task->synccond = NULL;
cead9d
+        }
cead9d
 
cead9d
-        pthread_cond_broadcast(&env->cond);
cead9d
+        __synctask_wake(task);
cead9d
     }
cead9d
+unlock:
cead9d
     pthread_mutex_unlock(&env->mutex);
cead9d
 }
cead9d
 
cead9d
@@ -282,7 +338,7 @@ synctask_wrap(void)
cead9d
 
cead9d
     task->state = SYNCTASK_DONE;
cead9d
 
cead9d
-    synctask_yield(task);
cead9d
+    synctask_yield(task, NULL);
cead9d
 }
cead9d
 
cead9d
 void
cead9d
@@ -422,11 +478,6 @@ synctask_create(struct syncenv *env, size_t stacksize, synctask_fn_t fn,
cead9d
     }
cead9d
 
cead9d
     synctask_wake(newtask);
cead9d
-    /*
cead9d
-     * Make sure someone's there to execute anything we just put on the
cead9d
-     * run queue.
cead9d
-     */
cead9d
-    syncenv_scale(env);
cead9d
 
cead9d
     return newtask;
cead9d
 err:
cead9d
@@ -520,8 +571,12 @@ syncenv_task(struct syncproc *proc)
cead9d
                 goto unlock;
cead9d
             }
cead9d
 
cead9d
+            env->procs_idle++;
cead9d
+
cead9d
             sleep_till.tv_sec = time(NULL) + SYNCPROC_IDLE_TIME;
cead9d
             ret = pthread_cond_timedwait(&env->cond, &env->mutex, &sleep_till);
cead9d
+
cead9d
+            env->procs_idle--;
cead9d
         }
cead9d
 
cead9d
         task = list_entry(env->runq.next, struct synctask, all_tasks);
cead9d
@@ -540,6 +595,34 @@ unlock:
cead9d
     return task;
cead9d
 }
cead9d
 
cead9d
+static void
cead9d
+synctask_timer(void *data)
cead9d
+{
cead9d
+    struct synctask *task = data;
cead9d
+    struct synccond *cond;
cead9d
+
cead9d
+    cond = task->synccond;
cead9d
+    if (cond != NULL) {
cead9d
+        pthread_mutex_lock(&cond->pmutex);
cead9d
+
cead9d
+        list_del_init(&task->waitq);
cead9d
+        task->synccond = NULL;
cead9d
+
cead9d
+        pthread_mutex_unlock(&cond->pmutex);
cead9d
+
cead9d
+        task->ret = -ETIMEDOUT;
cead9d
+    }
cead9d
+
cead9d
+    pthread_mutex_lock(&task->env->mutex);
cead9d
+
cead9d
+    gf_timer_call_cancel(task->xl->ctx, task->timer);
cead9d
+    task->timer = NULL;
cead9d
+
cead9d
+    __synctask_wake(task);
cead9d
+
cead9d
+    pthread_mutex_unlock(&task->env->mutex);
cead9d
+}
cead9d
+
cead9d
 void
cead9d
 synctask_switchto(struct synctask *task)
cead9d
 {
cead9d
@@ -572,7 +655,14 @@ synctask_switchto(struct synctask *task)
cead9d
         } else {
cead9d
             task->slept = 1;
cead9d
             __wait(task);
cead9d
+
cead9d
+            if (task->delta != NULL) {
cead9d
+                task->timer = gf_timer_call_after(task->xl->ctx, *task->delta,
cead9d
+                                                  synctask_timer, task);
cead9d
+            }
cead9d
         }
cead9d
+
cead9d
+        task->delta = NULL;
cead9d
     }
cead9d
     pthread_mutex_unlock(&env->mutex);
cead9d
 }
cead9d
@@ -580,65 +670,18 @@ synctask_switchto(struct synctask *task)
cead9d
 void *
cead9d
 syncenv_processor(void *thdata)
cead9d
 {
cead9d
-    struct syncenv *env = NULL;
cead9d
     struct syncproc *proc = NULL;
cead9d
     struct synctask *task = NULL;
cead9d
 
cead9d
     proc = thdata;
cead9d
-    env = proc->env;
cead9d
-
cead9d
-    for (;;) {
cead9d
-        task = syncenv_task(proc);
cead9d
-        if (!task)
cead9d
-            break;
cead9d
 
cead9d
+    while ((task = syncenv_task(proc)) != NULL) {
cead9d
         synctask_switchto(task);
cead9d
-
cead9d
-        syncenv_scale(env);
cead9d
     }
cead9d
 
cead9d
     return NULL;
cead9d
 }
cead9d
 
cead9d
-void
cead9d
-syncenv_scale(struct syncenv *env)
cead9d
-{
cead9d
-    int diff = 0;
cead9d
-    int scale = 0;
cead9d
-    int i = 0;
cead9d
-    int ret = 0;
cead9d
-
cead9d
-    pthread_mutex_lock(&env->mutex);
cead9d
-    {
cead9d
-        if (env->procs > env->runcount)
cead9d
-            goto unlock;
cead9d
-
cead9d
-        scale = env->runcount;
cead9d
-        if (scale > env->procmax)
cead9d
-            scale = env->procmax;
cead9d
-        if (scale > env->procs)
cead9d
-            diff = scale - env->procs;
cead9d
-        while (diff) {
cead9d
-            diff--;
cead9d
-            for (; (i < env->procmax); i++) {
cead9d
-                if (env->proc[i].processor == 0)
cead9d
-                    break;
cead9d
-            }
cead9d
-
cead9d
-            env->proc[i].env = env;
cead9d
-            ret = gf_thread_create(&env->proc[i].processor, NULL,
cead9d
-                                   syncenv_processor, &env->proc[i],
cead9d
-                                   "sproc%03hx", env->procs & 0x3ff);
cead9d
-            if (ret)
cead9d
-                break;
cead9d
-            env->procs++;
cead9d
-            i++;
cead9d
-        }
cead9d
-    }
cead9d
-unlock:
cead9d
-    pthread_mutex_unlock(&env->mutex);
cead9d
-}
cead9d
-
cead9d
 /* The syncenv threads are cleaned up in this routine.
cead9d
  */
cead9d
 void
cead9d
@@ -715,12 +758,13 @@ syncenv_new(size_t stacksize, int procmin, int procmax)
cead9d
         newenv->stacksize = stacksize;
cead9d
     newenv->procmin = procmin;
cead9d
     newenv->procmax = procmax;
cead9d
+    newenv->procs_idle = 0;
cead9d
 
cead9d
     for (i = 0; i < newenv->procmin; i++) {
cead9d
         newenv->proc[i].env = newenv;
cead9d
         ret = gf_thread_create(&newenv->proc[i].processor, NULL,
cead9d
                                syncenv_processor, &newenv->proc[i], "sproc%d",
cead9d
-                               newenv->procs);
cead9d
+                               i);
cead9d
         if (ret)
cead9d
             break;
cead9d
         newenv->procs++;
cead9d
@@ -810,7 +854,7 @@ __synclock_lock(struct synclock *lock)
cead9d
             task->woken = 0;
cead9d
             list_add_tail(&task->waitq, &lock->waitq);
cead9d
             pthread_mutex_unlock(&lock->guard);
cead9d
-            synctask_yield(task);
cead9d
+            synctask_yield(task, NULL);
cead9d
             /* task is removed from waitq in unlock,
cead9d
              * under lock->guard.*/
cead9d
             pthread_mutex_lock(&lock->guard);
cead9d
@@ -963,6 +1007,136 @@ synclock_unlock(synclock_t *lock)
cead9d
     return ret;
cead9d
 }
cead9d
 
cead9d
+/* Condition variables */
cead9d
+
cead9d
+int32_t
cead9d
+synccond_init(synccond_t *cond)
cead9d
+{
cead9d
+    int32_t ret;
cead9d
+
cead9d
+    INIT_LIST_HEAD(&cond->waitq);
cead9d
+
cead9d
+    ret = pthread_mutex_init(&cond->pmutex, NULL);
cead9d
+    if (ret != 0) {
cead9d
+        return -ret;
cead9d
+    }
cead9d
+
cead9d
+    ret = pthread_cond_init(&cond->pcond, NULL);
cead9d
+    if (ret != 0) {
cead9d
+        pthread_mutex_destroy(&cond->pmutex);
cead9d
+    }
cead9d
+
cead9d
+    return -ret;
cead9d
+}
cead9d
+
cead9d
+void
cead9d
+synccond_destroy(synccond_t *cond)
cead9d
+{
cead9d
+    pthread_cond_destroy(&cond->pcond);
cead9d
+    pthread_mutex_destroy(&cond->pmutex);
cead9d
+}
cead9d
+
cead9d
+int
cead9d
+synccond_timedwait(synccond_t *cond, synclock_t *lock, struct timespec *delta)
cead9d
+{
cead9d
+    struct timespec now;
cead9d
+    struct synctask *task = NULL;
cead9d
+    int ret;
cead9d
+
cead9d
+    task = synctask_get();
cead9d
+
cead9d
+    if (task == NULL) {
cead9d
+        if (delta != NULL) {
cead9d
+            timespec_now_realtime(&now;;
cead9d
+            timespec_adjust_delta(&now, *delta);
cead9d
+        }
cead9d
+
cead9d
+        pthread_mutex_lock(&cond->pmutex);
cead9d
+
cead9d
+        if (delta == NULL) {
cead9d
+            ret = -pthread_cond_wait(&cond->pcond, &cond->pmutex);
cead9d
+        } else {
cead9d
+            ret = -pthread_cond_timedwait(&cond->pcond, &cond->pmutex, &now;;
cead9d
+        }
cead9d
+    } else {
cead9d
+        pthread_mutex_lock(&cond->pmutex);
cead9d
+
cead9d
+        list_add_tail(&task->waitq, &cond->waitq);
cead9d
+        task->synccond = cond;
cead9d
+
cead9d
+        ret = synclock_unlock(lock);
cead9d
+        if (ret == 0) {
cead9d
+            pthread_mutex_unlock(&cond->pmutex);
cead9d
+
cead9d
+            synctask_yield(task, delta);
cead9d
+
cead9d
+            ret = synclock_lock(lock);
cead9d
+            if (ret == 0) {
cead9d
+                ret = task->ret;
cead9d
+            }
cead9d
+            task->ret = 0;
cead9d
+
cead9d
+            return ret;
cead9d
+        }
cead9d
+
cead9d
+        list_del_init(&task->waitq);
cead9d
+    }
cead9d
+
cead9d
+    pthread_mutex_unlock(&cond->pmutex);
cead9d
+
cead9d
+    return ret;
cead9d
+}
cead9d
+
cead9d
+int
cead9d
+synccond_wait(synccond_t *cond, synclock_t *lock)
cead9d
+{
cead9d
+    return synccond_timedwait(cond, lock, NULL);
cead9d
+}
cead9d
+
cead9d
+void
cead9d
+synccond_signal(synccond_t *cond)
cead9d
+{
cead9d
+    struct synctask *task;
cead9d
+
cead9d
+    pthread_mutex_lock(&cond->pmutex);
cead9d
+
cead9d
+    if (!list_empty(&cond->waitq)) {
cead9d
+        task = list_first_entry(&cond->waitq, struct synctask, waitq);
cead9d
+        list_del_init(&task->waitq);
cead9d
+
cead9d
+        pthread_mutex_unlock(&cond->pmutex);
cead9d
+
cead9d
+        synctask_wake(task);
cead9d
+    } else {
cead9d
+        pthread_cond_signal(&cond->pcond);
cead9d
+
cead9d
+        pthread_mutex_unlock(&cond->pmutex);
cead9d
+    }
cead9d
+}
cead9d
+
cead9d
+void
cead9d
+synccond_broadcast(synccond_t *cond)
cead9d
+{
cead9d
+    struct list_head list;
cead9d
+    struct synctask *task;
cead9d
+
cead9d
+    INIT_LIST_HEAD(&list);
cead9d
+
cead9d
+    pthread_mutex_lock(&cond->pmutex);
cead9d
+
cead9d
+    list_splice_init(&cond->waitq, &list);
cead9d
+    pthread_cond_broadcast(&cond->pcond);
cead9d
+
cead9d
+    pthread_mutex_unlock(&cond->pmutex);
cead9d
+
cead9d
+    while (!list_empty(&list)) {
cead9d
+        task = list_first_entry(&list, struct synctask, waitq);
cead9d
+        list_del_init(&task->waitq);
cead9d
+
cead9d
+        synctask_wake(task);
cead9d
+    }
cead9d
+}
cead9d
+
cead9d
 /* Barriers */
cead9d
 
cead9d
 int
cead9d
@@ -1032,7 +1206,7 @@ __syncbarrier_wait(struct syncbarrier *barrier, int waitfor)
cead9d
             /* called within a synctask */
cead9d
             list_add_tail(&task->waitq, &barrier->waitq);
cead9d
             pthread_mutex_unlock(&barrier->guard);
cead9d
-            synctask_yield(task);
cead9d
+            synctask_yield(task, NULL);
cead9d
             pthread_mutex_lock(&barrier->guard);
cead9d
         } else {
cead9d
             /* called by a non-synctask */
cead9d
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
cead9d
index c692119..957deaa 100644
cead9d
--- a/xlators/cluster/dht/src/dht-rebalance.c
cead9d
+++ b/xlators/cluster/dht/src/dht-rebalance.c
cead9d
@@ -5224,7 +5224,7 @@ gf_defrag_pause_tier(xlator_t *this, gf_defrag_info_t *defrag)
cead9d
     defrag->tier_conf.pause_timer = gf_timer_call_after(
cead9d
         this->ctx, delta, gf_defrag_pause_tier_timeout, this);
cead9d
 
cead9d
-    synctask_yield(defrag->tier_conf.pause_synctask);
cead9d
+    synctask_yield(defrag->tier_conf.pause_synctask, NULL);
cead9d
 
cead9d
     if (gf_defrag_get_pause_state(&defrag->tier_conf) == TIER_PAUSED)
cead9d
         goto out;
cead9d
diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
cead9d
index 0d29de2..6475611 100644
cead9d
--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c
cead9d
+++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
cead9d
@@ -6076,13 +6076,8 @@ glusterd_op_stage_validate(glusterd_op_t op, dict_t *dict, char **op_errstr,
cead9d
 static void
cead9d
 glusterd_wait_for_blockers(glusterd_conf_t *priv)
cead9d
 {
cead9d
-    uint64_t blockers = GF_ATOMIC_GET(priv->blockers);
cead9d
-
cead9d
-    while (blockers) {
cead9d
-        synclock_unlock(&priv->big_lock);
cead9d
-        sleep(1);
cead9d
-        blockers = GF_ATOMIC_GET(priv->blockers);
cead9d
-        synclock_lock(&priv->big_lock);
cead9d
+    while (GF_ATOMIC_GET(priv->blockers)) {
cead9d
+        synccond_wait(&priv->cond_blockers, &priv->big_lock);
cead9d
     }
cead9d
 }
cead9d
 
cead9d
diff --git a/xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c b/xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c
cead9d
index 36018a0..f55a5fd 100644
cead9d
--- a/xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c
cead9d
+++ b/xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c
cead9d
@@ -112,7 +112,7 @@ glusterd_proc_stop(glusterd_proc_t *proc, int sig, int flags)
cead9d
         goto out;
cead9d
 
cead9d
     synclock_unlock(&conf->big_lock);
cead9d
-    sleep(1);
cead9d
+    synctask_sleep(1);
cead9d
     synclock_lock(&conf->big_lock);
cead9d
     if (gf_is_service_running(proc->pidfile, &pid)) {
cead9d
         ret = kill(pid, SIGKILL);
cead9d
diff --git a/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c b/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c
cead9d
index d225854..386eed2 100644
cead9d
--- a/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c
cead9d
+++ b/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c
cead9d
@@ -1961,9 +1961,7 @@ glusterd_update_snaps_synctask(void *opaque)
cead9d
     synclock_lock(&conf->big_lock);
cead9d
 
cead9d
     while (conf->restart_bricks) {
cead9d
-        synclock_unlock(&conf->big_lock);
cead9d
-        sleep(2);
cead9d
-        synclock_lock(&conf->big_lock);
cead9d
+        synccond_wait(&conf->cond_restart_bricks, &conf->big_lock);
cead9d
     }
cead9d
     conf->restart_bricks = _gf_true;
cead9d
 
cead9d
@@ -2070,6 +2068,7 @@ out:
cead9d
     if (dict)
cead9d
         dict_unref(dict);
cead9d
     conf->restart_bricks = _gf_false;
cead9d
+    synccond_broadcast(&conf->cond_restart_bricks);
cead9d
 
cead9d
     return ret;
cead9d
 }
cead9d
diff --git a/xlators/mgmt/glusterd/src/glusterd-syncop.h b/xlators/mgmt/glusterd/src/glusterd-syncop.h
cead9d
index ce4a940..a265f21 100644
cead9d
--- a/xlators/mgmt/glusterd/src/glusterd-syncop.h
cead9d
+++ b/xlators/mgmt/glusterd/src/glusterd-syncop.h
cead9d
@@ -32,7 +32,7 @@
cead9d
         ret = gd_syncop_submit_request(rpc, req, stb, cookie, prog, procnum,   \
cead9d
                                        cbk, (xdrproc_t)xdrproc);               \
cead9d
         if (!ret)                                                              \
cead9d
-            synctask_yield(stb->task);                                         \
cead9d
+            synctask_yield(stb->task, NULL);                                   \
cead9d
         else                                                                   \
cead9d
             gf_asprintf(&stb->errstr,                                          \
cead9d
                         "%s failed. Check log file"                            \
cead9d
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
cead9d
index 812c698..ce9931c 100644
cead9d
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
cead9d
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
cead9d
@@ -5068,22 +5068,22 @@ glusterd_import_friend_volumes_synctask(void *opaque)
cead9d
      * restarted (refer glusterd_restart_bricks ())
cead9d
      */
cead9d
     while (conf->restart_bricks) {
cead9d
-        synclock_unlock(&conf->big_lock);
cead9d
-        sleep(2);
cead9d
-        synclock_lock(&conf->big_lock);
cead9d
+        synccond_wait(&conf->cond_restart_bricks, &conf->big_lock);
cead9d
     }
cead9d
     conf->restart_bricks = _gf_true;
cead9d
 
cead9d
     while (i <= count) {
cead9d
         ret = glusterd_import_friend_volume(peer_data, i);
cead9d
         if (ret) {
cead9d
-            conf->restart_bricks = _gf_false;
cead9d
-            goto out;
cead9d
+            break;
cead9d
         }
cead9d
         i++;
cead9d
     }
cead9d
-    glusterd_svcs_manager(NULL);
cead9d
+    if (i > count) {
cead9d
+        glusterd_svcs_manager(NULL);
cead9d
+    }
cead9d
     conf->restart_bricks = _gf_false;
cead9d
+    synccond_broadcast(&conf->cond_restart_bricks);
cead9d
 out:
cead9d
     if (peer_data)
cead9d
         dict_unref(peer_data);
cead9d
@@ -5769,7 +5769,9 @@ my_callback(struct rpc_req *req, struct iovec *iov, int count, void *v_frame)
cead9d
     call_frame_t *frame = v_frame;
cead9d
     glusterd_conf_t *conf = frame->this->private;
cead9d
 
cead9d
-    GF_ATOMIC_DEC(conf->blockers);
cead9d
+    if (GF_ATOMIC_DEC(conf->blockers) == 0) {
cead9d
+        synccond_broadcast(&conf->cond_blockers);
cead9d
+    }
cead9d
 
cead9d
     STACK_DESTROY(frame->root);
cead9d
     return 0;
cead9d
@@ -5865,7 +5867,9 @@ attach_brick_callback(struct rpc_req *req, struct iovec *iov, int count,
cead9d
         }
cead9d
     }
cead9d
 out:
cead9d
-    GF_ATOMIC_DEC(conf->blockers);
cead9d
+    if (GF_ATOMIC_DEC(conf->blockers) == 0) {
cead9d
+        synccond_broadcast(&conf->cond_blockers);
cead9d
+    }
cead9d
     STACK_DESTROY(frame->root);
cead9d
     return 0;
cead9d
 }
cead9d
@@ -6053,7 +6057,7 @@ attach_brick(xlator_t *this, glusterd_brickinfo_t *brickinfo,
cead9d
          * TBD: see if there's a better way
cead9d
          */
cead9d
         synclock_unlock(&conf->big_lock);
cead9d
-        sleep(1);
cead9d
+        synctask_sleep(1);
cead9d
         synclock_lock(&conf->big_lock);
cead9d
     }
cead9d
 
cead9d
@@ -6193,7 +6197,7 @@ find_compat_brick_in_vol(glusterd_conf_t *conf,
cead9d
                          "brick %s is still"
cead9d
                          " starting, waiting for 2 seconds ",
cead9d
                          other_brick->path);
cead9d
-            sleep(2);
cead9d
+            synctask_sleep(2);
cead9d
             synclock_lock(&conf->big_lock);
cead9d
             retries--;
cead9d
         }
cead9d
@@ -6680,9 +6684,7 @@ glusterd_restart_bricks(void *opaque)
cead9d
      * glusterd_compare_friend_data ())
cead9d
      */
cead9d
     while (conf->restart_bricks) {
cead9d
-        synclock_unlock(&conf->big_lock);
cead9d
-        sleep(2);
cead9d
-        synclock_lock(&conf->big_lock);
cead9d
+        synccond_wait(&conf->cond_restart_bricks, &conf->big_lock);
cead9d
     }
cead9d
     conf->restart_bricks = _gf_true;
cead9d
 
cead9d
@@ -6798,6 +6800,7 @@ out:
cead9d
     GF_ATOMIC_DEC(conf->blockers);
cead9d
     conf->restart_done = _gf_true;
cead9d
     conf->restart_bricks = _gf_false;
cead9d
+    synccond_broadcast(&conf->cond_restart_bricks);
cead9d
 
cead9d
 return_block:
cead9d
     return ret;
cead9d
diff --git a/xlators/mgmt/glusterd/src/glusterd.c b/xlators/mgmt/glusterd/src/glusterd.c
cead9d
index d360312..a01034a 100644
cead9d
--- a/xlators/mgmt/glusterd/src/glusterd.c
cead9d
+++ b/xlators/mgmt/glusterd/src/glusterd.c
cead9d
@@ -1845,6 +1845,8 @@ init(xlator_t *this)
cead9d
     (void)strncpy(conf->rundir, rundir, sizeof(conf->rundir));
cead9d
 
cead9d
     synclock_init(&conf->big_lock, SYNC_LOCK_RECURSIVE);
cead9d
+    synccond_init(&conf->cond_restart_bricks);
cead9d
+    synccond_init(&conf->cond_blockers);
cead9d
     pthread_mutex_init(&conf->xprt_lock, NULL);
cead9d
     INIT_LIST_HEAD(&conf->xprt_list);
cead9d
     pthread_mutex_init(&conf->import_volumes, NULL);
cead9d
diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h
cead9d
index 2be005c..1c6c3b1 100644
cead9d
--- a/xlators/mgmt/glusterd/src/glusterd.h
cead9d
+++ b/xlators/mgmt/glusterd/src/glusterd.h
cead9d
@@ -209,6 +209,8 @@ typedef struct {
cead9d
     dict_t *opts;
cead9d
     synclock_t big_lock;
cead9d
     gf_boolean_t restart_done;
cead9d
+    synccond_t cond_restart_bricks;
cead9d
+    synccond_t cond_blockers;
cead9d
     rpcsvc_t *uds_rpc; /* RPCSVC for the unix domain socket */
cead9d
     uint32_t base_port;
cead9d
     uint32_t max_port;
cead9d
-- 
cead9d
1.8.3.1
cead9d