17b94a
From bf24623765817ede84ea47f3265f5e6c2ae17ee7 Mon Sep 17 00:00:00 2001
17b94a
From: Mohit Agrawal <moagrawal@redhat.com>
17b94a
Date: Tue, 16 Jul 2019 20:36:57 +0530
17b94a
Subject: [PATCH 279/284] posix: In brick_mux brick is crashed while start/stop
17b94a
 volume in loop
17b94a
17b94a
Problem: In brick_mux environment sometime brick is crashed while
17b94a
         volume stop/start in a loop.Brick is crashed in janitor task
17b94a
         at the time of accessing priv.If posix priv is cleaned up before
17b94a
         call janitor task then janitor task is crashed.
17b94a
17b94a
Solution: To avoid the crash in brick_mux environment introduce a new
17b94a
          flag janitor_task_stop in posix_private and before send CHILD_DOWN event
17b94a
          wait for update the flag by janitor_task_done
17b94a
17b94a
> Change-Id: Id9fa5d183a463b2b682774ab5cb9868357d139a4
17b94a
> fixes: bz#1730409
17b94a
> Signed-off-by: Mohit Agrawal <moagrawal@redhat.com>
17b94a
> (Cherry picked from commit f138d3fa2237e7fa940ecf17153fd700350c4138)
17b94a
> (Reviewed on upstream link https://review.gluster.org/#/c/glusterfs/+/23060/)
17b94a
17b94a
Change-Id: Id9fa5d183a463b2b682774ab5cb9868357d139a4
17b94a
fixex: bz#1729971
17b94a
Signed-off-by: Mohit Agrawal <moagrawal@redhat.com>
17b94a
Reviewed-on: https://code.engineering.redhat.com/gerrit/178934
17b94a
Tested-by: Mohit Agrawal <moagrawa@redhat.com>
17b94a
Tested-by: RHGS Build Bot <nigelb@redhat.com>
17b94a
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
17b94a
---
17b94a
 libglusterfs/src/glusterfs/xlator.h        |  3 +++
17b94a
 xlators/mgmt/glusterd/src/glusterd-utils.c |  5 ++--
17b94a
 xlators/protocol/server/src/server.c       |  6 ++++-
17b94a
 xlators/storage/posix/src/posix-common.c   | 40 +++++++++++++++++++++++++++++-
17b94a
 xlators/storage/posix/src/posix-helpers.c  | 16 ++++++++++++
17b94a
 xlators/storage/posix/src/posix.h          |  3 +++
17b94a
 6 files changed, 69 insertions(+), 4 deletions(-)
17b94a
17b94a
diff --git a/libglusterfs/src/glusterfs/xlator.h b/libglusterfs/src/glusterfs/xlator.h
17b94a
index b78daad..da551e9 100644
17b94a
--- a/libglusterfs/src/glusterfs/xlator.h
17b94a
+++ b/libglusterfs/src/glusterfs/xlator.h
17b94a
@@ -861,6 +861,9 @@ struct _xlator {
17b94a
 
17b94a
     /* Flag to notify got CHILD_DOWN event for detach brick */
17b94a
     uint32_t notify_down;
17b94a
+
17b94a
+    /* Flag to avoid throw duplicate PARENT_DOWN event */
17b94a
+    uint32_t parent_down;
17b94a
 };
17b94a
 
17b94a
 /* This would be the only structure which needs to be exported by
17b94a
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
17b94a
index 2aa975b..812c698 100644
17b94a
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
17b94a
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
17b94a
@@ -4082,8 +4082,9 @@ out:
17b94a
     if (msg[0]) {
17b94a
         gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_BRICK_IMPORT_FAIL, "%s",
17b94a
                msg);
17b94a
-        gf_event(EVENT_IMPORT_BRICK_FAILED, "peer=%s;brick=%s",
17b94a
-                 new_brickinfo->hostname, new_brickinfo->path);
17b94a
+        if (new_brickinfo)
17b94a
+            gf_event(EVENT_IMPORT_BRICK_FAILED, "peer=%s;brick=%s",
17b94a
+                     new_brickinfo->hostname, new_brickinfo->path);
17b94a
     }
17b94a
     gf_msg_debug("glusterd", 0, "Returning with %d", ret);
17b94a
     return ret;
17b94a
diff --git a/xlators/protocol/server/src/server.c b/xlators/protocol/server/src/server.c
17b94a
index 6ae63ba..a5f09fe 100644
17b94a
--- a/xlators/protocol/server/src/server.c
17b94a
+++ b/xlators/protocol/server/src/server.c
17b94a
@@ -580,6 +580,7 @@ server_graph_janitor_threads(void *data)
17b94a
     gf_boolean_t victim_found = _gf_false;
17b94a
     xlator_list_t **trav_p = NULL;
17b94a
     xlator_t *top = NULL;
17b94a
+    uint32_t parent_down = 0;
17b94a
 
17b94a
     GF_ASSERT(data);
17b94a
 
17b94a
@@ -598,7 +599,10 @@ server_graph_janitor_threads(void *data)
17b94a
         victim = (*trav_p)->xlator;
17b94a
         if (victim->cleanup_starting &&
17b94a
             strcmp(victim->name, victim_name) == 0) {
17b94a
-            victim_found = _gf_true;
17b94a
+            parent_down = victim->parent_down;
17b94a
+            victim->parent_down = 1;
17b94a
+            if (!parent_down)
17b94a
+                victim_found = _gf_true;
17b94a
             break;
17b94a
         }
17b94a
     }
17b94a
diff --git a/xlators/storage/posix/src/posix-common.c b/xlators/storage/posix/src/posix-common.c
17b94a
index d738692..69857d9 100644
17b94a
--- a/xlators/storage/posix/src/posix-common.c
17b94a
+++ b/xlators/storage/posix/src/posix-common.c
17b94a
@@ -146,10 +146,15 @@ int32_t
17b94a
 posix_notify(xlator_t *this, int32_t event, void *data, ...)
17b94a
 {
17b94a
     xlator_t *victim = data;
17b94a
+    struct posix_private *priv = this->private;
17b94a
+    int ret = 0;
17b94a
+    struct timespec sleep_till = {
17b94a
+        0,
17b94a
+    };
17b94a
 
17b94a
     switch (event) {
17b94a
         case GF_EVENT_PARENT_UP: {
17b94a
-            /* Tell the parent that posix xlator is up */
17b94a
+            /* the parent that posix xlator is up */
17b94a
             default_notify(this, GF_EVENT_CHILD_UP, data);
17b94a
         } break;
17b94a
 
17b94a
@@ -158,6 +163,31 @@ posix_notify(xlator_t *this, int32_t event, void *data, ...)
17b94a
                 break;
17b94a
             gf_log(this->name, GF_LOG_INFO, "Sending CHILD_DOWN for brick %s",
17b94a
                    victim->name);
17b94a
+
17b94a
+            if (priv->janitor) {
17b94a
+                pthread_mutex_lock(&priv->janitor_mutex);
17b94a
+                {
17b94a
+                    priv->janitor_task_stop = _gf_true;
17b94a
+                    ret = gf_tw_del_timer(this->ctx->tw->timer_wheel,
17b94a
+                                          priv->janitor);
17b94a
+                    if (!ret) {
17b94a
+                        clock_gettime(CLOCK_REALTIME, &sleep_till);
17b94a
+                        sleep_till.tv_sec += 1;
17b94a
+                        /* Wait to set janitor_task flag to _gf_false by
17b94a
+                         * janitor_task_done */
17b94a
+                        while (priv->janitor_task_stop) {
17b94a
+                            (void)pthread_cond_timedwait(&priv->janitor_cond,
17b94a
+                                                         &priv->janitor_mutex,
17b94a
+                                                         &sleep_till);
17b94a
+                            clock_gettime(CLOCK_REALTIME, &sleep_till);
17b94a
+                            sleep_till.tv_sec += 1;
17b94a
+                        }
17b94a
+                    }
17b94a
+                }
17b94a
+                pthread_mutex_unlock(&priv->janitor_mutex);
17b94a
+                GF_FREE(priv->janitor);
17b94a
+            }
17b94a
+            priv->janitor = NULL;
17b94a
             default_notify(this->parents->xlator, GF_EVENT_CHILD_DOWN, data);
17b94a
         } break;
17b94a
         default:
17b94a
@@ -1008,6 +1038,8 @@ posix_init(xlator_t *this)
17b94a
 
17b94a
     pthread_mutex_init(&_private->fsync_mutex, NULL);
17b94a
     pthread_cond_init(&_private->fsync_cond, NULL);
17b94a
+    pthread_mutex_init(&_private->janitor_mutex, NULL);
17b94a
+    pthread_cond_init(&_private->janitor_cond, NULL);
17b94a
     INIT_LIST_HEAD(&_private->fsyncs);
17b94a
     ret = posix_spawn_ctx_janitor_thread(this);
17b94a
     if (ret)
17b94a
@@ -1128,6 +1160,7 @@ posix_fini(xlator_t *this)
17b94a
         (void)gf_thread_cleanup_xint(priv->disk_space_check);
17b94a
         priv->disk_space_check = 0;
17b94a
     }
17b94a
+
17b94a
     if (priv->janitor) {
17b94a
         /*TODO: Make sure the synctask is also complete */
17b94a
         ret = gf_tw_del_timer(this->ctx->tw->timer_wheel, priv->janitor);
17b94a
@@ -1135,8 +1168,10 @@ posix_fini(xlator_t *this)
17b94a
             gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_TIMER_DELETE_FAILED,
17b94a
                    "Failed to delete janitor timer");
17b94a
         }
17b94a
+        GF_FREE(priv->janitor);
17b94a
         priv->janitor = NULL;
17b94a
     }
17b94a
+
17b94a
     if (priv->fsyncer) {
17b94a
         (void)gf_thread_cleanup_xint(priv->fsyncer);
17b94a
         priv->fsyncer = 0;
17b94a
@@ -1148,6 +1183,9 @@ posix_fini(xlator_t *this)
17b94a
     GF_FREE(priv->base_path);
17b94a
     LOCK_DESTROY(&priv->lock);
17b94a
     pthread_mutex_destroy(&priv->fsync_mutex);
17b94a
+    pthread_cond_destroy(&priv->fsync_cond);
17b94a
+    pthread_mutex_destroy(&priv->janitor_mutex);
17b94a
+    pthread_cond_destroy(&priv->janitor_cond);
17b94a
     GF_FREE(priv->hostname);
17b94a
     GF_FREE(priv->trash_path);
17b94a
     GF_FREE(priv);
17b94a
diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c
17b94a
index 07169b5..ef5bfd5 100644
17b94a
--- a/xlators/storage/posix/src/posix-helpers.c
17b94a
+++ b/xlators/storage/posix/src/posix-helpers.c
17b94a
@@ -1432,12 +1432,24 @@ posix_janitor_task_done(int ret, call_frame_t *frame, void *data)
17b94a
     this = data;
17b94a
     priv = this->private;
17b94a
 
17b94a
+    pthread_mutex_lock(&priv->janitor_mutex);
17b94a
+    {
17b94a
+        if (priv->janitor_task_stop) {
17b94a
+            priv->janitor_task_stop = _gf_false;
17b94a
+            pthread_cond_signal(&priv->janitor_cond);
17b94a
+            pthread_mutex_unlock(&priv->janitor_mutex);
17b94a
+            goto out;
17b94a
+        }
17b94a
+    }
17b94a
+    pthread_mutex_unlock(&priv->janitor_mutex);
17b94a
+
17b94a
     LOCK(&priv->lock);
17b94a
     {
17b94a
         __posix_janitor_timer_start(this);
17b94a
     }
17b94a
     UNLOCK(&priv->lock);
17b94a
 
17b94a
+out:
17b94a
     return 0;
17b94a
 }
17b94a
 
17b94a
@@ -1456,6 +1468,9 @@ posix_janitor_task(void *data)
17b94a
     old_this = THIS;
17b94a
     THIS = this;
17b94a
 
17b94a
+    if (!priv)
17b94a
+        goto out;
17b94a
+
17b94a
     time(&now;;
17b94a
     if ((now - priv->last_landfill_check) > priv->janitor_sleep_duration) {
17b94a
         if (priv->disable_landfill_purge) {
17b94a
@@ -1475,6 +1490,7 @@ posix_janitor_task(void *data)
17b94a
 
17b94a
     THIS = old_this;
17b94a
 
17b94a
+out:
17b94a
     return 0;
17b94a
 }
17b94a
 
17b94a
diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h
17b94a
index b0935a7..64288a7 100644
17b94a
--- a/xlators/storage/posix/src/posix.h
17b94a
+++ b/xlators/storage/posix/src/posix.h
17b94a
@@ -203,6 +203,8 @@ struct posix_private {
17b94a
     struct list_head fsyncs;
17b94a
     pthread_mutex_t fsync_mutex;
17b94a
     pthread_cond_t fsync_cond;
17b94a
+    pthread_mutex_t janitor_mutex;
17b94a
+    pthread_cond_t janitor_cond;
17b94a
     int fsync_queue_count;
17b94a
 
17b94a
     enum {
17b94a
@@ -257,6 +259,7 @@ struct posix_private {
17b94a
 
17b94a
     gf_boolean_t fips_mode_rchecksum;
17b94a
     gf_boolean_t ctime;
17b94a
+    gf_boolean_t janitor_task_stop;
17b94a
 };
17b94a
 
17b94a
 typedef struct {
17b94a
-- 
17b94a
1.8.3.1
17b94a